# Dependencies

In [1]:
import pandas as pd
import csv
import requests
from requests import get
from bs4 import BeautifulSoup
from mtranslate import translate
from pprint import pprint
from time import sleep
from time import time
from random import randint
import matplotlib.pyplot as plt

# Scraping

Reference: https://www.dataquest.io/blog/web-scraping-beautifulsoup/

In [None]:
# Scrape Wykop for all posts with the mention of "smog".
#
# Every search-result page is fetched through one shared HTTP session. Each
# entry found on a page contributes exactly one element to each of the five
# parallel lists below, so the lists always have equal length.

posts = []
votes = []
dates = []
images = []
users = []

FIRST_PAGE = 1
LAST_PAGE = 162  # last search-result page that is requested

start_time = time()
request_count = 0
req_sess = requests.Session()

for page_num in range(FIRST_PAGE, LAST_PAGE + 1):

    response = req_sess.get(f"https://www.wykop.pl/szukaj/wpisy/smog/strona/{page_num}/")

    # Optional politeness delay between requests:
    # sleep(randint(1, 3))

    # Monitor the requests
    request_count += 1
    elapsed_time = time() - start_time
    print('Page {}; Request:{}; Frequency: {} requests/s'.format(page_num, request_count, request_count/elapsed_time))

    # Warn about non-200 status codes and skip the page: there is nothing to parse.
    if response.status_code != 200:
        print('Request: {}; Status code: {}'.format(request_count, response.status_code))
        print(response.headers)
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all('li', class_="entry iC")

    for result in results:
        # Extract every mandatory field BEFORE touching the output lists, so a
        # malformed entry is skipped as a whole instead of leaving the lists
        # with different lengths.
        try:
            post = result.find('div', class_="text").text
            date = result.time['title']
            vote = int(result.p.b.span.text)
            user = result.div.b.text
        except (AttributeError, TypeError, KeyError, ValueError) as e:
            # AttributeError/TypeError: an expected tag is missing (lookup returned None)
            # KeyError: the <time> tag has no 'title' attribute
            # ValueError: the vote text is not an integer
            print(e)
            continue

        # find() returns None when the entry has no image; that is kept as-is.
        image = result.find('img', class_='block lazy')

        posts.append(post)
        dates.append(date)
        votes.append(vote)
        users.append(user)
        images.append(image)

In [None]:
# Assemble the scraped parallel lists into one DataFrame (one row per post).
wykopDF1 = pd.DataFrame({
    'post': posts,
    'date': dates,
    'user': users,
    'vote': votes,
    'image': images,
})
# wykopDF1.to_csv('rawWebScraps_Wykop/wykopRawV2.csv')

# info() writes its report to stdout itself and returns None, so wrapping it
# in print() only adds a stray "None" line.
wykopDF1.info()

wykopDF1.tail()

# Data Cleaning

<b>Note</b>: I manually remove residual HTML whitespace characters such as \n and \t from the post text.
<br><br>
<b>Note #2</b>: Because I ran the script on two separate occasions (March and May 2019), I am concatenating both dataframes and dropping the duplicated rows (i.e. posts already captured in March) from the May scrape.
<br><br>
<b>Note #3</b>: I couldn't figure out how to loop the Google translation module (mtranslate) over all posts to store their Polish-to-English translations in a new column :(

In [None]:
import re


def _load_raw_posts(csv_path):
    """Read one raw scrape CSV and tidy its 'post' and 'date' columns.

    The saved index column ('Unnamed: 0') is dropped, surrounding whitespace
    is stripped from each post, embedded newlines and tabs are removed, and
    'date' is parsed to datetime.
    """
    frame = pd.read_csv(csv_path)
    del frame['Unnamed: 0']

    text = frame['post'].str.strip()
    for residue in (r'\n', r'\t'):
        text = text.replace(residue, '', regex = True)
    frame['post'] = text

    frame['date'] = pd.to_datetime(frame['date'])
    return frame


data = _load_raw_posts("rawWebScraps_Wykop/wykopRaw.csv")
data2 = _load_raw_posts("rawWebScraps_Wykop/wykopRawMay2019.csv")

# Last row labels noted by the author: data2 -> 7288, data -> 7287

# Stack both scrapes, drop rows that appear in both, then order chronologically.
dataDF = (
    pd.concat([data, data2])
    .drop_duplicates()
    .reset_index(drop=True)
    .sort_values(by = 'date')
)
# 'YYYY-MM' key: the first seven characters of the timestamp's string form.
dataDF['Month-Year'] = dataDF.date.map(lambda stamp: str(stamp)[:7])
dataDF.to_csv('rawWebScraps_Wykop/wykop2012_18.csv')
dataDF.head()

# Analysis

In [2]:
# Reload the merged posts; 'Unnamed: 0' is the index column written by to_csv.
# NOTE(review): the cleaning cell writes this file under 'rawWebScraps_Wykop/',
# while it is read from 'raw/' here - confirm which directory is intended.
dataDF = pd.read_csv("raw/wykop2012_18.csv").drop(columns='Unnamed: 0')
dataDF['date'] = dataDF['date'].pipe(pd.to_datetime)

# Air-quality readings used later in the analysis.
polair = pd.read_csv("raw/airPoland2017.csv")

In [None]:
## Filter DF to Jan. 2017
# .copy() makes Jan2017 an independent frame, so adding columns below does not
# raise SettingWithCopyWarning (or silently write to a temporary view).
Jan2017 = dataDF.loc[dataDF['Month-Year'] == '2017-01'].copy()

# Two-digit day of month, used as the grouping key for the daily bar chart.
Jan2017['Days'] = Jan2017['date'].dt.strftime('%d')

# Occurrences of each city name per post.
# NOTE(review): matching is case-sensitive and ASCII-only, so "Krakow",
# "Kraków" or "Warszawa" are not counted - confirm that is intended.
Jan2017['Krakowcount'] = Jan2017.post.str.count('krakow')
Jan2017['Warszawacount'] = Jan2017.post.str.count('warszawa')

# Daily totals of the numeric columns. numeric_only=True keeps the text
# columns (post, user, image, ...) out of the sum; without it, current pandas
# tries to add strings together.
krakowCount = Jan2017.resample('D', on='date').sum(numeric_only=True).reset_index()
# Same aggregation under the second name used by the plotting cell.
WarszawaCount = krakowCount.copy()
WarszawaCount

In [None]:
# Count how often each hashtag occurs in the January 2017 posts (most frequent first).
# The capture group leaves out the leading '#', so no separate clean-up pass is
# needed, and reset_index(name=...) keeps 'count' an integer column (building the
# frame from .values turned it into object dtype).
Jan2017_hash_count = (
    Jan2017.post.str.extractall(r'#(\w+)')[0]
    .value_counts()
    .rename_axis('hashtag')
    .reset_index(name='count')
)
Jan2017_hash_count.head()

In [None]:
# Source: http://powietrze.gios.gov.pl/pjp/archives
# Index source: https://airnow.gov/index.cfm?action=airnow.international
polair = pd.read_csv("raw/airPoland2017.csv")
polair['date'] = pd.to_datetime(polair['date'])

# The readings use a decimal comma; switch to a decimal point before parsing.
polair = polair.replace(r',', '.', regex = True)

# Parse the station columns as floats. Whitespace-only cells are treated as
# missing values; anything else that is not a number still raises.
for station in ['Bulwarowa', 'Bujaka', 'Aleja Krasińskiego']:
    raw_readings = polair[station]
    is_blank = raw_readings.astype(str).str.strip() == ''
    polair[station] = pd.to_numeric(raw_readings.mask(is_blank))

# Daily mean per numeric column, then the row-wise mean across those columns.
# numeric_only=True keeps the 'date' column (and any text column) out of both means.
polair_d = polair.resample('D', on='date').mean(numeric_only=True).reset_index()
polair_d['avg'] = polair_d.mean(axis=1, numeric_only=True)
polair_d

In [None]:
# Number of posts per day of January 2017.
Jan2017Df = Jan2017.groupby('Days')['post'].count()

# The bars sit at integer positions 0, 1, 2, ... and the line plots use their
# frames' 0-based row index as x, so only January rows of the air-quality
# frame belong on this axis (a no-op if the file holds January only).
in_january = polair_d['date'].dt.strftime('%Y-%m') == '2017-01'
polair_jan = polair_d.loc[in_january].reset_index(drop=True)

with plt.xkcd():
    # Explicit figure/axes instead of relying on pyplot's "current axes".
    fig, ax = plt.subplots(figsize=(20, 10))

    # x=/y= are ignored for a Series plot, so they are not passed.
    Jan2017Df.plot(kind="bar", ax=ax)
    krakowCount.plot(kind='line', y='Krakowcount', color="green", ax=ax)
    WarszawaCount.plot(kind='line', y='Warszawacount', color="orange", ax=ax)
    polair_jan.plot(kind='line', y='avg', color="red", ax=ax)

    ax.set(
        title="Posts mentioning 'smog' vs. air-quality readings, January 2017",
        xlabel="Day of month",
        ylabel="Posts per day / mean station reading",
    )

In [None]:
start_date = '2017-01-08'
end_date = '2017-01-12'
# NOTE(review): the bounds are compared as timestamps at 00:00:00, so posts
# written later on 2017-01-12 are excluded - confirm that is intended.
mask = Jan2017['date'].between(start_date, end_date)
Jan2017_filtered = Jan2017.loc[mask]

# Total number of characters posted inside the window.
print(Jan2017_filtered.post.str.len().sum())

# Hashtag frequencies inside the window (most frequent first). The capture
# group leaves out the leading '#', and reset_index(name=...) keeps 'count' an
# integer column (building the frame from .values made it object dtype).
Jan2017_f_h = (
    Jan2017_filtered.post.str.extractall(r'#(\w+)')[0]
    .value_counts()
    .rename_axis('hashtag')
    .reset_index(name='count')
)
Jan2017_f_h

## Images

In [None]:
import pandas as pd
from IPython.display import Image, HTML

def path_to_image_html(path):
    '''
    Convert an image URL into an HTML <img> tag for display inside a DataFrame.

    Parameters
    ----------
    path : str
        Image URL. Any non-string or empty value (e.g. NaN / None for rows
        without an image) yields an empty cell.

    Returns
    -------
    str
        '<img src="..." style="max-height:124px;"/>', or '' when there is
        no usable path. Adjust the style attribute to control the size.
    '''
    from html import escape  # local import keeps this cell self-contained

    if not isinstance(path, str) or not path:
        return ''

    # The value goes inside a double-quoted attribute, so quotes and
    # ampersands in it must be escaped. The style attribute now has both of
    # its quotes (the opening one was missing).
    return '<img src="' + escape(path, quote=True) + '" style="max-height:124px;"/>'

# Render the whole frame as an HTML table; values in the 'image' column are
# passed through path_to_image_html, and escape=False keeps that markup intact.
HTML(dataDF.to_html(formatters={'image': path_to_image_html}, escape=False))

## Time series of posts mentioning 'Smog' from 2012 to 2019

In [None]:
# Number of posts per 'YYYY-MM' bucket.
countDF = dataDF.groupby(dataDF['Month-Year'])['post'].count()

# x-ranges shaded behind the bars.
# NOTE(review): presumably these are the bar positions of the winter months - confirm.
shaded_spans = [(47, 50), (35, 38), (59, 62), (71, 74), (23, 26), (11, 14), (0, 2)]

with plt.xkcd():
    for span_start, span_end in shaded_spans:
        plt.axvspan(span_start, span_end, facecolor='skyblue', alpha=0.25)

    countDF.plot(x='Month-Year', y='', kind="bar", figsize=(20,10))

## Word Frequency Analysis

In [None]:
# Word frequencies over all posts: split each post on whitespace into one
# column per token, flatten to a single Series of tokens, then count them.
tokens_by_position = dataDF.post.str.split(expand=True)
words = tokens_by_position.stack().value_counts()


In [None]:
from mtranslate import translate

# Hashtag frequencies over all posts (most frequent first). The capture group
# leaves out the leading '#', and reset_index(name=...) keeps 'count' an
# integer column (building the frame from .values made it object dtype).
hash_count = (
    dataDF.post.str.extractall(r'#(\w+)')[0]
    .value_counts()
    .rename_axis('hashtag')
    .reset_index(name='count')
)
# TODO: translate the hashtags from Polish to English. translate() takes one
# string at a time, so it would have to be mapped over the column.
hash_count.head()

In [None]:
# Translate one sample post from Polish ('pl') to English ('en').
# The function imported from the mtranslate package is called `translate`;
# the name `mtranslate` is never defined in this notebook.
translate(data.post[5], 'en', 'pl')

In [None]:
# Total number of characters across all posts in `data`.
post_lengths = data['post'].str.len()
sum(post_lengths)

In [None]:
# Make sure the timestamp column is datetime64 so it can drive the resample below.
data['date'] = data['date'].pipe(pd.to_datetime)

data.dtypes
# Non-null values per column for each calendar year.
data.resample(rule='Y', on='date').count()

In [None]:
# translation
#
# Translate every post in `data` from Polish to English. `translation` ends up
# with exactly one element per row of `data` (None for rows without text), so
# it can be attached as a new column afterwards.

from IPython.display import clear_output
import numpy as np
from time import sleep, time
from mtranslate import translate

translation = []
total_posts = len(data)  # derived from the data instead of a hard-coded row count
translation_started = time()

for postPl, post_text in enumerate(data['post']):

    if isinstance(post_text, str):
        # translate(text, to_language, from_language)
        translation.append(translate(post_text, 'en', 'pl'))
    else:
        # Missing post text (NaN): keep the position so rows stay aligned.
        translation.append(None)

    clear_output(wait=True)
    print("Current progress: ", np.round((postPl + 1)/total_posts*100,2),"%")

# Wall-clock time for the whole loop (a bare %time after the loop timed nothing).
print("Elapsed: {:.1f} s".format(time() - translation_started))

In [None]:
# TODO: the per-row work of this loop was never written; it only reports progress.
total_rows = len(data)

# Progress is derived from this loop's own counter; the earlier version read
# `postPl`, a variable left over from another cell, so it never advanced.
for row_number, (i, row) in enumerate(data.iterrows(), start=1):
    print("Current progress: ", np.round(row_number/total_rows*100,2),"%")


In [None]:
# Display the list of translated posts collected by the translation loop above.
translation

Show image in dataframe: https://datascience.stackexchange.com/questions/38083/display-images-url-inside-pandas-dataframe

https://stackoverflow.com/questions/37365824/pandas-ipython-notebook-include-and-display-an-image-in-a-dataframe

Vader: https://medium.com/analytics-vidhya/simplifying-social-media-sentiment-analysis-using-vader-in-python-f9e6ec6fc52f

Simple WordCloud: http://kavita-ganesan.com/word-cloud-for-data-scientists/