In [1]:
import collections
import datetime
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt

News items on the website www.intopic.it carry a publication date written in Italian in a non-standard format. The following function converts it into a year-month-day date string.

In [2]:
def extract_date(filename, raw_string):
    """Convert an intopic.it publication string into a ``YYYY-MM-DD`` date string.

    Parameters
    ----------
    filename : str
        Path of the saved HTML page the string came from. It is only used for
        the fallback: when no date is parsed from ``raw_string``, the file's
        ``os.path.getctime`` timestamp (local time) supplies the date.
    raw_string : object
        Text of the publication-info element; coerced with ``str()``. When it
        contains "2020" it is split on single spaces and the tokens at index
        1, 2 and 3 are read as day, Italian month name and year.

    Returns
    -------
    str
        ``<year>-<MM>-<DD>``, with month and day zero-padded to two digits.
        The year token is used verbatim.

    Raises
    ------
    KeyError
        If the month token is not one of the capitalised Italian month names.
    IndexError
        If the string contains "2020" but has fewer than four tokens.
    """
    raw_string = str(raw_string)
    # All twelve months: an unknown month raises KeyError, which callers that
    # swallow exceptions would turn into a silently dropped article.
    months = {
        'Gennaio': '01', 'Febbraio': '02', 'Marzo': '03', 'Aprile': '04',
        'Maggio': '05', 'Giugno': '06', 'Luglio': '07', 'Agosto': '08',
        'Settembre': '09', 'Ottobre': '10', 'Novembre': '11', 'Dicembre': '12',
    }

    # NOTE(review): only strings mentioning 2020 are parsed; any other year
    # falls through to the file-timestamp fallback. Kept as-is on purpose.
    if "2020" in raw_string:
        tokens = raw_string.split(' ')
        year, month_name, day = tokens[3], tokens[2], tokens[1]
        # zfill keeps single-digit days ("5" -> "05") consistent with the
        # zero-padded fallback format, so dates compare and sort uniformly.
        return str(year) + '-' + months[month_name] + '-' + str(day).zfill(2)

    # No explicit date: use the date on which the page file was created/changed.
    return datetime.date.fromtimestamp(os.path.getctime(filename)).isoformat()


Now let us start with the code for our analysis.

In [None]:
# Setup variables
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
html_root = "/home/marco/workspace/git/StatLearnTeam/web_pages_index/" # Where the html pages are
num_pages = 2684 # Saved pages are named 1.html ... 2684.html

cols = ['title', 'content', 'date', 'author', 'tags']

# Rows are gathered in a plain list and converted to a DataFrame once at the end.
# Growing the frame with DataFrame.append copies it on every call, and the method
# no longer exists in pandas 2.0 -- there the AttributeError would be caught by the
# per-article handler below and the frame would silently stay empty.
article_rows = []
skipped_entries = 0 # "bp-entry" blocks that could not be parsed as an article

for i in range(num_pages, 0, -1): # From the last page down to page 1 (presumably the most recent)

    webpage_path = html_root + str(i) + ".html"
    # The context manager closes each page file instead of leaking one handle per page.
    with open(webpage_path) as html_content:
        soup = BeautifulSoup(html_content, 'html.parser') # Open file as a webpage

    ################# SINGLE PAGE MINING STARTS HERE #############################

    for article in soup.findAll('div', attrs={"class":"bp-entry"}):
        try:
            title = article.find("h2").find("a").getText()
            content = article.find("div", attrs = {"class":"bp-details"}).getText()
            publication_info = article.find("span", attrs = {"class":"author vcard"}) # Date, author...

            date = extract_date(webpage_path, str(publication_info.getText()))

            author = publication_info.find("span", attrs={"class":"fn"}).getText()

            # Not all articles have tags; a missing tag cloud simply yields an empty list
            # and must not prevent the other fields from being stored.
            tagcloud = article.find("div", {"class":"tagcloud"})
            if tagcloud is None:
                tags = []
            else:
                tags = [tag.getText() for tag in tagcloud.findAll("a", {'class':'tag-link-10'})]
        except (AttributeError, KeyError, IndexError):
            # Not every "bp-entry" element is an article: a missing sub-element makes
            # find() return None (AttributeError), and an unparsable date string makes
            # extract_date raise KeyError/IndexError. Skip those, but keep count.
            skipped_entries += 1
            continue

        article_rows.append({
            'title': title,
            'content': content,
            'date': date,
            'author': author,
            'tags': tags,
        })

articles_df = pd.DataFrame(article_rows, columns = cols)

Now let's preview the dataset we have generated. 

In [None]:
# Rich display of the scraped data: one row per article, columns as listed in `cols`.
articles_df

### We shall now get some insight into the content of the articles. Let's look up some virus-related keywords and see whether they appear in the articles.

In [None]:
######################## DATAFRAME ANALYSIS STARTS HERE ######################
# Per-date tally of the articles that mention at least one keyword.
# Every date encountered gets an entry, so days without any match stay at 0.
contains_virus_count = {}
keywords = ['coronavirus', 'covid', 'covid-19']

for row in articles_df.values:
    title, content, date, author, tags = row[0], row[1], row[2], row[3], row[4]

    # Register the date even when this article turns out not to match.
    contains_virus_count.setdefault(date, 0)

    # Text that gets searched: the title, the content preview and each tag.
    searchable_fields = [title, content] + list(tags)

    # Case-insensitive substring test, evaluated for every field.
    field_hits = [
        any(k in str.lower(text) for k in keywords)
        for text in searchable_fields
    ]

    if True in field_hits:
        contains_virus_count[date] += 1  # 1 More article contains coronavirus related keywords

In [None]:
# Mapping of publication date -> number of articles that matched a keyword.
contains_virus_count

That is a day-by-day count of the articles that contain at least one of the chosen keywords in their title, content preview or tags.

# Plotting

In [None]:
from scipy.interpolate import interp1d
from scipy.optimize import curve_fit

import numpy as np

# Split the per-date tally into parallel x/y sequences (dict insertion order).
dates = list(contains_virus_count.keys())
num_of_articles = [contains_virus_count[day] for day in dates]

# Running total of matching articles; plotted by the cumulative chart.
total_num_of_articles = np.cumsum(num_of_articles, dtype=int).tolist()
sum_articles = sum(num_of_articles)

# Scatter of the raw (non-cumulative) daily counts.
plt.figure(figsize=(10, 5))
plt.xticks(rotation=90)  # date labels are long, so draw them vertically

plt.ylabel("# of new articles")
plt.xlabel("Day of the month")
plt.title("Day by day count - Non cumulative")

plt.scatter(dates, num_of_articles, color='r')
plt.show()

In [None]:
# Cumulative view: running total of matching articles at each date.
plt.figure(figsize=(10, 5))
plt.xticks(rotation=90)  # date labels are long, so draw them vertically
plt.title('Day by day count - Cumulative')
plt.ylabel("Total number of articles available")
plt.scatter(x=dates, y=total_num_of_articles)
plt.show()

## Number of articles by news outlet

In [None]:
# Articles per outlet: count() gives, for each 'author' group, the number of
# non-null values per column; the 'title' column is used as the article count.
counter_articles = articles_df.groupby('author').count()

outlets_news_numbers = dict(zip(counter_articles.index.values, counter_articles['title'].values))

# Order the outlets from most to fewest articles. `collections` must be imported
# with the other modules at the top of the notebook; without that import this
# cell fails with a NameError on a fresh kernel.
outlets_news_numbers = collections.OrderedDict(
    sorted(outlets_news_numbers.items(), key = lambda item : item[1], reverse = True)
)

print('There are ', len(outlets_news_numbers), ' news outlets. \n')

# Iterate over items() directly instead of looking each key up again.
for outlet, article_count in outlets_news_numbers.items():
    print(outlet + ' : ', article_count)