In [1]:
# Import libraries
import requests
import re
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from datetime import datetime
from bs4 import BeautifulSoup

In [2]:
#Getting web page
WIKI_PORTAL = "https://en.wikipedia.org/wiki/Portal:Current_events/"
#date will be given by tweet data
date = datetime(2016,3,22)
#the portal page name has the form <year>_<full month name>_<day without leading 0>
year = date.strftime("%Y")
month = date.strftime("%B")
#date.day is an int, so it never carries the 0-padding that strftime("%d") adds
day = str(date.day)
url_date = year + "_" + month + "_" + day
#a timeout prevents an unresponsive server from hanging the notebook forever
r = requests.get(WIKI_PORTAL + url_date, timeout=30)
print('Response status code: {0}\n'.format(r.status_code))
#stop here on HTTP errors instead of parsing an error page as if it listed events
r.raise_for_status()
page_body = r.text
soup = BeautifulSoup(page_body, 'html.parser')

Response status code: 200



In [3]:
#daily events are first collected as plain records and turned into a DataFrame once:
#DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call
event_records = []

#For every category of events (dt), we look for every event of the bullet list (li)
for category in soup.find_all("dt"):
    category_name = category.get_text()
    li = category.find_next("li")
    while li:
        #replacing \n, and removing sources: text. (CNN).
        #the text is split with a regex and only the part before the first source is kept
        #NOTE(review): the leading "." is unescaped, so it matches any character and the
        #split also happens on e.g. "word (" - confirm whether r"\.\s\(" was intended
        full_text = re.split(r".\s\(", li.get_text().replace('\n', '. '))
        no_source = full_text[0]
        event_records.append({'Date': date,
                              'Description': no_source,
                              'Category': category_name,
                              'Link': li.a})  #first <a> tag of the bullet (None if there is none)
        li = li.find_next_sibling("li")

#explicit columns keep the expected layout even when no event was found
event_df = pd.DataFrame(event_records, columns=['Date', 'Description', 'Category', 'Link'])

In [4]:
#quick look at the first scraped events (Date, Description, Category, Link)
event_df.head()

Unnamed: 0,Date,Description,Category,Link
0,2016-03-22,2016 Brussels bombings. Three explosions in th...,Armed conflicts and attacks,"<a class=""mw-redirect"" href=""/wiki/2016_Brusse..."
1,2016-03-22,Saudi Arabian-led intervention in Yemen. Airst...,Armed conflicts and attacks,"<a class=""mw-redirect"" href=""/wiki/Saudi_Arabi..."
2,2016-03-22,German tax authorities are investigating Citig...,Business and economics,"<a href=""/wiki/Germany"" title=""Germany"">German..."
3,2016-03-22,2016 Brussels bombings. In light of the Belgia...,Business and economics,"<a class=""mw-redirect"" href=""/wiki/2016_Brusse..."
4,2016-03-22,Colombian conflict. U.S. Secretary of State Jo...,International relations,"<a href=""/wiki/Colombian_conflict"" title=""Colo..."


In [5]:
#Building blocks for a new column of stemmed words, to be compared with tweets keywords
#tokens that start or end with one of these strings are discarded by tokenize
link_numbers = ('http',) + tuple(str(digit) for digit in range(10)) + ('-',)
stemmer = PorterStemmer()
#english stop words without apostrophes, plus the empty token and a few twitter leftovers
stop_w = [stop_word.replace('\'', '') for stop_word in stopwords.words('english')]
stop_w.extend(['', '&amp', 'amp', 'rt'])

def tokenize(text):
    """Split a text into a list of cleaned, stemmed words.

    Processing steps:
      1. remove punctuation characters (hyphens are kept) and split on single spaces;
      2. discard tokens that start or end with one of ``link_numbers``
         ('http', a digit or '-');
      3. lowercase the remaining tokens and strip their non-ASCII characters;
      4. drop the tokens listed in ``stop_w`` (which includes the empty token)
         and stem the rest with ``stemmer``.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str or None
        The stemmed words, or ``None`` when no word survives the cleaning
        (e.g. a text that contains only links or emojis).
    """
    # The backslash is escaped explicitly: the former '\(' was an invalid escape
    # sequence (same characters at runtime, but it triggers a warning on recent Python).
    punctuation = '#!"$%&\\()*+,./:;<=>?@[\\]^_{|}~\''
    text_cleaned = ''.join(ch for ch in text if ch not in punctuation).split(' ')

    # The link/number filter runs on the token as written, before lowercasing.
    words = [word.lower().encode('ascii', errors='ignore').decode() for word in text_cleaned
             if not (word.startswith(link_numbers) or word.endswith(link_numbers))]

    words = [stemmer.stem(word) for word in words if word not in stop_w]

    # None (rather than an empty list) flags texts with nothing left to compare.
    return words if words else None
    
#tokenize only needs the description, so it is applied on that single column;
#with an empty event table this simply produces an empty column
event_df['Stemmed_Content'] = event_df['Description'].apply(tokenize)

In [6]:
#preview of the first 5 events, now with the Stemmed_Content column
event_df.head(5)

Unnamed: 0,Date,Description,Category,Link,Stemmed_Content
0,2016-03-22,2016 Brussels bombings. Three explosions in th...,Armed conflicts and attacks,"<a class=""mw-redirect"" href=""/wiki/2016_Brusse...","[brussel, bomb, three, explos, brussel, airpor..."
1,2016-03-22,Saudi Arabian-led intervention in Yemen. Airst...,Armed conflicts and attacks,"<a class=""mw-redirect"" href=""/wiki/Saudi_Arabi...","[saudi, arabian-l, intervent, yemen, airstrik,..."
2,2016-03-22,German tax authorities are investigating Citig...,Business and economics,"<a href=""/wiki/Germany"" title=""Germany"">German...","[german, tax, author, investig, citigroup, use..."
3,2016-03-22,2016 Brussels bombings. In light of the Belgia...,Business and economics,"<a class=""mw-redirect"" href=""/wiki/2016_Brusse...","[brussel, bomb, light, belgian, attack, warner..."
4,2016-03-22,Colombian conflict. U.S. Secretary of State Jo...,International relations,"<a href=""/wiki/Colombian_conflict"" title=""Colo...","[colombian, conflict, us, secretari, state, jo..."
