In [1]:
# general stuff
import numpy as np
import pandas as pd
from os import walk
import os

# NLP stuff
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# read the raw speeches and the UNSD country-code table
df = pd.read_csv('Data/raw_data.csv')
df_codes = pd.read_csv('Data/UNSD_Methodology.csv')

# join both tables on the ISO code, keep only the columns that help with indexing
# or merging plus the speeches themselves, rename the country name column for
# possible merging with other datasets, and set the (Year, ISO-alpha3 Code) multi index
df_merged = (
    pd.merge(df_codes, df, on='ISO-alpha3 Code')
    .loc[:, ["Country or Area", "ISO-alpha3 Code", "Session", "Year", "Speech"]]
    .rename(columns={"Country or Area": "Country"})
    .set_index(["Year", "ISO-alpha3 Code"])
)

# sort the rows on the index and only keep the data from 2005 onwards
df = df_merged.sort_index(level=0).loc[2005:]

In [3]:
def remove_punctuation(text):
    """Strip ASCII punctuation characters from every token.

    Args:
        text: iterable of string tokens.

    Returns:
        A list of the tokens with every character found in
        ``string.punctuation`` removed. Tokens that consisted solely of
        punctuation (and are therefore empty afterwards) are dropped.
    """
    # str.translate expects a mapping keyed by code point. Passing the raw
    # punctuation string makes it a lookup table indexed by ordinal, which
    # leaves punctuation untouched, so build a proper deletion table instead.
    deletion_table = str.maketrans('', '', string.punctuation)
    stripped = (word.translate(deletion_table) for word in text)
    return [word for word in stripped if word]

def remove_stopwords(text):
    """Filter NLTK's English stop words out of a list of tokens.

    Args:
        text: iterable of string tokens.

    Returns:
        A list with the tokens that are not in NLTK's English stop word
        list, in their original order. Membership is tested by exact string
        equality, so no case folding is applied here.
    """
    # A set gives constant-time membership tests; testing against the list
    # returned by NLTK scans the whole stop word list for every token.
    nltk_stop_words = frozenset(nltk.corpus.stopwords.words('english'))
    return [word for word in text if word not in nltk_stop_words]

def lemmitization(text):
    """Lemmatize every token with the WordNet lemmatizer.

    Args:
        text: iterable of string tokens.

    Returns:
        A list holding the lemmatized form of each token, in the same order.
    """
    # bind the lemmatizer's method once and map it over all tokens
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in text]

In [4]:
def preprocess_data(data, preprocess_functions):
    """Perform the given preprocessing functions on all speeches in a dataframe.

    Args:
        data: dataframe with a 'Speech' column and a two-level
            (Year, ISO-alpha3 Code) index.
        preprocess_functions: iterable of callables. Each one receives the
            current list of tokens and returns a new list of tokens; they are
            applied in the given order.

    Returns:
        A dataframe indexed by ['Year', 'ISO-alpha3 Code'] with a single
        'Speech' column holding the processed tokens joined by spaces.
        Rows whose speech is not a string (e.g. NaN) are skipped.
    """
    # the initial list to store the processed speeches in
    processed = []

    # Iterate over the rows themselves instead of looking every index label up
    # again: a label lookup returns several rows when the index has duplicates.
    for (year, code), speech in data['Speech'].items():

        # skip this speech if it's not a string value; this has to happen
        # before any string method is called on it
        if not isinstance(speech, str):
            continue

        # make the speech lower case and remove the \t and \n from it
        speech = speech.lower().replace("\t", " ").replace("\n", " ")

        # tokenize the speech
        tokens = word_tokenize(speech)

        # move the tokenized text through the given preprocessing functions
        for function in preprocess_functions:
            tokens = function(tokens)

        # store the index values with the tokens joined back into one string
        processed.append([year, code, " ".join(tokens)])

    # convert the preprocessed speeches to a dataframe with the ['Year', 'ISO-alpha3 Code'] index
    return pd.DataFrame(
        processed, columns=['Year', 'ISO-alpha3 Code', 'Speech']
    ).set_index(['Year', 'ISO-alpha3 Code'])


In [5]:
# preprocess the data: remove stop words, strip punctuation, then lemmatize
# (the functions are applied to each speech's tokens in the listed order)
preprocessed_df = preprocess_data(df, [remove_stopwords, remove_punctuation, lemmitization])

In [6]:
from nltk.sentiment import SentimentIntensityAnalyzer

def sentiment(df):
    """Compute polarity scores for every speech in a dataframe.

    Args:
        df: dataframe with a 'Speech' column of strings and a two-level
            (Year, ISO-alpha3 Code) index.

    Returns:
        A dataframe indexed by ['Year', 'ISO-alpha3 Code'] with the columns
        'Neg', 'Neu', 'Pos' and 'Compound', one row per row of ``df``, filled
        from the 'neg', 'neu', 'pos' and 'compound' entries returned by
        NLTK's SentimentIntensityAnalyzer.
    """
    # the sentiment analyzer we're going to use
    sia = SentimentIntensityAnalyzer()

    # the initial list to store the sentiment values in
    sentiments = []

    # Iterate over the rows themselves instead of looking every index label up
    # again: a label lookup returns several rows when the index has duplicates,
    # which would hand the analyzer a Series instead of a single speech.
    for (year, code), speech in df["Speech"].items():

        # get the polarity/sentiment score of the speech
        scores = sia.polarity_scores(speech)

        # add the index and scores of this speech to the sentiment list
        sentiments.append(
            [year, code, scores['neg'], scores['neu'], scores['pos'], scores['compound']]
        )

    # convert the sentiment list to a dataframe with the ['Year', 'ISO-alpha3 Code'] index
    return pd.DataFrame(
        sentiments,
        columns=['Year', 'ISO-alpha3 Code', 'Neg', 'Neu', 'Pos', 'Compound'],
    ).set_index(['Year', 'ISO-alpha3 Code'])


In [7]:
# calculate the sentiment of the preprocessed speeches; the result holds one row of
# Neg/Neu/Pos/Compound scores per (Year, ISO-alpha3 Code) index entry
sentiment_df = sentiment(preprocessed_df)

In [8]:
# attach the sentiment scores to the speeches by matching rows on their shared index
merged_df = pd.merge(preprocessed_df, sentiment_df, left_index=True, right_index=True)

In [9]:
# store the merged dataframe (processed speeches plus their sentiment scores) as a csv
merged_df.to_csv('Data/speeches_sentiment.csv')