In [1]:
import numpy as np
import pandas as pd
from os import walk
import os

#NLP stuff
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the raw data, index it by (Year, ISO-alpha3 Code) and keep only the
# rows whose Year label is 2005 or later.
df = (
    pd.read_csv('Data/raw_data.csv')
    .set_index(['Year', 'ISO-alpha3 Code'])
    .loc[2005:]
)

In [3]:
def remove_nonalpha(text):
    """Return the items of `text` whose `isalpha()` is true, in their original order."""
    alphabetic = []
    for token in text:
        if token.isalpha():
            alphabetic.append(token)
    return alphabetic

def lemmitization(text):
    """Lemmatize every word in `text` with NLTK's WordNetLemmatizer.

    `text` is an iterable of words; the result is a new list holding the
    lemma of each word, in input order.
    """
    # one lemmatizer instance is created per call and reused for every word
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(word) for word in text]

In [4]:
def preprocess_data(data, preprocess_functions):
    """Apply the given preprocessing functions to every speech in a dataframe.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Speech' column and a two-level (Year, ISO-alpha3 Code) index.
    preprocess_functions : iterable of callables
        Each callable takes a list of tokens and returns a list of tokens;
        they are applied in the given order.

    Returns
    -------
    pandas.DataFrame
        One row per speech that is a string, indexed by
        ['Year', 'ISO-alpha3 Code'], with the processed text (tokens joined
        by single spaces) in the 'Speech' column. Rows whose speech is not a
        string (e.g. NaN for a missing value) are skipped.
    """

    # the initial list to store the processed speeches in
    processed = []

    # walk the 'Speech' column once; each label is the (year, code) index tuple
    for (year, code), speech in data['Speech'].items():

        # skip non-string values *before* calling any str method on them,
        # otherwise a missing value (float NaN) would raise AttributeError
        if not isinstance(speech, str):
            continue

        # lower-case the speech and replace the \t and \n with spaces
        speech = speech.lower().replace("\t", " ").replace("\n", " ")

        # tokenize the speech
        tokens = word_tokenize(speech)

        # move the tokenized text through the given preprocessing functions
        for function in preprocess_functions:
            tokens = function(tokens)

        # store the index parts together with the re-joined tokens
        processed.append([year, code, " ".join(tokens)])

    # convert the preprocessed speeches to a dataframe with the ['Year', 'ISO-alpha3 Code'] index
    return pd.DataFrame(
        processed, columns=['Year', 'ISO-alpha3 Code', 'Speech']
    ).set_index(['Year', 'ISO-alpha3 Code'])


In [5]:
# Run the preprocessing pipeline over the loaded data; lemmatization is the only step applied.
preprocessed_df = preprocess_data(data=df, preprocess_functions=[lemmitization])

In [6]:
from nltk.sentiment import SentimentIntensityAnalyzer

def sentiment(df):
    """Compute sentiment scores for every speech in a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'Speech' column and a two-level
        (Year, ISO-alpha3 Code) index.

    Returns
    -------
    pandas.DataFrame
        One row per input row, indexed by ['Year', 'ISO-alpha3 Code'], with
        the 'neg', 'neu' and 'pos' entries of the analyzer's score dict in
        the columns 'Neg', 'Neu' and 'Pos'. Any other entries of the score
        dict are not kept.
    """

    # the sentiment analyzer we're going to use
    sia = SentimentIntensityAnalyzer()

    # the initial list to store the sentiment values in
    sentiments = []

    # walk the 'Speech' column once instead of doing a .loc lookup per row;
    # this also hands the analyzer a single string even when an index label
    # occurs more than once (a .loc lookup would return a Series there)
    for (year, code), speech in df["Speech"].items():

        # get the polarity/sentiment score of the speech
        scores = sia.polarity_scores(speech)

        # add the index and scores of this speech to the sentiment list
        sentiments.append([year, code, scores['neg'], scores['neu'], scores['pos']])

    # convert the sentiment list to a dataframe with the ['Year', 'ISO-alpha3 Code'] index
    return pd.DataFrame(
        sentiments, columns=['Year', 'ISO-alpha3 Code', 'Neg', 'Neu', 'Pos']
    ).set_index(['Year', 'ISO-alpha3 Code'])


In [7]:
# Score every preprocessed speech.
sentiment_df = sentiment(df=preprocessed_df)

In [8]:
# Summary statistics per sentiment column. The reducers are given by name so
# pandas always uses its own min/max/mean/var (sample variance, ddof=1);
# passing the builtins or NumPy callables instead is deprecated in recent
# pandas and would eventually switch `var` to NumPy's ddof=0 definition.
sentiment_df.aggregate(['min', 'max', 'mean', 'var'])

Unnamed: 0,Neg,Neu,Pos
min,0.011,0.582,0.072
max,0.272,0.869,0.309
mean,0.08183,0.727387,0.190792
var,0.000935,0.001515,0.001031


In [9]:
# Combine each speech with its sentiment scores by aligning on the shared row index.
merged_df = pd.merge(preprocessed_df, sentiment_df, left_index=True, right_index=True)

In [10]:
# Write the combined speeches and sentiment scores to disk as CSV.
merged_df.to_csv(path_or_buf='Data/speeches_sentiment.csv')