In [1]:
# jupyter notebook --NotebookApp.allow_origin='https://colab.research.google.com' --port=8888 --NotebookApp.port_retries=0

import numpy as np
import pandas as pd
from os import walk
import os
#from google.colab import drive

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt

#NLP stuff
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *

In [2]:
df = pd.read_csv('raw_data.csv').set_index(['Year', 'ISO-alpha3 Code'])

In [60]:
def remove_punctuation(text):
    return [word.translate(string.punctuation) for word in text]

def remove_stopwords(text):
    nltk_stop_words = nltk.corpus.stopwords.words('english')
    return [word for word in text if word not in nltk_stop_words]

def remove_nonalpha(text):
    return [word for word in text if word.isalpha()]

def lemmitization(text):
    
    # use the wordnet lemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    
    #
    nltk_lemmaList = []
    for word in text:
        nltk_lemmaList.append(wordnet_lemmatizer.lemmatize(word))
    return nltk_lemmaList

In [63]:
def preprocess_data(data, preprocess_functions):
    "Performs given preprocessing functions on all text in a given dataframe"
    
    # the initial list to store the processed speeches in
    processed = []
    
    # iterate over all indices of the given dataframe
    for index in data.index:
        
        # get the speech of this index and make it all lower case
        speech = data['Speech'][index].lower()
        
        # skip this speech if it's not a string value
        if not isinstance(speech, str):
            continue
        
        # remove the \t and \n from the speech
        speech = speech.replace("\t", " ").replace("\n", " ")
        
        # tokenize the speech
        tokens = word_tokenize(speech)
        
        # move the tokenized text through the given preprocessing functions
        for function in preprocess_functions:
            tokens = function(tokens)
            
        # create a single string from the preprocessed tokens
        processed_speech = " ".join(tokens)
        
        # deconstruct the index
        [year, code] = index
        
        # add the index and processed speech to this row
        processed.append([year, code, processed_speech])
        
    # convert the preprocessed speeches to a dataframe with the ['Year', 'ISO-aplha3 Code'] index
    return pd.DataFrame(processed, columns=['Year', 'ISO-alpha3 Code', 'Speech']).set_index(['Year', 'ISO-alpha3 Code'])


In [64]:
preprocessed_df = preprocess_data(df, [remove_punctuation, remove_stopwords, remove_nonalpha, spacy_lemmitization])

KeyboardInterrupt: 

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

def sentiment(df):
    
    # the sentiment analyzer we're going to use
    sia = SentimentIntensityAnalyzer()

    # the initial list to store the sentiment values in
    sentiments = []

    # iterate over all indices of the given dataframe
    for index in df.index:

        # deconstruct the current index
        [year, code] = index

        # get the polarity/sentiment score of the speech
        scores = sia.polarity_scores(df.loc(axis=0)[index]["Speech"])
        
        # get the values from the scores dict
        neg = scores['neg']
        neu = scores['neu']
        pos = scores['pos']
        compound = scores['compound']
        
        # add the index and scores of this speech to the sentiment list
        sentiments.append([year, code, neg, neu, pos, compound])

    # convert the sentiment list to a dataframe with the ['Year', 'ISO-aplha3 Code'] index
    return pd.DataFrame(sentiments, columns=['Year', 'ISO-alpha3 Code', 'Neg', 'Neu', 'Pos', 'Compound']).set_index(['Year', 'ISO-alpha3 Code'])


In [None]:
sentiment_df = sentiment(preprocessed_df)

In [None]:
sentiment_df.aggregate([min, max, np.mean, np.var])

In [42]:
sentiment_df.loc[[ t for t in sentiment_df.index if t[1] == 'USA']].aggregate([min, max, np.mean, np.var])

Unnamed: 0,Neg,Neu,Pos,Compound
min,0.076,0.461,0.213,-0.9997
max,0.315,0.617,0.354,1.0
mean,0.152941,0.556157,0.290804,0.846169
var,0.002006,0.001142,0.001123,0.282661


In [44]:
sentiment_df.to_csv('sentiment.csv')