In [2]:
import numpy as np
import pandas as pd
from os import walk
import os
#from google.colab import drive

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt

#NLP stuff
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *

# sklearn stuff
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/kaj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# read the speeches data from the csv (change the path so that it works for you)
df = pd.read_csv('Data/raw_data.csv').set_index(['Year', 'ISO-alpha3 Code'])

# create a subset (this step can be skipped by putting a # in front of it)
df = df.loc(axis=0)[2005:, "USA"]

In [17]:
# create a count vectorizer that 
vectorizer = TfidfVectorizer(stop_words = ['english'], token_pattern=r'\b[^\d\W\_]+\b', max_df = 0.90, min_df = 0.1)

# fit the speech data and transform it into a sparse matrix
X = vectorizer.fit_transform(df['Speech'].to_numpy())

In [18]:
# create a dataframe using the dense count matrix, index of the original speeches data and the features from the vectorizer
count_vect_df = pd.DataFrame(X.todense(), index=df.index, columns=vectorizer.get_feature_names())

In [19]:
# vectorizer.stop_words_
count_vect_df

Unnamed: 0_level_0,Unnamed: 1_level_0,abandon,abbas,abe,abide,abiding,ability,able,about,above,abroad,...,yet,yield,yielded,york,you,young,your,yourself,youth,zero
Year,ISO-alpha3 Code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2005,USA,0.032624,0.0,0.0,0.0,0.0,0.0,0.090948,0.061927,0.0,0.0,...,0.0,0.0,0.0,0.032624,0.0,0.0,0.0,0.0,0.0,0.0
2006,USA,0.044385,0.034865,0.0,0.0,0.0,0.0,0.0,0.009361,0.0,0.0,...,0.018723,0.042874,0.0,0.014795,0.417573,0.009942,0.615594,0.0,0.0,0.0
2007,USA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027923,0.0,0.0,...,0.055846,0.0,0.0,0.0,0.014828,0.014828,0.0,0.0,0.0,0.0
2008,USA,0.0,0.0,0.0,0.0,0.0,0.018787,0.0,0.0,0.0,0.0,...,0.051169,0.0,0.0,0.0,0.013586,0.067931,0.0,0.0,0.0,0.0
2009,USA,0.0,0.017923,0.0,0.0,0.0,0.02827,0.0,0.105871,0.0,0.013186,...,0.038498,0.0,0.02204,0.0,0.05111,0.0,0.013186,0.0,0.0,0.016454
2010,USA,0.0,0.042742,0.0,0.0,0.0,0.0,0.016854,0.05738,0.0,0.015722,...,0.011476,0.0,0.0,0.0,0.012188,0.024376,0.062888,0.0,0.0,0.0
2011,USA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021646,0.0,0.029655,...,0.032469,0.0,0.0,0.051316,0.0,0.011495,0.0,0.0,0.020155,0.018503
2012,USA,0.0,0.0,0.0,0.0,0.0,0.014489,0.0,0.059194,0.0,0.040548,...,0.019731,0.0,0.0,0.0,0.020956,0.031434,0.0,0.0,0.0,0.0
2013,USA,0.0,0.015722,0.0,0.0,0.019334,0.024799,0.012399,0.025329,0.0,0.0,...,0.0,0.0,0.0,0.013343,0.008967,0.035867,0.0,0.0,0.0,0.014433
2014,USA,0.015438,0.0,0.0,0.022369,0.0,0.014346,0.057384,0.019537,0.0,0.013383,...,0.019537,0.0,0.0,0.0,0.155619,0.13487,0.040148,0.022369,0.036382,0.0


In [20]:
count_vect_df.to_csv('Data/tfidf_usa.csv')