# Layout

+ Intro algebra
    + news
    + physics
+ doc2vec
    + press releases
+ score function
    + gensim
+ dimensions example, e.g. he-she

In [155]:
#All these packages need to be installed from pip
import gensim#For word2vec, etc
import requests #For downloading our datasets
import nltk #For stop words and stemmers
import numpy as np #For arrays
import pandas #Gives us DataFrames
import matplotlib.pyplot as plt #For graphics
import seaborn #Makes the graphics look nicer
import random

#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook.
#Also you can ignore the warning
%matplotlib inline

import os #For looking through files
import os.path #For managing file paths


# Intro

intro stuff ...

# Getting our corpora

Instead of downloading our corpora, we have downloaded them ahead of time; a subset of the [senate press releases](https://github.com/lintool/GrimmerSenatePressReleases) is in `data/grimmerPressReleases`. We will load them into a DataFrame, and to do this we first need to define a function that converts a directory of text files into a DataFrame.

In [2]:
def loadDir(targetDir, category):
    """Read every visible regular file in a directory into a DataFrame.

    Parameters
    ----------
    targetDir : str
        Directory whose files should be read.
    category : object
        Label stored in the 'category' column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per file read, with the columns 'category', 'filePath'
        and 'text'. An empty directory gives a DataFrame with no rows.

    Entries whose name starts with '.' are skipped, and so is anything that
    is not a regular file (e.g. a sub-directory), since open() cannot read it.
    """
    #The dict that will become the DataFrame, the three lists are filled
    #together so they always have the same length
    senDict = {
        'category' : [],
        'filePath' : [],
        'text' : [],
    }

    for fname in os.listdir(targetDir):
        #Filter out hidden files
        if fname.startswith('.'):
            continue
        #We need to make the name into a useable path
        fPath = os.path.join(targetDir, fname)
        #Sub-directories would make open() fail, so only keep real files
        if not os.path.isfile(fPath):
            continue
        with open(fPath) as f:
            senDict['text'].append(f.read())
        senDict['filePath'].append(fPath)
        senDict['category'].append(category)

    return pandas.DataFrame(senDict)

Now we can use the function on all the directories in `data/grimmerPressReleases`

In [4]:
dataDir = 'data/grimmerPressReleases'

#Each visible entry in dataDir is treated as one senator's directory
senatorNames = [d for d in os.listdir(dataDir) if d[0] != '.']

#Load one DataFrame per senator and combine them in a single concat.
#Growing a DataFrame with .append() copied every row on each iteration,
#and DataFrame.append was removed in pandas 2.0.
senFrames = [loadDir(os.path.join(dataDir, senatorName), senatorName) for senatorName in senatorNames]

#pandas.concat raises on an empty list, so fall back to an empty DataFrame
if senFrames:
    senReleasesDF = pandas.concat(senFrames, ignore_index = True)
else:
    senReleasesDF = pandas.DataFrame()

#Look at every tenth row of the first hundred
senReleasesDF[:100:10]

Unnamed: 0,category,filePath,text
0,Kennedy,data/grimmerPressReleases\Kennedy\01Apr2005Ken...,FOR IMMEDIATE RELEASE FOR IMMEDIATE...
10,Kennedy,data/grimmerPressReleases\Kennedy\01Dec2005Ken...,FOR IMMEDIATE RELEASE Washington ...
20,Kennedy,data/grimmerPressReleases\Kennedy\01Feb2006Ken...,FOR IMMEDIATE RELEASE Fact sheet...
30,Kennedy,data/grimmerPressReleases\Kennedy\01Feb2007Ken...,FOR IMMEDIATE RELEASE Washington ...
40,Kennedy,data/grimmerPressReleases\Kennedy\01Jun2007Ken...,FOR IMMEDIATE RELEASE BOSTON MA Se...
50,Kennedy,data/grimmerPressReleases\Kennedy\01Mar2007Ken...,FOR IMMEDIATE RELEASE Washington ...
60,Kennedy,data/grimmerPressReleases\Kennedy\01May2007Ken...,FOR IMMEDIATE RELEASE The President ...
70,Kennedy,data/grimmerPressReleases\Kennedy\01Nov2007Ken...,FOR IMMEDIATE RELEASE Washington DC...
80,Kennedy,data/grimmerPressReleases\Kennedy\02Aug2006Ken...,FOR IMMEDIATE RELEASE FOR IMMEDIATE ...
90,Kennedy,data/grimmerPressReleases\Kennedy\02Feb2005Ken...,FOR IMMEDIATE RELEASE The Preside...


# Stemming is taking a really long time due to the size of the dataset, so it's been disabled for now

We also want to remove stop words and stem, but tokenizing requires two steps. Word2Vec wants to know the sentence structure as well as simply the words, so the tokenizing is slightly different this time.

In [134]:
#Define the same function as last week
def normlizeTokens(tokenLst, stopwordLst = None, stemmer = None, lemmer = None):
    """Normalize the tokens of one sentence.

    Parameters
    ----------
    tokenLst : iterable of str
        The tokens to normalize.
    stopwordLst : iterable of str, optional
        Tokens to drop. The check is made after stemming and lemmatizing,
        so it is the transformed token that is compared.
    stemmer : object with a stem(word) method, optional
        Applied to each token when provided.
    lemmer : object with a lemmatize(word) method, optional
        Applied to each token (after the stemmer) when provided.

    Returns
    -------
    list
        The lowercased, purely alphabetic tokens that survive the filters,
        in their original order.
    """
    #We can use a generator here as we just need to iterate over it

    #Lowering the case and removing non-words
    workingIter = (w.lower() for w in tokenLst if w.isalpha())

    #Now we can use the stemmer, if provided
    if stemmer is not None:
        workingIter = (stemmer.stem(w) for w in workingIter)

    #And the lemmer
    #The result is kept as text: encoding it to utf8 bytes would stop the
    #tokens from ever matching the stop words below on Python 3 and would
    #make the return type depend on whether a lemmer was passed
    if lemmer is not None:
        workingIter = (lemmer.lemmatize(w) for w in workingIter)

    #And remove the stopwords
    if stopwordLst is not None:
        #Build the set once so each membership test is a hash lookup
        stopwordSet = frozenset(stopwordLst)
        workingIter = (w for w in workingIter if w not in stopwordSet)
    #We will return a list with the stopwords removed
    return list(workingIter)

#Initialize the helpers for normlizeTokens: the English stop word list,
#an English Snowball stemmer and a WordNet lemmatizer.
#The cells shown here pass the stop words and the lemmatizer but leave
#stemming switched off, so snowball is only kept around for later use.
stop_words_nltk = nltk.corpus.stopwords.words('english')
snowball = nltk.stem.snowball.SnowballStemmer('english')
wordnet = nltk.stem.WordNetLemmatizer()

In [None]:
#Two passes over the releases: first split each text into sentences and each
#sentence into word tokens, then normalize sentence by sentence.
#Every cell of the new columns therefore holds a list of token lists.
def _tokenizeRelease(releaseText):
    return [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(releaseText)]

def _normalizeRelease(sentLst):
    #No stemmer is handed over, stemming stays disabled for this corpus
    return [normlizeTokens(sent, stopwordLst = stop_words_nltk, stemmer = None) for sent in sentLst]

senReleasesDF['tokenized_sents'] = senReleasesDF['text'].apply(_tokenizeRelease)
senReleasesDF['normalized_sents'] = senReleasesDF['tokenized_sents'].apply(_normalizeRelease)

senReleasesDF[:100:10]

# Word2Vec

We will be using the gensim implementation of [Word2Vec](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec).

To load our data we give all the sentences to the trainer

In [30]:
#Flatten the per-release sentence lists into one list of sentences for the
#trainer. The comprehension walks the column once, whereas Series.sum()
#built a new, longer list for every release it added.
senSentences = [sent for release in senReleasesDF['normalized_sents'] for sent in release]
senReleasesW2V = gensim.models.word2vec.Word2Vec(senSentences)

Now we can look at a few things

In [31]:
#List the words the model places closest to 'president', each paired with its similarity score
senReleasesW2V.most_similar('president')

[('administration', 0.7934945225715637),
 ('presidents', 0.7433978319168091),
 ('administrations', 0.6810588836669922),
 ('george', 0.6101688146591187),
 ('cheney', 0.5860358476638794),
 ('ronald', 0.5486534237861633),
 ('responds', 0.5284579992294312),
 ('republican', 0.5097452998161316),
 ('rollback', 0.5044845342636108),
 ('reject', 0.5019153356552124)]

Get the vector

In [9]:
#Indexing the model with a word gives back that word's vector as an array
senReleasesW2V['president']

array([  5.98172605e-01,   4.08544064e-01,  -2.00023651e-02,
         6.83706284e-01,   2.35091284e-01,   1.51089147e-01,
        -3.50266784e-01,  -1.78749275e+00,  -5.86134017e-01,
         3.16251256e-02,   9.05968964e-01,   5.65631092e-01,
        -6.18162632e-01,   1.28233993e+00,  -8.36357772e-01,
        -9.27805007e-01,   2.24624681e+00,   1.43553746e+00,
        -4.64852333e-01,  -8.20181429e-01,   1.01894200e+00,
        -3.12417209e-01,  -1.17840540e+00,   8.21972936e-02,
        -4.99140956e-02,   1.39575958e+00,   1.46494284e-01,
         1.27444255e+00,  -1.05740400e-02,  -1.64721191e+00,
        -3.50008100e-01,  -1.23521760e-01,  -1.50235698e-01,
         1.78061342e+00,  -7.88266510e-02,   5.62229812e-01,
        -7.34569281e-02,  -3.37678730e-01,  -1.54030669e+00,
         1.08246878e-01,  -7.28343070e-01,  -4.37717676e-01,
        -9.53186810e-01,   9.64138806e-01,  -7.94973791e-01,
         1.58517861e+00,  -1.26691723e+00,  -5.57923675e-01,
         3.41233611e-02,

Get all the vectors

In [10]:
#syn0 holds all the word vectors together in one 2-D float32 array
senReleasesW2V.syn0

array([[ 1.09916329,  0.37337583, -0.30811408, ...,  2.64301944,
        -0.14985117, -0.47056434],
       [ 1.59650826,  1.67542541, -1.05237496, ...,  2.99215102,
        -1.2940805 ,  0.85386258],
       [ 0.59042859,  0.29354835, -0.11482511, ...,  0.39653182,
         0.65785706, -0.88093102],
       ..., 
       [ 0.05379788,  0.07380242, -0.03090563, ...,  0.03676279,
        -0.02846254, -0.01079885],
       [-0.03350566,  0.01471397,  0.02765373, ..., -0.07439245,
        -0.02612949, -0.00890458],
       [ 0.05079404,  0.04173233, -0.06164182, ...,  0.0689274 ,
         0.01979096,  0.04595068]], dtype=float32)

Find what doesn't fit

In [11]:
#Ask the model which of the given words fits least well with the others
senReleasesW2V.doesnt_match(['she', 'he', 'her', 'him', 'washington'])

'her'

Or save for use later

In [12]:
#Write the trained model to disk under data/ so it can be used later
senReleasesW2V.save("data/senpressreleasesWORD2Vec")

# APS abstracts

In [136]:
#Read the abstracts, taking the first CSV column as the index
apsDF = pandas.read_csv('data/APSabstracts1950s.csv', index_col = 0)

#Split each abstract into sentences and each sentence into word tokens
def _tokenizeAbstract(abstractText):
    return [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(abstractText)]

#Normalize sentence by sentence, this time with the WordNet lemmatizer
def _normalizeAbstract(sentLst):
    return [normlizeTokens(sent, stopwordLst = stop_words_nltk, lemmer = wordnet) for sent in sentLst]

apsDF['tokenized_sents'] = apsDF['abstract'].apply(_tokenizeAbstract)
apsDF['normalized_sents'] = apsDF['tokenized_sents'].apply(_normalizeAbstract)

Here, let's change some parameters. Let's use skip-grams instead of CBOW (continuous bag of words), have a 200-dimensional vector space, only keep words that appear at least 2 times, and iterate the training algorithm over the corpus 10 times. For more information about parametrizing word2vec models, please see [here](https://radimrehurek.com/gensim/models/word2vec.html) and [here](https://code.google.com/archive/p/word2vec/).

In [144]:
#Flatten the per-abstract sentence lists into one list of sentences in a
#single pass, Series.sum() rebuilt a longer list for every abstract it added
apsSentences = [sent for abstract in apsDF['normalized_sents'] for sent in abstract]

#sg = 1 selects skip-grams, size is the dimensionality of the vectors,
#min_count the minimum word frequency and iter the number of training passes
#NOTE(review): gensim 4 renamed size to vector_size and iter to epochs,
#the keywords need changing if the notebook is run on a newer gensim
apsW2V = gensim.models.word2vec.Word2Vec(apsSentences, sg = 1, size = 200, min_count = 2, iter = 10)

Let's save the model.

In [56]:
#Write the trained APS model to data/apsW2V
apsW2V.save('data/apsW2V')

We can then also load it.

In [57]:
#Read the model back from the file written by the save call above
apsW2V = gensim.models.word2vec.Word2Vec.load('data/apsW2V')

Now let's do some vector algebra.

In [145]:
#Vector algebra: 'newton' and 'relativity' count positively, 'einstein'
#negatively, and the five best matching words are returned
apsW2V.most_similar(positive = ['newton', 'relativity'], negative = ['einstein'], topn = 5)

[('guaranteeing', 0.7645862102508545),
 ('papapetrou', 0.7472379207611084),
 ('biquadratic', 0.7393147945404053),
 ('speck', 0.7356243133544922),
 ('admitting', 0.734847903251648)]

# News from The New York Times 

In [168]:
#Every line of the file is one paragraph, blank lines and the stray "';"
#lines are dropped. The with block closes the file even if reading fails.
with open('data/nytimes_full.txt', 'r') as f:
    paragraphs = [row for row in f if row != '\n' and row != "';\n"]

In [169]:
#Draw 5000 distinct paragraph positions at random, without replacement.
#random.sample raises a ValueError if there are fewer than 5000 paragraphs.
indices = random.sample(range(len(paragraphs)), 5000)

In [170]:
#Sort the drawn positions first so the sampled paragraphs keep the order
#they have in the file
orderedPositions = sorted(indices)
sample = [paragraphs[pos] for pos in orderedPositions]