In [27]:
pip install corpus-toolkit


Collecting corpus-toolkit
  Downloading corpus_toolkit-0.32-py3-none-any.whl (1.7 MB)
     |████████████████████████████████| 1.7 MB 27.0 MB/s            
[?25hInstalling collected packages: corpus-toolkit
Successfully installed corpus-toolkit-0.32
Note: you may need to restart the kernel to use updated packages.


In [28]:
import pandas as pd
import numpy as np

import gensim

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from multiprocess import Pool
from corpus_toolkit import corpus_tools as ct

import os
import joblib

import boto3
# Notebook-level S3 client; run_window_NMF reads its input CSV through it.
client = boto3.client('s3')

# Code from https://github.com/derekgreene/dynamic-nmf
# NOTE(review): %run executes the script inside this notebook's namespace, so whatever it
# defines becomes a notebook global — confirm which of its names later cells rely on.
%run Greene_dnmf.py

# get procedural stop words
# presumably defines `procedural_stop_words`, which run_window_NMF passes to
# TfidfVectorizer(stop_words=...) — verify against the script.
%run procedural_stop_words.py

It appears that you do not have spacy installed on your computer. Without installing Spacy, the tag(), and tag_corpus() functions won't work properly.
It appears that you haven't downloaded the default language model for Spacy 'en_core_web_sm'. If you intend to tag/parse your corpus, please make sure you have a model downloaded. If you wish to use a model other than the default one, then load it before proceeding: 'nlp = spacy.load('model_name')'


In [2]:
def term_rankings(H,terms,ntop):
    """
    Return the top-weighted terms of every topic.

    H     : 2-D array with one row per topic and one column per term.
    terms : sequence mapping a column position of H to its term.
    ntop  : how many terms to keep per topic.

    Returns a list with one entry per row of H; each entry is a list of at most
    `ntop` terms ordered from highest to lowest weight.
    """
    n_topics = H.shape[0]
    return [
        # argsort is ascending, so reverse it before taking the leading ntop positions
        [terms[col] for col in np.argsort(H[row, :])[::-1][:ntop]]
        for row in range(n_topics)
    ]

def get_top_words(vect, vocabulary=None, ntop=20):
    """
    Return the most frequent in-vocabulary words across a collection of texts.

    Parameters
    ----------
    vect : iterable of str
        Texts; each one is split on whitespace.
    vocabulary : iterable of str, optional
        Words that are allowed to be counted. When omitted, the notebook-level
        global `vocab` is used (a NameError is raised if that global is not defined).
    ntop : int, default 20
        Maximum number of words to return.

    Returns
    -------
    list of str
        Up to `ntop` words, most frequent first; ties keep first-seen order.
    """
    # Local import: no visible notebook cell imports Counter.
    from collections import Counter

    if vocabulary is None:
        vocabulary = vocab  # fall back to the notebook global
    # A set makes each membership test O(1) instead of scanning the vocabulary per token.
    allowed = set(vocabulary)

    counts = Counter(
        token
        for text in vect
        for token in text.split()
        if token in allowed
    )
    return [word for word, _ in counts.most_common(ntop)]

In [9]:
def run_window_NMF(info, k=50, random_state=None):
    """
    Fit one NMF "window" topic model per calendar year of a congress.

    Downloads the bigram CSV for the congress from S3 (through the notebook-level
    `client`), keeps the House speeches that carry a party label, and fits a
    TF-IDF + NMF model on the speeches of each year.

    Parameters
    ----------
    info : tuple
        (party, congress). `congress` selects the CSV that is read. `party` is
        only used in the progress message and in the topic labels; rows are NOT
        filtered by party in this function.
    k : int, default 50
        Number of NMF components (topics) fitted per year.
    random_state : optional
        Forwarded to NMF(random_state=...); pass an int to make fits repeatable.

    Returns
    -------
    models : list of dict
        One dict per year, in ascending year order, with keys
        "W" (document-topic matrix), "H" (topic-term matrix), "vocab" (list of
        terms for the columns of H), "window_labels" (one label per topic) and
        "DF_index" (index of the speeches the window was fitted on).
    DF : pandas.DataFrame
        The filtered House speeches, with `date` converted to datetime.
    """
    party, congress = info

    # read in data from S3
    response = client.get_object(Bucket='ascsagemaker',
                                 Key=f'JMP_congressional_nmf/House_bigrams/{congress:0>3}_fixed_party.csv')
    DF = pd.read_csv(response['Body'])

    # keep House speeches that have a party label; .copy() makes the result an
    # independent frame so the column assignment below does not write into a slice
    DF = DF.loc[DF.party_y.notnull() & (DF.chamber_x == 'H')].copy()

    DF['date'] = pd.to_datetime(DF.date)  # to date time

    # years covered by this congress, in chronological order regardless of row order
    years = np.sort(DF.date.dt.year.unique())
    if congress == 112:  #  112th congress includes overlap year with 113th
        years = years[:2]
    models = []

    #  for each year run a NMF window topic model
    for year in years:
        sub_df = DF.loc[DF.date.dt.year == year]

        # prepare TfIDF DTM
        vectorizer = TfidfVectorizer(min_df=0.001,max_df=0.30,stop_words=procedural_stop_words,use_idf=True)
        dtm = vectorizer.fit_transform(sub_df.speech_processed)
        # get_feature_names() is gone from recent scikit-learn; prefer the
        # replacement when present and keep returning a plain list either way
        if hasattr(vectorizer, 'get_feature_names_out'):
            vocab = list(vectorizer.get_feature_names_out())
        else:
            vocab = vectorizer.get_feature_names()

        # run model
        model = NMF(n_components=k,max_iter=5000,init='nndsvd',random_state=random_state)
        W = model.fit_transform(dtm)
        H = model.components_
        print(f'{party} - {year} - {len(sub_df)} speeches')

        # return information packet
        models.append({
            "W": W,
            "H": H,
            "vocab": vocab,
            # one label per topic, so the labels line up with the k rows of H
            "window_labels": [f'{party}_{year}_{i}' for i in range(k)],
            # index of the speeches in this window, so it lines up with the rows of W
            "DF_index": sub_df.index,
        })

    return models,DF

In [10]:
# Fit the per-year window models for congress 98. `mod` is the list of per-year result
# dicts and `DF` the House speeches they were fitted on. 'Rep' only appears in the
# progress message and topic labels; run_window_NMF does not filter rows by party.
mod,DF = run_window_NMF(('Rep',98))

Rep - 1983 - 19450 speeches
Rep - 1984 - 20144 speeches


In [11]:
# Speeches of the first window (1983). .copy() gives an independent frame so adding a
# column below does not write into a slice of DF (avoids SettingWithCopyWarning).
DF_1983 = DF.loc[DF.date.dt.year == 1983].copy()

# W has one row per speech of the window, in the same row order as the selection above;
# fail with a clear message if the two ever get out of step.
assert len(DF_1983) == mod[0]['W'].shape[0], "rows of W must line up with the 1983 speeches"

# Label each speech with the topic that has the largest weight in its row of W.
DF_1983['assigned_topic'] = mod[0]['W'].argmax(axis=1)

In [59]:
# Ten highest-weighted terms of the topic at index 30 in the first window model (mod[0]).
term_rankings(mod[0]['H'],mod[0]['vocab'],10)[30]

['woman',
 'era',
 'equal_right',
 'abortion',
 'constitution',
 'women',
 'right',
 'equal',
 'economic_equity',
 'discrimination']

In [60]:
# Speeches whose assigned topic is 30.
SS = DF_1983[DF_1983['assigned_topic'].eq(30)]

In [61]:
# Split the topic's speeches by their party label ('D' vs 'R').
SS_D = SS[SS['party_y'].eq('D')]
SS_R = SS[SS['party_y'].eq('R')]

In [62]:
# Frequency tables for each party's speeches in this topic.
# Every speech is split on whitespace and the flattened token list is handed to
# ct.tokenize, so each token is passed as its own item.
# NOTE(review): ct.tokenize presumably expects whole texts — confirm that feeding it
# single tokens is intended and does not alter tokens such as bigrams joined by "_".
D = ct.frequency(ct.tokenize([i for m in SS_D.speech_processed for i in m.split()]))
R = ct.frequency(ct.tokenize([i for m in SS_R.speech_processed for i in m.split()]))

In [63]:
# Keyness of the D frequency table against the R table with the "log-ratio" effect size.
# NOTE(review): presumably the first argument is the target and the second the reference
# corpus — confirm the argument order in the corpus_toolkit documentation.
# NOTE(review): the displayed scores all sit near 30, which looks like terms missing from
# the second table — confirm how zero counts are handled before interpreting the ranking.
corp_key = ct.keyness(D,R, effect = "log-ratio")
ct.head(corp_key, hits = 10) #to display top hits

poor	31.37620728722728
poverty	31.05427919233992
pickle_amendment	30.791244786506123
retire	30.717244205062347
hispanic	30.639241693061074
president_reagan	30.5567795328691
rape	30.46931669161876
secretary	30.37620728722728
white_house	30.276671613676367
reagan	30.276671613676367


In [64]:
# Same keyness comparison with the tables swapped (R against D).
# Note: this rebinds `corp_key`, so the D-against-R result from the earlier cell is no
# longer available under that name.
# NOTE(review): scores again sit near 30 — presumably terms missing from the second
# table; confirm how corpus_toolkit handles zero counts.
corp_key = ct.keyness(R,D, effect = "log-ratio")
ct.head(corp_key, hits = 10) #to display top hits

ratify	31.266495327091516
simplistic	31.07385024914912
unconstitutional	30.381972544511452
madison	30.140964445007658
unborn_child	30.140964445007658
senator_hatch	30.140964445007658
adoption	30.003460921257723
method	30.003460921257723
saline	29.85145782781267
strict	29.85145782781267
