# Dynamic Topic Model

This notebook includes code for creating window topic models for each year of congressional speech in the House for each party, and a dynamic topic model as per the Dynamic Non-Negative Matrix Factorization approach described by Greene (2019). 


Previous development of these models indicated that the most interpretable and coherent models generally fall between 45 and 60 topics for each year/party. Interpretability and coherence differ little within this range, so a middle ground of 50 topics was the initial choice for all models. Note that the code below uses the values selected after further testing (see "Run Dynamic Model"): 65 topics for each window model and 95 topics for the dynamic model.

In [1]:
import pandas as pd
import numpy as np

import gensim

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from multiprocess import Pool

import os
import joblib

import boto3
client = boto3.client('s3')

# Code from https://github.com/derekgreene/dynamic-nmf
%run Greene_dnmf.py

# get procedural stop words
%run procedural_stop_words.py

In [3]:
def term_rankings(H,terms,ntop):
    """
    List the highest-weighted terms of every topic.

    H: 2-D topic-by-term weight matrix (one row per topic)
    terms: sequence mapping a column index of H to its term
    ntop: maximum number of terms to keep per topic

    Returns one list per row of H, holding that row's terms ordered from
    the largest weight to the smallest and cut off after `ntop` entries.
    """
    return [
        # ascending argsort, reversed -> column indices from heaviest to lightest
        [terms[col] for col in np.argsort(H[row, :])[::-1][:ntop]]
        for row in range(H.shape[0])
    ]

def get_top_words(vect, vocabulary=None, ntop=20):
    """
    Return the most frequent in-vocabulary tokens across a collection of texts.

    vect: iterable of strings; each string is split on whitespace
    vocabulary: collection of tokens to count. When omitted, the notebook-level
        name `vocab` is used, which is what this function originally read
        (a NameError is raised if no such name exists).
    ntop: how many tokens to return (default 20)

    Returns a list of at most `ntop` tokens, most frequent first.
    """
    # Local import: the notebook's import cell does not bring Counter into scope.
    from collections import Counter

    if vocabulary is None:
        vocabulary = vocab
    # Set membership avoids rescanning a list-like vocabulary for every token.
    allowed = set(vocabulary)

    counts = Counter(token
                     for text in vect
                     for token in text.split()
                     if token in allowed)
    return [token for token, _ in counts.most_common(ntop)]

In [33]:
def run_window_NMF(congress, k=65):
    
    """
    Reads in data for a given congress and runs one NMF window topic model
    for every calendar year found in that congress.

    Only House speeches that carry a party label are kept; the speeches are
    not split by party inside this function.
    
    congress: what congress to use (zero-padded to 3 digits in the S3 key)
    k: number of topics for each window model (default 65)

    Returns a list with one dict per year, holding the document-topic
    matrix "W", the topic-term matrix "H", the "vocab", the per-topic
    "window_labels", the "year", the "DF_index" of the speeches used and
    the top-10 "topics" terms.
    """

    # read in data from S3
    response = client.get_object(Bucket='ascsagemaker',
                                 Key=f'JMP_congressional_nmf/House_bigrams/{congress:0>3}_fixed_party.csv')
    DF = pd.read_csv(response['Body'])

    # keep House speeches that have a party label; copy so that the column
    # assignment below operates on an independent frame, not on a slice
    keep = DF.party_y.notnull() & (DF.chamber_x == 'H')
    DF = DF.loc[keep].copy()

    DF['date'] = pd.to_datetime(DF.date)  # to date time

    years = DF.date.dt.year.unique() # what years are included in this congress
    if congress == 112:  #  112th congress includes overlap year with 113th
        years = years[:2]
    models = []

    #  for each year run a NMF window topic model
    for year in years:
        sub_df = DF.loc[DF.date.dt.year == year]

        # prepare TfIDF DTM
        vectorizer = TfidfVectorizer(min_df=0.001,max_df=0.30,stop_words=procedural_stop_words,use_idf=True,)
        dtm = vectorizer.fit_transform(sub_df.speech_processed)
        # get_feature_names() was removed in scikit-learn 1.2; use its
        # replacement when available and fall back on older versions
        if hasattr(vectorizer, 'get_feature_names_out'):
            vocab = vectorizer.get_feature_names_out().tolist()
        else:
            vocab = vectorizer.get_feature_names()

        # run model
        # NOTE(review): no random_state is passed; set one if exact
        # run-to-run reproducibility is required.
        model = NMF(n_components=k,max_iter=5000,init='nndsvd')
        W = model.fit_transform(dtm)
        H = model.components_
        print(f'{year} - {len(sub_df)} speeches')

        # return information packet
        models.append({"W":W,"H":H,
                       "vocab":vocab,
                       # one '<year>_<topic index>' label per fitted topic
                       "window_labels":[f'{year}_{i}' for i in range(H.shape[0])],
                       "year":year,
                       # index of the speeches this model was fitted on, so it
                       # lines up row-for-row with W
                       "DF_index":sub_df.index,
                       "topics":term_rankings(H,vocab,ntop=10)})

    return models

## Run batch process of every year (34 models)

In [34]:
# Fit the window models for congresses 98 through 114 on 10 worker processes.
# `output` holds one list of per-year model dicts for each congress.
with Pool(processes=10) as worker_pool:
    output = worker_pool.map(run_window_NMF, range(98, 115))

1983 - 19450 speeches
1997 - 18073 speeches
1993 - 18404 speeches
1991 - 18944 speeches
1987 - 19235 speeches
1998 - 18098 speeches
1999 - 18494 speeches
2001 - 15038 speeches
1985 - 21288 speeches
1984 - 20144 speeches
1988 - 16095 speeches
1994 - 16559 speeches
1989 - 14525 speeches
1992 - 15767 speeches
2002 - 12114 speeches
2003 - 17069 speeches
2009 - 14239 speeches
1995 - 30208 speeches
1986 - 17440 speeches
1990 - 18562 speeches
2011 - 17205 speeches
2013 - 13921 speeches
2000 - 17704 speeches
2005 - 17568 speeches
2012 - 12306 speeches
2007 - 24529 speeches
2014 - 11670 speeches
2004 - 14759 speeches
2015 - 14138 speeches
2010 - 15235 speeches
2016 - 8806 speeches
2008 - 14826 speeches
2006 - 15252 speeches
1996 - 17943 speeches


## Prepare window NMF for dynamic level

In [84]:
# NOTE(review): `Mat` is created by `collection.create_matrix()` in the cell that
# follows, so on a fresh kernel this cell only works after that cell has run.
Mat.shape

(2210, 23069)

In [61]:
# Flatten the per-congress lists returned by the pool into one list of window models.
outputs = [window
           for congress_models in output
           for window in congress_models]

# Register every window model's topics with the collection.
collection = TopicCollection()
for model in outputs:
    collection.add_topic_model(
        model['H'],
        model['vocab'],
        model['window_labels'],
    )

# Matrix of all window topics plus the vocabulary shared across windows.
Mat, full_vocab = collection.create_matrix()

## Run Dynamic Model

After testing many different values of window K and dynamic K, the following was found. Beyond a window K of 50, there is little difference in the number of substantive topics identified. Beyond a window K of 70, topics seem to start repeating more than twice. A window K of 65 was thus chosen for all window models.

Dynamic K was selected by testing a range between 65 and 100 by increments of 5. Meaningful separation of topics was found at 95 topics. As such this number, 95, was selected for the final model. 

In [45]:
# Second-level NMF: factorise the matrix of window topics into 95 dynamic topics.
second_level = NMF(
    n_components=95,
    max_iter=5000,
    init='nndsvd',
)
W = second_level.fit_transform(Mat)   # window topic x dynamic topic weights
H = second_level.components_          # dynamic topic x term weights

# Top 10 terms of each dynamic topic.
terms = term_rankings(H, full_vocab, ntop=10)

In [46]:
# Print the top terms of each dynamic topic, separated by a blank line.
for topic_terms in terms:
    print(topic_terms, end='\n\n')

['provide', 'assistance', 'additional', 'include', 'request', 'help', 'ensure', 'resource', 'grant', 'necessary']

['pay', 'taxis', 'raise', 'taxpayer', 'salary', 'payment', 'price', 'interest', 'income', 'wage']

['serve', 'honor', 'great', 'life', 'man', 'miss', 'many', 'congressman', 'remember', 'love']

['state', 'governor', 'department', 'union', 'local', 'federal_government', 'home', 'whole', 'secretary', 'medicaid']

['vote', 'motion_recommit', 'final_passage', 'voting', 'miss', 'defeat', 'detain', 'vote_yea', 'chance', 'unable']

['budget', 'budget_resolution', 'balance', 'priority', 'alternative', 'proposal', 'fiscal_year', 'deficit', 'propose', 'fiscal']

['benefit', 'help', 'unemployed', 'unemployment', 'unemployment_benefit', 'extend_unemployment', 'emergency', 'extend', 'welfare', 'insurance']

['veteran', 'va', 'veteran_affair', 'benefit', 'care', 'compensation', 'nation', 'serve', 'world_war', 'hospital']

['president', 'veto', 'sign', 'white_house', 'send', 'president_b

In [2]:
# Hand-assigned label for each of the 95 dynamic topics. The position in this list is
# the dynamic topic index: the labels are looked up with the argmax column of the
# second-level W matrix.
# NOTE(review): 'fanancial_assistance' and 'defense_coastgaurd' look like misspellings;
# they are kept verbatim here because these strings end up in the saved mapper.
dynamic_labels = [
    # 0-4
    'fanancial_assistance', 'taxes_income', 'tribute', 'government_state', 'procedural',
    # 5-9
    'budget', 'unemployment_benefits', 'veterans', 'executiveBranch', 'spending',
    # 10-14
    'NA', 'family_children', 'education_higherEd', 'domesticCommerce', 'womensHealth',
    # 15-19
    'civilRights', 'domesticCommerce_smallBusiness', 'taxes_general', 'family_benefits', 'government_federal',
    # 20-24
    'partisanship', 'spending_reduce', 'macroeconomics_jobs', 'infrastructure_water', 'procedural',
    # 25-29
    'publicLand', 'procedural', 'defense_procurement', 'healthcare_insurance', 'government_general',
    # 30-34
    'procedural', 'tribute', 'socialSecurity', 'healthcare_retirement', 'agriculture',
    # 35-39
    'drugs', 'energy', 'spending_general', 'researchTech', 'abortion',
    # 40-44
    'procedural', 'NA', 'communityDevelopment', 'procedural', 'spending_general',
    # 45-49
    'procedural', 'internationalAffairs_middleEast', 'politicalCampaigns', 'foreignTrade', 'procedural',
    # 50-54
    'foreignTrade', 'healthcare_retirement', 'domesticCommerce_banking', 'naturalResources_water', 'internationalAffairs_china',
    # 55-59
    'crime_violent', 'housing_general', 'war', 'taxes_general', 'NA',
    # 60-64
    'internationalAffairs_iran', 'procedural', 'justice_courts', 'tribute', 'energy_oil',
    # 65-69
    'procedural', 'procedural', 'defense_intelligence', 'internationalAffairs_foreignAid', 'NA',
    # 70-74
    'internationalAffairs_armenian', 'government_DC', 'macroeconomics_debt', 'internationalAffairs_russia', 'procedural',
    # 75-79
    'defense_general', 'education_schools', 'internationalAffairs_general', 'spending_arts', 'infrastructure_transportation',
    # 80-84
    'defense_coastgaurd', 'defense_homelandSecurity', 'environment_general', 'crime_guns', 'civilRights_flags',
    # 85-89
    'government_employees', 'NA', 'procedural', 'labor_general', 'researchTech_space',
    # 90-94
    'regulation_general', 'immigration_general', 'internationalAffairs_centralAmerica', 'NA', 'budget',
]

### DataFrame to map window topics to dynamic topics

In [112]:
# Assign every window topic (row of W) to the dynamic topic with the largest weight.
# Note: argmax returns 0 for a row whose weights are all equal (e.g. all zeros).
assigned_topics = W.argmax(1)
assigned_labels = [dynamic_labels[i] for i in assigned_topics]

# One record per window topic. Each id is split on '_' into a year and a topic index
# (presumably the '<year>_<topic index>' window labels given to the collection).
mapper_rows = []
for i, window_topic_id in enumerate(collection.topic_ids):
    year, topic = window_topic_id.split('_')
    mapper_rows.append({'year': year,
                        'topic_id': topic,
                        'dynamic_label': assigned_labels[i]})

mapper = pd.DataFrame(mapper_rows)

def get_topic_terms(x):
    """
    Look up the top terms of the window topic described by a mapper row.

    x: mapping with a 'year' and a 'topic_id' entry (both are cast to int)

    Returns the `topics` entry at position `topic_id` of the first model in
    `outputs` whose 'year' equals the row's year, or None when no model
    matches that year.
    """
    wanted_year = int(x['year'])
    topic_position = int(x['topic_id'])
    matching_terms = (window['topics'][topic_position]
                      for window in outputs
                      if window['year'] == wanted_year)
    return next(matching_terms, None)
        
        
# Attach each window topic's top terms (row-wise lookup), then order the rows by year.
mapper['window_terms'] = mapper.apply(get_topic_terms, axis=1)
mapper = mapper.sort_values('year')

In [118]:
# Save Final Model/models
# Bundle the window models, the dynamic model and the window-to-dynamic mapper
# into a single object and write it to disk with joblib.
Final_output = {
    "window_models": outputs,
    'dynamic_model': {
        'k': 95,
        'H': H,
        'W': W,
        'collection_mat': Mat,
        'collection_vocab': full_vocab,
        'terms': terms,
    },
    'mapper': mapper,
}

with open('Official_TopicModel_95k.pkl', 'wb') as pickle_file:
    joblib.dump(Final_output, pickle_file)