# Dynamic Topic Model

This notebook includes code for creating window topic models for each year of congressional speech in the House for each party, and a dynamic topic model as per the Dynamic Non-Negative Matrix Factorization approach described by Greene (2019). 


Previous development of these models indicates that the most interpretable and coherent models generally fall between 45 and 60 topics for each year/party respectively. Little difference is made in the interpretability and coherence of models within this range. For this reason a middleground of 50 topics is used for all models.

In [2]:
import pandas as pd
import numpy as np

import gensim

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from multiprocess import Pool

import os
import joblib

import boto3
client = boto3.client('s3')

# Code from https://github.com/derekgreene/dynamic-nmf
%run Greene_dnmf.py

# get procedural stop words
%run procedural_stop_words.py

In [3]:
def term_rankings(H,terms,ntop):
    term_rankings = []
    for topic_index in range(H.shape[0]):
        top_indices = np.argsort(H[topic_index,:])[::-1]
        term_ranking = [terms[i] for i in top_indices[:ntop]]
        term_rankings.append(term_ranking)
    return term_rankings

def get_top_words(vect):
    splits = [[z for z in i.split() if z in vocab] for i in vect]
    docs = [x for sublist in splits for x in sublist]
    counts = Counter(docs)
    top_10 = [i[0] for i in counts.most_common()][:20]
    return top_10

In [33]:
def run_window_NMF(congress):
    
    """
    Reads in data for a given congress, runs NMF models for every year 
    in that congress for a specific party.
    
    congress: what congress to use
    """

    k = 65
    
    # read in data from S3
    DF = pd.read_csv(client.get_object(Bucket='ascsagemaker',
                                       Key=f'JMP_congressional_nmf/House_bigrams/{congress:0>3}_fixed_party.csv')['Body'])
    
    # remove speeches with no party labels
    DF = DF.loc[-DF.party_y.isnull()]

    # partse to only the house and party of interest
    DF = DF.loc[(DF.chamber_x == 'H')]

    DF['date'] = pd.to_datetime(DF.date)  # to date time
    
    years = pd.to_datetime(DF.date).dt.year.unique() # what years are included in this congress
    if congress == 112:  #  112th congress includes overlap year with 113th
        years = years[:2]
    models = []
    
    #  for each year run a NMF window topic model
    for year in years:
        sub_df = DF.loc[DF.date.dt.year == year]

        # prepare TfIDF DTM
        vectorizer = TfidfVectorizer(min_df=0.001,max_df=0.30,stop_words=procedural_stop_words,use_idf=True)
        dtm = vectorizer.fit_transform(sub_df.speech_processed)
        vocab = vectorizer.get_feature_names()

        # run model
        model = NMF(n_components=k,max_iter=5000,init='nndsvd')
        W = model.fit_transform(dtm)
        H = model.components_
        print(f'{year} - {len(sub_df)} speeches')
        
        # return information packet
        models.append({"W":W,"H":H,"vocab":vocab,"window_labels":[f'{year}_{i}' for i in range(100)],"DF_index":DF.index})
        
    return models

## Run batch process of every year (34 models)

In [34]:
with Pool(10) as p:
    output = p.map(run_window_NMF,range(98,115))

1983 - 19450 speeches
1997 - 18073 speeches
1993 - 18404 speeches
1991 - 18944 speeches
1987 - 19235 speeches
1998 - 18098 speeches
1999 - 18494 speeches
2001 - 15038 speeches
1985 - 21288 speeches
1984 - 20144 speeches
1988 - 16095 speeches
1994 - 16559 speeches
1989 - 14525 speeches
1992 - 15767 speeches
2002 - 12114 speeches
2003 - 17069 speeches
2009 - 14239 speeches
1995 - 30208 speeches
1986 - 17440 speeches
1990 - 18562 speeches
2011 - 17205 speeches
2013 - 13921 speeches
2000 - 17704 speeches
2005 - 17568 speeches
2012 - 12306 speeches
2007 - 24529 speeches
2014 - 11670 speeches
2004 - 14759 speeches
2015 - 14138 speeches
2010 - 15235 speeches
2016 - 8806 speeches
2008 - 14826 speeches
2006 - 15252 speeches
1996 - 17943 speeches


## Prepare window NMF for dynamic level

In [14]:
outputs = [model for sublist in output for model in sublist]

for out in outputs:
    out['year'] = int(out['window_labels'][0].split('_')[1])
    out['party'] = out['window_labels'][0].split('_')[0]
    out['topics'] = term_rankings(out['H'],out['vocab'],ntop=10)
    
collection = TopicCollection()
for model in outputs:
    collection.add_topic_model(model['H'],model['vocab'],model['window_labels'])
    
Mat, full_vocab = collection.create_matrix()

## Run Dynamic Model
Use a range of K and test them out

In [15]:
checks = []
for k in range(50,150,10):
    second_level = NMF(n_components=k,max_iter=5000,init='nndsvda')
    W = second_level.fit_transform(Mat)
    H = second_level.components_
    terms = term_rankings(H,full_vocab,ntop=20)
    print(k)

50


KeyboardInterrupt: 

In [25]:
terms

[['information',
  'agency',
  'public',
  'datum',
  'intelligence',
  'require',
  'privacy',
  'cybersecurity',
  'share',
  'use',
  'internet',
  'disclosure',
  'protect',
  'library',
  'transparency',
  'network',
  'concern',
  'cyber',
  'private',
  'available'],
 ['business',
  'schedule',
  'expect',
  'next_week',
  'suspension_rule',
  'list',
  'announce',
  'economy',
  'company',
  'hire',
  'noon',
  'oclock',
  'close',
  'postpone',
  'several',
  'morninghour',
  'small',
  'conclude',
  'uncertainty',
  'grow'],
 ['vote',
  'motion_recommit',
  'final_passage',
  'opportunity',
  'miss',
  'defeat',
  'chance',
  'unable',
  'vote_yea',
  'voting',
  'detain',
  'body',
  'cast',
  'today',
  'yea',
  'roll',
  'last',
  'cast_vote',
  'reason',
  'side_aisle'],
 ['state',
  'governor',
  'department',
  'local',
  'union',
  'federal_government',
  'home',
  'secretary',
  'whole',
  'grant',
  'require',
  'federal',
  'state_legislature',
  'own',
  'compact',

The process for labelling topics involves first reading the dynamic topic label and establishing a code from the CAP. Next, the window topic descriptions are investigated to look for consistency with the main topic and label confirmed, window topics are labelled with more unique names, or a miss labelling is marked. After this procedure, a final close reading of 2 documents from each window are selected for final confirmation. 

In [31]:
second_level = NMF(n_components=100,max_iter=5000,init='nndsvd')
W = second_level.fit_transform(Mat)
H = second_level.components_
terms = term_rankings(H,full_vocab,ntop=10)

In [32]:
for ix,topic in enumerate(terms):

    print(topic,end='\n\n')

['nation', 'world', 'united', 'great', 'freedom', 'history', 'democracy', 'international', 'economic', 'future']

['tax', 'taxis', 'revenue', 'tax_code', 'taxpayer', 'income', 'rate', 'irs', 'raise_taxis', 'capital_gain']

['way', 'fact', 'mean', 'kind', 'happen', 'find', 'believe', 'lot', 'able', 'suggest']

['education', 'teacher', 'educational', 'school_district', 'department', 'improve', 'investment', 'local', 'elementary_secondary', 'classroom']

['vote', 'motion_recommit', 'final_passage', 'miss', 'defeat', 'voting', 'detain', 'chance', 'unable', 'vote_yea']

['budget', 'budget_resolution', 'balance', 'priority', 'alternative', 'proposal', 'propose', 'fiscal_year', 'fiscal', 'deficit']

['state', 'governor', 'department', 'local', 'union', 'federal_government', 'home', 'whole', 'secretary', 'grant']

['veteran', 'va', 'veteran_affair', 'benefit', 'care', 'compensation', 'serve', 'world_war', 'hospital', 'sacrifice']

['president', 'veto', 'sign', 'white_house', 'send', 'president

In [60]:
import sys

In [123]:
Final = {"window_models":outputs,"dynamic_model":{"k":120,'H':H,'W':W,'collection_mat':Mat,'collection_vocab':full_vocab},'mapper':mapper}

In [94]:
outputs[0]['DF_index']

Int64Index([   85,    87,    88,    89,    90,    91,    92,    93,    94,
               95,
            ...
            77805, 77809, 77810, 77811, 77814, 77816, 77818, 77819, 77820,
            77821],
           dtype='int64', length=14277)

In [71]:
dynamic_labels = pd.read_csv('120 topics.csv')

In [124]:
with open('Official_results_final_models.pkl','wb') as File:
    joblib.dump(Final,File)

In [95]:
dynamic_labels

Unnamed: 0,dynamic_topic_ix,Major Topic,Sub_topic
0,0,,
1,1,procedural,
2,2,macroeconomics,national budget
3,3,government operations,intergovernmental relations
4,4,procedural,
...,...,...,...
115,115,defense,foreign operations
116,116,health,medical facilities
117,117,,
118,118,law crime,white collar crime


In [None]:
# assigned_topics = W.argmax(1)
# assigned_labels = [dynamic_labels['Major Topic'].values[i] for i in assigned_topics]
# mapper = []
# for i in range(len(collection.topic_ids)):
#     party,year,topic = collection.topic_ids[i].split('_')
#     topic_label = assigned_labels[i]
#     topic_ix = assigned_topics[i]
#     mapper.append({"party":party,'year':year,'topic_id':topic,'dynamic_label':topic_label})

# mapper = pd.DataFrame(mapper)

# def get_topic_terms(x):
#     party = x['party']
#     year = int(x['year'])
#     ix = int(x['topic_id'])
#     for out in outputs:
#         if out['year'] == year and out['party'] == party:
#             return out['topics'][ix]
        
        
# mapper['window_terms'] = mapper.apply(get_topic_terms,1)
# mapper = mapper.sort_values(by='year',ascending=True)

In [116]:
party,year,topic = collection.topic_ids[1].split('_')

In [119]:
assigned_topics[1]

10

In [120]:
mapper

Unnamed: 0,party,year,topic_id,dynamic_label,window_terms
0,Rep,1981,0,28,"[land, county, federal, agricultural, resource..."
2551,Dem,1981,31,23,"[want, give, see, help, job, good, way, put, f..."
2550,Dem,1981,30,111,"[request, amount, reduction, supplemental, tot..."
2549,Dem,1981,29,6,"[child, woman, family, mother, care, parent, s..."
2548,Dem,1981,28,10,"[rate, high_interest, interest_rate, economic,..."
...,...,...,...,...,...
2497,Rep,2016,47,105,"[bank, financial_institution, financial, small..."
2498,Rep,2016,48,104,"[go, look, come, happen, see, know, number, te..."
2499,Rep,2016,49,91,"[internet, fcc, regulate, rate, broadband, con..."
2493,Rep,2016,43,20,"[small_business, access_capital, contract, bus..."


In [49]:
assigned_topics = W.argmax(1)

mapper = []
for i in range(len(collection.topic_ids)):
    party,year,topic = collection.topic_ids[i].split('_')
    topic_label = assigned_topics[i]
    mapper.append({"party":party,'year':year,'topic_id':topic,'dynamic_label':topic_label})

mapper = pd.DataFrame(mapper)

def get_topic_terms(x):
    party = x['party']
    year = int(x['year'])
    ix = int(x['topic_id'])
    for out in outputs:
        if out['year'] == year and out['party'] == party:
            return out['topics'][ix]
        
        
mapper['window_terms'] = mapper.apply(get_topic_terms,1)
mapper = mapper.sort_values(by='year',ascending=True)

In [50]:
mapper

Unnamed: 0,party,year,topic_id,dynamic_label,window_terms
0,Rep,1981,0,8,"[president, president_reagan, proposal, plan, ..."
2901,Dem,1981,21,86,"[voting_right, minority, right, voting, citize..."
2902,Dem,1981,22,13,"[tax, way_mean, tax_cut, taxis, small_business..."
2903,Dem,1981,23,104,"[rate, high_interest, interest_rate, small_bus..."
2904,Dem,1981,24,39,"[law_enforcement, drug, arrest, equipment, age..."
...,...,...,...,...,...
2855,Rep,2016,55,75,"[north_korea, north_korean, regime, sanction, ..."
2856,Rep,2016,56,43,"[housing, public_housing, opportunity, hud, as..."
2857,Rep,2016,57,54,"[medicaid, care, patient, provider, physician,..."
2839,Rep,2016,39,84,"[employee, federal, hire, employer, official, ..."


In [55]:
val = 117
print("---------------------------------------------------")
print(terms[val])
print("---------------------------------------------------\n\n")
for i in mapper.loc[mapper.dynamic_label == val].iterrows():
    print(i[1]['party'] + '-' + i[1]['year'] + '-' + i[1]['topic_id'])
    print(i[1]['window_terms'])
    print('________________________\n')

---------------------------------------------------
['gun', 'firearm', 'gun_violence', 'weapon', 'criminal', 'ban', 'handgun', 'background_check', 'assault_weapon', 'kill']
---------------------------------------------------


Dem-1986-25
['handgun', 'gun', 'firearm', 'crime', 'law_enforcement', 'criminal', 'dealer', 'volkmer', 'sportsman', 'police']
________________________

Rep-1986-45
['handgun', 'firearm', 'volkmer_substitute', 'gun', 'law_enforcement', 'criminal', 'sportsman', 'transport', 'crime', 'hughe']
________________________

Dem-1988-45
['handgun', 'crime', 'gun', 'brady', 'hate_crime', 'law_enforcement', 'police', 'firearm', 'wait_period', 'violence']
________________________

Dem-1989-65
['production', 'weapon', 'arm_control', 'chemical_weapon', 'plutonium', 'agreement', 'negotiation', 'ban', 'treaty', 'produce']
________________________

Rep-1991-32
['brady_bill', 'criminal', 'gun', 'stagger', 'handgun', 'brady', 'system', 'background_check', 'check', 'firearm']
_______