# Dynamic Topic Model

This notebook includes code for creating window topic models for each year of congressional speech in the House for each party, and a dynamic topic model as per the Dynamic Non-Negative Matrix Factorization approach described by Greene (2019). 


Previous development of these models indicates that the most interpretable and coherent models generally fall between 45 and 60 topics for each year/party. Interpretability and coherence differ little within this range, so a middle ground of 50 topics is used for all models. (Note: the window-model code below currently uses 45 topics; confirm which value is intended.)

In [1]:
import pandas as pd
import numpy as np

import gensim

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from multiprocess import Pool

import os
import joblib

from tqdm import tqdm
import boto3
client = boto3.client('s3')

# Code from https://github.com/derekgreene/dynamic-nmf
%run Greene_dnmf.py

# get procedural stop words
%run procedural_stop_words.py

# model evaluation tool 
%run Model_description_evaluation_widget.py
%matplotlib inline


In [2]:
def term_rankings(H,terms,ntop):
    """
    Return the highest-weighted terms for every row of H.

    H: 2-D array indexed as H[topic, term]
    terms: sequence mapping a term column index to its string
    ntop: number of terms to keep per topic

    Returns a list (one entry per row of H) of lists of terms,
    ordered from the largest weight downwards.
    """
    n_topics = H.shape[0]
    # argsort is ascending, so reverse each row's ordering to get descending weights
    descending = (np.argsort(H[row,:])[::-1] for row in range(n_topics))
    return [[terms[ix] for ix in order[:ntop]] for order in descending]

def get_top_words(vect, vocabulary=None, ntop=20):
    """
    Return the most frequent in-vocabulary tokens across a collection of texts.

    vect: iterable of whitespace-delimited strings
    vocabulary: optional collection of allowed tokens; when omitted, the
        notebook-level `vocab` variable is used (the original behaviour)
    ntop: number of tokens to return (the original always returned 20)

    Returns a list of tokens ordered from most to least frequent.
    """
    # Counter is not imported in the notebook's import cell, so import it here
    from collections import Counter

    # a set gives constant-time membership tests instead of scanning a list per token
    allowed = set(vocab if vocabulary is None else vocabulary)
    counts = Counter(token for text in vect for token in text.split() if token in allowed)
    return [token for token, _ in counts.most_common(ntop)]

In [11]:
def run_window_NMF(congress, k=45):
    
    """
    Reads in data for a given congress and fits one NMF window topic model
    per calendar year of House speeches in that congress.
    
    congress: what congress to use (formatted into the S3 key, padded to 3 characters)
    k: number of topics per window model (defaults to the previously hard-coded 45)
    
    Returns a list with one dict per year:
        W, H          - factors returned by the fitted NMF model
        vocab         - feature names of that year's TF-IDF vectorizer
        window_labels - one '<year>_<topic index>' label per topic
        year          - the calendar year the model covers
        DF_index      - index of the speeches (rows of W) used for that year
        topics        - top-10 terms per topic
    
    NOTE: no party filter is applied here; all speeches with a party label
    are modelled together.
    """

    # read in data from S3
    response = client.get_object(Bucket='ascsagemaker',
                                 Key=f'JMP_congressional_nmf/House_bigrams/{congress:0>3}_fixed_party.csv')
    DF = pd.read_csv(response['Body'])
    
    # keep House speeches that have a party label; copy so the date column
    # can be replaced without writing into a slice of the original frame
    DF = DF.loc[DF.party_y.notna() & (DF.chamber_x == 'H')].copy()

    DF['date'] = pd.to_datetime(DF.date)  # to date time
    speech_years = DF.date.dt.year
    
    years = speech_years.unique() # what years are included in this congress
    if congress == 112:  #  112th congress includes overlap year with 113th
        # keeps the first two years in order of appearance in the file
        years = years[:2]
    models = []
    
    #  for each year run a NMF window topic model
    for year in years:
        sub_df = DF.loc[speech_years == year]

        # prepare TfIDF DTM
        vectorizer = TfidfVectorizer(min_df=0.001,max_df=0.3,stop_words=procedural_stop_words,use_idf=True,)
        dtm = vectorizer.fit_transform(sub_df.speech_processed)
        # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
        # prefer get_feature_names_out() when available and keep a plain list either way
        if hasattr(vectorizer,'get_feature_names_out'):
            vocab = vectorizer.get_feature_names_out().tolist()
        else:
            vocab = vectorizer.get_feature_names()

        # run model
        model = NMF(n_components=k,max_iter=5000,init='nndsvd')
        W = model.fit_transform(dtm)
        H = model.components_
        print(f'{year} - {len(sub_df)} speeches, vocab {len(vocab)}')
        
        # return information packet
        models.append({"W":W,"H":H,
                       "vocab":vocab,
                       # one label per fitted topic (was a fixed range(100))
                       "window_labels":[f'{year}_{i}' for i in range(H.shape[0])],
                       "year":year,
                       # index of this year's speeches so it lines up with the rows of W
                       # (previously the index of the whole congress frame)
                       "DF_index":sub_df.index,
                       "topics":term_rankings(H,vocab,ntop=10)})
        
    return models

## Run batch process of every year (34 models)

In [10]:
# Fit the window models in parallel: 10 worker processes, one task per congress (98 through 114).
# `output` holds one entry per congress, each being the list of per-year model dicts
# returned by run_window_NMF.
with Pool(10) as p:
    output = p.map(run_window_NMF,range(98,115))

1989 - 14525 speeches, vocab 8266
1987 - 19235 speeches, vocab 8088
1985 - 21288 speeches, vocab 7567
1999 - 18494 speeches, vocab 8007
1983 - 19450 speeches, vocab 7565
1990 - 18562 speeches, vocab 8234
1993 - 18404 speeches, vocab 7705
1995 - 30208 speeches, vocab 6725
1988 - 16095 speeches, vocab 8069
1997 - 18073 speeches, vocab 7573
1996 - 17943 speeches, vocab 7580
1994 - 16559 speeches, vocab 8204
1991 - 18944 speeches, vocab 8387
1986 - 17440 speeches, vocab 7931
2005 - 17568 speeches, vocab 8775
1998 - 18098 speeches, vocab 7675
2001 - 15038 speeches, vocab 8018
2000 - 17704 speeches, vocab 8048
2003 - 17069 speeches, vocab 8340
2002 - 12114 speeches, vocab 8088
1992 - 15767 speeches, vocab 8416
2006 - 15252 speeches, vocab 8530
2013 - 13921 speeches, vocab 7807
1984 - 20144 speeches, vocab 7501
2009 - 14239 speeches, vocab 7854
2011 - 17205 speeches, vocab 7424
2007 - 24529 speeches, vocab 8431
2015 - 14138 speeches, vocab 7829
2004 - 14759 speeches, vocab 8516
2016 - 8806 sp

## Prepare window NMF for dynamic level

In [11]:
# Flatten the per-congress lists returned by the pool into one list of window models.
outputs = []
for congress_models in output:
    outputs.extend(congress_models)

# Register every window model's topics with the collection.
collection = TopicCollection()
for model in outputs:
    collection.add_topic_model(model['H'],model['vocab'],model['window_labels'])

# Matrix and vocabulary that the dynamic (second-level) model is fitted on.
Mat, full_vocab = collection.create_matrix()

## Run Dynamic Model


In [106]:
# Second-level (dynamic) NMF fitted on the collection matrix `Mat`, with 80 dynamic topics.
second_level = NMF(n_components=80,max_iter=5000,init='nndsvd')
# W: one row per row of Mat (presumably one per window topic - confirm against TopicCollection)
W = second_level.fit_transform(Mat)
H = second_level.components_
# top-10 terms of each dynamic topic, drawn from the collection vocabulary
terms = term_rankings(H,full_vocab,ntop=10)

In [63]:
# Browse each dynamic topic alongside the window topics assigned to it.
# This uses the in-memory results (`W` and `terms` from the dynamic model, `outputs`
# from the window models). The previous version read them from `Models`, which is
# only loaded much further down, so the cell failed on a fresh kernel.
term_list = [topic_terms for window in outputs for topic_terms in window['topics']]
mapped_df = pd.DataFrame({"window_descriptions":term_list,
                          'window_id':[' - '.join(i.split('_')) for i in collection.topic_ids],
                          # dynamic topic with the largest weight for each row of W
                          'dynamic_id':W.argmax(1)})


box = run_widget(terms,mapped_df)
box

Box(children=(Dropdown(description='Dynamic Topic #:', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, …

In [2]:
# Hand-assigned labels for the dynamic topics: position i is the label of dynamic
# topic i (it is indexed with the argmax of W below), so the order must not change.
# Labels repeat where several dynamic topics cover the same subject.
dynamic_labels = ['tribute', 'veterans', 'taxes', 'natural_resources_water', 'small_business',
 'research_science', 'abortion', 'food_assistance', 'agriculture', 'housing', 'taxes', 'veterans',
 'employment', 'crime', 'israel', 'healthcare', 'justice_courts', 'social_security', 'domestic_commerce',
 'national_debt', 'public_lands', 'procedural', 'procedural', 'transportation', 'drugs', 'space',
 'energy_oilgas', 'procedural', 'trade', 'nuclear_weapons', 'procedural', 'international_humanRights',
 'medicare', 'immigration', 'NA', 'intelligence', 'health_insurance', 'tribute', 'waters_coastguard',
 'unemployment', 'taxes', 'environment', 'arts', 'higher_education', 'procedural', 'constitution',
 'armenian_genocide', 'budget', 'schools', 'womens_issues', 'guns', 'defense_conflicts', 'disasters',
 'appropriations', 'partisans', 'transportation_air', 'defense_weapons', 'procedural', 'procedural',
 'central_america', 'public_health', 'china', 'international_humanRights', 'trade', 'NA', 'labor',
 'civil_rights_flag', 'disaster_relief', 'welfare', 'procedural', 'NA', 'energy_oilgas', 'civil_rights',
 'labor_wages', 'campaign_finance', 'research_technology', 'elections', 'banking_finance', 'procedural',
 'lending']

### DataFrame to map window topics to dynamic topics

In [41]:
# Dynamic topic with the largest weight for every row of W, and its hand-assigned label
assigned_topics = W.argmax(1)
assigned_labels = [dynamic_labels[i] for i in assigned_topics]

# topic ids have the form '<year>_<topic index>'; split them into separate columns
mapper = []
for i, window_topic_id in enumerate(collection.topic_ids):
    year,topic = window_topic_id.split('_')
    mapper.append({'year':year,'topic_id':int(topic),'dynamic_label':assigned_labels[i]})

mapper = pd.DataFrame(mapper)

def get_topic_terms(x):
    """
    Look up the top terms of a window topic.

    x: a row of `mapper` with 'year' and 'topic_id' entries

    Returns the term list of topic `topic_id` from the first window model in
    `outputs` whose year matches, or None when no model covers that year.
    """
    year = int(x['year'])
    ix = int(x['topic_id'])
    for out in outputs:
        if out['year'] == year:
            return out['topics'][ix]
    return None
        
        
mapper['window_terms'] = mapper.apply(get_topic_terms,axis=1)
# 'year' is still a string here; four-digit years sort the same way as integers
mapper = mapper.sort_values(by='year',ascending=True)

In [42]:
# Inspect the window-topic -> dynamic-label mapping
mapper

Unnamed: 0,year,topic_id,dynamic_label,window_terms
0,1983,0,justice_courts,"[procedure, investigation, staff, court, condu..."
25,1983,25,procedural,"[substitute, original, mica, wylie, amend, gen..."
26,1983,26,trade,"[trade, import, export, foreign, japanese, mar..."
27,1983,27,civil_rights,"[king, martin_luther, dr_king, holiday, nation..."
28,1983,28,civil_rights,"[civil_right, commissioner, independence, inde..."
...,...,...,...,...
1502,2016,17,food_assistance,"[food, nutrition, consumer, restaurant, calori..."
1503,2016,18,domestic_commerce,"[fcc, internet, rate, broadband, consumer, reg..."
1504,2016,19,banking_finance,"[bank, financial, financial_institution, regul..."
1494,2016,9,higher_education,"[student, college, high_education, university,..."


In [43]:
# Save Final Model/models
# 'k' is read from H so it always matches the fitted dynamic model
# (it was hard-coded to 95 although the model is fitted with 80 components).
Final_output = {"window_models":outputs,
                'dynamic_model':{'k':H.shape[0],'H':H,'W':W,'collection_mat':Mat,'collection_vocab':full_vocab,'terms':terms},
                'mapper':mapper}

# Written under Results/, which is where the validation section reads the file back from.
os.makedirs('Results',exist_ok=True)
with open('Results/Official_TopicModel_80k.pkl','wb') as File:
    joblib.dump(Final_output,File)

## Assign labels to speeches

In [40]:
# load back in all the data
dfs = []
for congress in range(98,115): 

    DF2 = pd.read_csv(client.get_object(Bucket='ascsagemaker',
                                           Key=f'JMP_congressional_nmf/House_bigrams/{congress:0>3}_fixed_party.csv')['Body'])
    DF2['date'] = pd.to_datetime(DF2['date'])
    
    # drop the overlap year from the 112th congress file
    if congress == 112:
        DF2 = DF2.loc[DF2.date.dt.year != 2013]
        
    dfs.append(DF2)
    
ldf = pd.concat(dfs)
# Apply the same row filters as run_window_NMF (party label present, House chamber)
# so the speeches of each year line up row-for-row with that year's W matrix.
ldf = ldf.loc[ldf.party_y.notna() & (ldf.chamber_x == 'H')]

In [42]:
# (year, window topic) -> dynamic label lookup. Copy the column subset so adding
# int_year does not write into a slice of the saved mapper (explicit copies replace
# the global suppression of the chained-assignment warning).
code_mapper = Final_output['mapper'][['year','topic_id','dynamic_label']].copy()
code_mapper['int_year'] = code_mapper.year.astype(int)

speech_years = ldf.date.dt.year

labelled_df = []
for year in tqdm(speech_years.unique()):
    # subset given year and find window W matrix
    sub_df = ldf.loc[speech_years == year].copy()
    # read the window models from Final_output, like the mapper above
    # (`Models` is only loaded later in the notebook)
    window_model = [model for model in Final_output['window_models'] if model['year'] == year][0]
    window_W = window_model['W']
    if window_W.shape[0] != len(sub_df):
        raise ValueError(f'{year}: W has {window_W.shape[0]} rows but {len(sub_df)} speeches were loaded')
    # strongest window topic per speech and its weight
    top_topic = window_W.argmax(1)
    sub_df['window_topic_id'] = top_topic
    sub_df['topic_weight'] = window_W[np.arange(len(top_topic)),top_topic]
    # merge on the assigned dynamic topic and year
    # (assumes the speech files carry an integer `year` column - TODO confirm)
    sub_df = sub_df.merge(code_mapper,left_on=['year','window_topic_id'],right_on=['int_year','topic_id'],how='left')
    labelled_df.append(sub_df)
    
final_DF = pd.concat(labelled_df)

100%|██████████| 34/34 [00:03<00:00, 10.79it/s]


In [52]:
# Persist the labelled speeches.
# NOTE(review): the index is written too, so reading the file back yields an extra
# unnamed column - consider index=False if nothing downstream relies on it.
final_DF.to_csv('Results/All_speeches_labelled.csv')

## Semantic Validity In Speeches

In [3]:
# Reload the labelled speeches and the saved models from Results/.
final_DF = pd.read_csv('Results/All_speeches_labelled.csv')

# NOTE: joblib.load unpickles arbitrary objects - only load files produced by this notebook.
with open('Results/Official_TopicModel_80k.pkl','rb') as File:
    Models = joblib.load(File)
    
# Pair each hand-assigned dynamic label with that dynamic topic's top terms
# (relies on `dynamic_labels` from the labelling cell above being defined).
dynamic_topics = Models['dynamic_model']['terms']
dynamic_df = pd.DataFrame({"labels":dynamic_labels,'terms':dynamic_topics})
mapper = Models['mapper']

In [11]:
# Spot-check: top terms of the dynamic topic labelled 'disaster_relief'
dynamic_df.loc[dynamic_df.labels == 'disaster_relief'].terms.values

array([list(['emergency', 'disaster', 'fema', 'flood', 'flood_insurance', 'damage', 'offset', 'hurricane', 'relief', 'victim'])],
      dtype=object)

In [58]:
# Build the speech-browsing widget from the labelled speeches and the topic mapper.
# Run_speech_widget is not defined in this notebook - presumably it comes from
# Model_description_evaluation_widget.py (confirm).
Box = Run_speech_widget(final_DF,mapper)

In [64]:
# Display the widget built in the previous cell
Box

Box(children=(Dropdown(description='Dynamic Topic #:', index=5, options=('procedural', 'energy_oilgas', 'inter…