In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
%matplotlib inline
plt.style.use('fivethirtyeight')

import warnings
warnings.simplefilter("ignore", DeprecationWarning)

In [289]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.decomposition import NMF

from src.text_pipeline import text_to_vector
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
all_transcripts = pd.read_csv('tables/all_transcripts')
emotions = pd.read_csv('NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',
                       names=['word', 'emotion', 'association'], skiprows=45, sep='\t')


In [150]:
all_transcripts = all_transcripts[all_transcripts['Speaker'] != 'speaker']
all_transcripts = all_transcripts[all_transcripts['Speaker'] != 'bennet']
all_transcripts = all_transcripts[all_transcripts['Speaker'] != 'question']

In [226]:
bow_transcripts = all_transcripts.groupby('Speaker')['Line'].apply(' '.join)

In [231]:
bow_transcripts = bow_transcripts.to_frame()

In [232]:
speaker_dummies = pd.get_dummies(bow_transcripts.index)

In [233]:
vectorizer, vector, vector_pd = text_to_vector(bow_transcripts['Line'])

In [234]:
vector_pd

Unnamed: 0,000,00000702,00010101,00010336,000131,00013615,000219,000232,000243,000252,...,zach,zack,zealand,zelenski,zero,zest,zip,zippo,zone,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,18,0,3,0,2,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,8,0,0,0,3,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [284]:
lda = LDA(n_components=5, n_jobs=1)
lda.fit(vector_pd)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=1,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [285]:
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words -1:-1]]))

In [286]:
print_topics(lda, vectorizer, 10)


Topic #0:
peopl presid right just make american countri think becaus way

Topic #1:
loom ambiti drum adequ refugees tip eaten 000609 broaden boo

Topic #2:
presid texa don law want folk plan trump immigr healthcar

Topic #3:
peopl think make know presid want right just becaus say

Topic #4:
war peopl countri american serv presid chang need like regim


#### Some of the most latent topics can be seen. In the top 5 topics, you can see a focus on certain issues such as immigration, healthcare, and war.

In [130]:
for i in range(1, 51, 5):
    lda = LDA(n_components = i)
    lda.fit(X_train)
    lda_weights = lda.transform(X_train)
    rf.fit(lda_weights, y_train)
    rf_pred = rf.predict(lda.transform(X_test))
    print(accuracy_score(y_test, rf_pred), precision_score(y_test, rf_pred), recall_score(y_test, rf_pred), i)

0.10265282583621683 0.10265282583621683 1.0 1
0.8788927335640139 0.22093023255813954 0.07116104868913857 6
0.8738946559015763 0.1411764705882353 0.0449438202247191 11
0.8512110726643599 0.125 0.0749063670411985 16
0.8604382929642446 0.14705882352941177 0.0749063670411985 21
0.8558246828143022 0.14473684210526316 0.08239700374531835 26
0.8542868127643214 0.1056338028169014 0.056179775280898875 31
0.8592848904267589 0.16778523489932887 0.09363295880149813 36
0.8585159554017685 0.16556291390728478 0.09363295880149813 41
0.8600538254517494 0.12403100775193798 0.0599250936329588 46


In [243]:
cosine_sim = cosine_similarity(vector_pd, vector_pd)

In [147]:
all_transcripts.Speaker.unique()

array(['speaker', 'warren', 'klobuchar', 'orourke', 'booker', 'castro',
       'gabbard', 'blasio', 'delaney', 'ryan', 'sanders', 'bennet',
       'biden', 'harris', 'hickenlooper', 'gillibrand', 'buttigieg',
       'yang', 'swalwell', 'bullock', 'williamson', 'inslee', 'steyer',
       'bloomberg', 'question'], dtype=object)

In [244]:
indices = pd.Series(bow_transcripts.index)

def recommendations(candidate, cosine_sim = cosine_sim):
    
    # initializing the empty list of similar candidates
    similar_candidate = []
    
    # gettin the index of the candidate that matches the title
    idx = indices[indices == candidate].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 3 most similar candidate
    top_3_indexes = list(score_series.iloc[1:4].index)
    
    # populating the list with the candidates
    for i in top_3_indexes:
        similar_candidate.append(list(bow_transcripts.index)[i])
        
    return similar_candidate

In [245]:
recommendations('booker', cosine_sim)

['buttigieg', 'harris', 'warren']

In [246]:
recommendations('sanders', cosine_sim)

['warren', 'buttigieg', 'klobuchar']

In [247]:
recommendations('biden', cosine_sim)

['klobuchar', 'buttigieg', 'warren']

In [248]:
recommendations('harris', cosine_sim)

['booker', 'warren', 'sanders']

In [249]:
recommendations('klobuchar', cosine_sim)

['buttigieg', 'warren', 'biden']

In [250]:
recommendations('yang', cosine_sim)

['booker', 'warren', 'buttigieg']

#### In general, the Candidates that have found success in the debates and in the primaries (as of February 27, 2020) are similar to each other in what they say. Candidates who have not performed as well share similarities with each other.