# Topic Modelling
(https://github.com/QuantCS109/TrumpTweets/blob/master/notebooks_features/topic_modelling.ipynb)

In [3]:
import sys
sys.path.append('..') #to add top-level to path


import numpy as np
from sklearn import preprocessing 
from sklearn.cluster import KMeans
import pandas as pd
from modules.project_helper import TweetData


### This notebook uses the Word2Vec features created in [Trump_Word2Vec](https://github.com/QuantCS109/TrumpTweets/blob/master/notebooks_features/Trump_Word2Vec.ipynb) and in [trump_word2vec_features](https://github.com/QuantCS109/TrumpTweets/blob/master/notebooks_features/trump_word2vec_features.ipynb)

Creating cluster model with 25 clusters.

In [4]:

num_clusters = 25

tweet_data = TweetData()
topics_df = tweet_data.clean_tweets[tweet_data.clean_tweets.after4_date >= pd.to_datetime('1-1-2017')]

emb = pd.read_csv('../data/intermediate_data/tweet_embeddings.csv',index_col=0)
X_Norm = preprocessing.normalize(np.array(emb))
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(X_Norm)

topics_df['topic'] = kmeans.predict(X_Norm)

topics_df.groupby('topic').agg('count')

topics_analysis = pd.DataFrame()
topics_analysis['tweet_list'] = topics_df.tweets.str.split(' ')
topics_analysis['topic'] = topics_df['topic']
topics_analysis_melt = topics_analysis.explode('tweet_list')
topics_analysis_agg = topics_analysis_melt.assign(topic_count=1).groupby(['tweet_list','topic']).agg('count').reset_index()
all_count = topics_analysis_melt.groupby('tweet_list').agg(all_count=pd.NamedAgg('topic','count'))
topics_analysis_joined = topics_analysis_agg.join(all_count,on='tweet_list')
topics_analysis_joined['prop'] = topics_analysis_joined.topic_count/ topics_analysis_joined.all_count

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


# Aggregating topics at day level

We want to know how many times has trump tweeted about each 'topic' in a given day.

In [153]:
featues_df = topics_df\
        .groupby(['after4_date','topic'])['tweets']\
        .agg('count')\
        .reset_index()\
        .pivot(index='after4_date',columns='topic',values='tweets')\
        .fillna(0)\
        .apply(lambda x: x/sum(x),axis=1)
featues_df.index.name = 'date'
featues_df.head()

topic,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-02,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-03,0.0,0.222222,0.0,0.111111,0.222222,0.0,0.0,0.0,0.111111,0.0,...,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-04,0.0,0.214286,0.0,0.214286,0.142857,0.0,0.0,0.0,0.0,0.0,...,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-05,0.0,0.0,0.0,0.0,0.333333,0.166667,0.0,0.0,0.0,0.0,...,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [149]:
featues_df.to_csv(f'../data/features/topic_features_clusters={num_clusters}.csv')

## Topic Analysis

The code below prints a random sample of 10 tweets for each cluster.

In [154]:
def sample_print(df,count=5):
    np.random.seed(seed=0)
    samples = np.random.choice(len(df),10,replace=False)
    if (len(df)>0):
        for i in samples:
            print(df.tweets[i])
            print('')
    else:
        print('No tweets in cluster')
        
    

In [155]:
for cluster in range(num_clusters):
    print(f'Cluster {cluster}')
    print('')
    sample_print(topics_df[topics_df.topic==cluster])
    print('------------------------------------------------------------------------------')
    print('')

Cluster 0

congratulations 

congratulations to the tigers full ceremony 

congratulations to our new cia director gina haspel 

congratulations to the great jerry west 

congratulations to the class of 

congratulations to dana rohrabacher on his big california win we are proud of you dana 

big day for israel congratulations 

a great win for brooks congratulations to a great champion 

it was my great honor to deliver the at the congratulations to the class of 

congratulations to the philadelphia eagles on a great super bowl victory 

------------------------------------------------------------------------------

Cluster 1

mitch get back to work and put repeal amp replace tax reform amp cuts and a great infrastructure bill on my desk for signing you can do it 

it was a great day for the united states of america this is a great plan that is a repeal amp replace of obamacare 

i suggest that we add more dollars to healthcare and make it the best anywhere obamacare is dead the repub

ValueError: Cannot take a larger sample than population when 'replace=False'