In [139]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from gensim import corpora,models,similarities

## Load and Preview

In [140]:
# Load datasets
democracy_df = pd.read_csv('./democracy.csv')
republic_df = pd.read_csv('./republic.csv')

In [141]:
# Democracy basic info
democracy_df.head(2)

Unnamed: 0.1,Unnamed: 0,id,party_type,name,screen_name,location,description,protected,geo_enabled,verified,entities,statuses_count,has_extended_profile,followers_count,friends_count,favourites_count
0,0,904662446,d,Luz Marina Tarazona,Luzbendecida,Maracaibo Venezuela,Bendecida y prosperada en Cristo Jesús 😚🤗,False,True,False,False,4718,True,312,1523,2359
1,1,23614054,d,becky lou,ruatwitt,Former USA,Wife of Carter and proud mom of Emmitt. Lover ...,False,False,False,False,5132,False,236,460,20456


In [142]:
democracy_df.count()

Unnamed: 0              15077
id                      15077
party_type              15077
name                    15075
screen_name             15077
location                11856
description             13796
protected               15077
geo_enabled             15077
verified                15077
entities                15077
statuses_count          15077
has_extended_profile    15077
followers_count         15077
friends_count           15077
favourites_count        15077
dtype: int64

In [143]:
# Republic basic info
republic_df.head(2)

Unnamed: 0.1,Unnamed: 0,id,party_type,name,screen_name,location,description,protected,geo_enabled,verified,entities,statuses_count,has_extended_profile,followers_count,friends_count,favourites_count
0,0,53240401,r,Erik Dobell,erikdobell,"Tennessee, USA",Gatlinburg based Mentalist. See more at - http...,False,True,False,True,20360.0,True,7557.0,7916.0,5908.0
1,1,33698579,r,SAB,HuntBeach12,,Seahawks and Huskies forever,False,False,False,False,6502.0,True,268.0,740.0,5318.0


In [144]:
republic_df.count()

Unnamed: 0              15042
id                      15040
party_type              15040
name                    15039
screen_name             15040
location                10833
description             13424
protected               15038
geo_enabled             15038
verified                15038
entities                15036
statuses_count          15036
has_extended_profile    15036
followers_count         15036
friends_count           15036
favourites_count        15036
dtype: int64

In [145]:
# Count users have no description
print(len(democracy_df[democracy_df['description'].isna().values==True]))
print(len(republic_df[republic_df['description'].isna().values==True]))

1281
1618


In [146]:
# Remove those users
democracy_df = democracy_df[democracy_df['description'].isna().values==False]
republic_df = republic_df[republic_df['description'].isna().values==False]

## Text Processing

In [151]:
# Punctuations and Stopwords
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%','-', '|', '•', \
 'i', '...', '/', '’', '||', '//', '‘', '+', '~', '-']
english_stopwords = stopwords.words('english')

In [152]:
# desc of d
desc_d = democracy_df['description'].tolist()
desc_d_tokenized = [word_tokenize(d) for d in desc_d]
# desc of r
desc_r = republic_df['description'].tolist()
desc_r_tokenized = [word_tokenize(d) for d in desc_r]

In [153]:
# Remove punctuations and stopwords
desc_d_filtered = [[word for word in document if not word in english_punctuations] for document in desc_d_tokenized]
desc_r_filtered = [[word for word in document if not word in english_punctuations] for document in desc_r_tokenized]
desc_d_filtered = [[word for word in document if not word in english_stopwords] for document in desc_d_filtered]
desc_r_filtered = [[word for word in document if not word in english_stopwords] for document in desc_r_filtered]

In [154]:
# Stemming
st = LancasterStemmer()
desc_d_stemmed = [[st.stem(word) for word in document] for document in desc_d_filtered]
desc_r_stemmed = [[st.stem(word) for word in document] for document in desc_r_filtered]

In [155]:
# Remove low frequncy words
all_stems_d = sum(desc_d_stemmed, [])
all_stems_r = sum(desc_r_stemmed, [])
stems_once_d = set(stem for stem in set(all_stems_d) if all_stems_d.count(stem) == 1 or len(stem) < 4)
stems_once_r = set(stem for stem in set(all_stems_r) if all_stems_r.count(stem) == 1 or len(stem) < 4)
texts_d = [[stem for stem in text if stem not in stems_once_d] for text in desc_d_stemmed]
texts_r = [[stem for stem in text if stem not in stems_once_r] for text in desc_r_stemmed]

In [158]:
# Form corpora
dic_d = corpora.Dictionary(texts_d)
dic_r = corpora.Dictionary(texts_r)

In [159]:
# Save model
dic_d.save('./dic_d.plk')
dic_r.save('./dic_r.pkl')

In [166]:
# Get tf
# top100_d_id = sorted(dic_d.dfs.items(), key=lambda x: x[1], reverse=True)[:100]
# top100_r_id = sorted(dic_r.dfs.items(), key=lambda x: x[1], reverse=True)[:100]
# top100_d = []
# top100_r = []
# for i in top100_d_id:
#     top100_d.append(list(dic_d.token2id)[i[0]])
# for i in top100_r_id:
#     top100_r.append(list(dic_r.token2id)[i[0]])

In [161]:
# Tfidf
dic_d_corpus = [dic_d.doc2bow(text) for text in desc_d_stemmed]
tfidf_d = models.TfidfModel(dic_d_corpus)
dic_r_corpus = [dic_r.doc2bow(text) for text in desc_r_stemmed]
tfidf_r = models.TfidfModel(dic_r_corpus)

In [162]:
# Save model
tfidf_d.save('./tfidf_d.pkl')
tfidf_r.save('./tfidf_r.pkl')

In [163]:
res_d = {}
res_r = {}

In [164]:
for d in texts_d:
    weights = tfidf_d[dic_d.doc2bow(d)]
    weights = sorted(weights, key=lambda x: x[1], reverse=True)[:3]
    for item in weights:
        if item[0] in res_d.keys():
            res_d[item[0]] += 1
        else:
            res_d[item[0]] = 1
            
for d in texts_r:
    weights = tfidf_r[dic_r.doc2bow(d)]
    weights = sorted(weights, key=lambda x: x[1], reverse=True)[:3]
    for item in weights:
        if item[0] in res_r.keys():
            res_r[item[0]] += 1
        else:
            res_r[item[0]] = 1

In [165]:
sorted_weights_d = sorted(res_d.items(), key=lambda x: x[1], reverse=True)
sorted_weights_r = sorted(res_r.items(), key=lambda x: x[1], reverse=True)

In [167]:
top100_d = []
top100_r = []

In [168]:
for item in sorted_weights_d[:100]:
    top100_d.append(dic_d[item[0]])
for item in sorted_weights_r[:100]:
    top100_r.append(dic_r[item[0]])

In [169]:
# Save most popular words
with open('top100_d.txt', 'w') as f:
    f.write(str(top100_d))
    f.close()
with open('top100_r.txt', 'w') as f:
    f.write(str(top100_r))
    f.close()

## Put Democracy & Republic Together

In [174]:
total_df = pd.concat([democracy_df, republic_df])

In [175]:
total_df = total_df[total_df['description'].isna().values==False]

In [176]:
# Punctuations and Stopwords
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%','-', '|', '•', \
 'i', '...', '/', '’', '||', '//', '‘', '+', '~', '-']
english_stopwords = stopwords.words('english')

In [177]:
# desc of total
desc_t = total_df['description'].tolist()
desc_t_tokenized = [word_tokenize(d) for d in desc_t]

In [178]:
# Remove punctuations and stopwords
desc_t_filtered = [[word for word in document if not word in english_punctuations] for document in desc_t_tokenized]
desc_t_filtered = [[word for word in document if not word in english_stopwords] for document in desc_t_filtered]

In [179]:
# Stemming
st = LancasterStemmer()
desc_t_stemmed = [[st.stem(word) for word in document] for document in desc_t_filtered]

In [180]:
# Remove low frequncy words
all_stems_t = sum(desc_t_stemmed, [])
stems_once_t = set(stem for stem in set(all_stems_t) if all_stems_t.count(stem) == 1 or len(stem) < 4)
texts_t = [[stem for stem in text if stem not in stems_once_t] for text in desc_t_stemmed]

In [181]:
# Form corpora
dic_t = corpora.Dictionary(texts_t)
# Save model
dic_t.save('./dic_t.plk')

In [182]:
# Tfidf
dic_t_corpus = [dic_t.doc2bow(text) for text in desc_t_stemmed]
tfidf = models.TfidfModel(dic_t_corpus)

In [183]:
# Save model
tfidf.save('./tfidf_t.pkl')

In [184]:
res = {}

In [185]:
for d in texts_t:
    weights = tfidf[dic_t.doc2bow(d)]
    weights = sorted(weights, key=lambda x: x[1], reverse=True)[:3]
    for item in weights:
        if item[0] in res.keys():
            res[item[0]] += 1
        else:
            res[item[0]] = 1

In [186]:
#res

In [187]:
sorted_weights = sorted(res.items(), key=lambda x: x[1], reverse=True)

In [188]:
top100 = []

In [189]:
for item in sorted_weights[:100]:
    top100.append(dic_t[item[0]])

In [190]:
top100

['https',
 'trump',
 'resist',
 'follow',
 'support',
 'trump2020',
 'proud',
 'polit',
 'wwg1wga',
 'presid',
 'americ',
 'patriot',
 'famy',
 'country',
 '2020',
 'christian',
 'conserv',
 'democr',
 'just',
 'happy',
 'real',
 'believ',
 'twit',
 'moth',
 'bless',
 'grandmoth',
 'good',
 'work',
 'account',
 'constitut',
 'republ',
 'writ',
 'husband',
 'know',
 'world',
 'right',
 'first',
 'journ',
 'stand',
 'kag2020',
 'thing',
 'peopl',
 'back',
 'qanon',
 'army',
 'marry',
 'depl',
 'think',
 'design',
 'retir',
 'friend',
 'stat',
 'busy',
 'fath',
 'best',
 'univers',
 'memb',
 'freedom',
 'everyth',
 'dream',
 'pleas',
 'want',
 'hard',
 'fight',
 'unit',
 'alway',
 'beauty',
 'anim',
 'direct',
 'govern',
 'keep',
 'americafirst',
 'grandm',
 'lord',
 'market',
 'donald',
 'opin',
 'person',
 'year',
 'look',
 'draintheswamp',
 'profess',
 'entrepr',
 'technolog',
 'produc',
 'prol',
 'nurs',
 'trumptrain',
 'buildthewal',
 'gmail.com',
 'form',
 'retweet',
 'tweet',
 'chr

In [191]:
with open('./top100_t.txt', 'w') as f:
    f.write(str(top100))
    f.close()