In [60]:
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import word_tokenize

## Load and Preview

In [5]:
# Read the two per-party user tables from the CSV files next to the notebook
democracy_df, republic_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./democracy.csv', './republic.csv')
)

In [13]:
# Preview the first two rows of the democracy table
democracy_df.iloc[:2]

Unnamed: 0.1,Unnamed: 0,id,party_type,name,screen_name,location,description,protected,geo_enabled,verified,entities,statuses_count,has_extended_profile,followers_count,friends_count,favourites_count
0,0,904662446,d,Luz Marina Tarazona,Luzbendecida,Maracaibo Venezuela,Bendecida y prosperada en Cristo Jesús 😚🤗,False,True,False,False,4718,True,312,1523,2359
1,1,23614054,d,becky lou,ruatwitt,Former USA,Wife of Carter and proud mom of Emmitt. Lover ...,False,False,False,False,5132,False,236,460,20456


In [14]:
# Non-null entries per column of the democracy table
democracy_df.notna().sum()

Unnamed: 0              15077
id                      15077
party_type              15077
name                    15075
screen_name             15077
location                11856
description             13796
protected               15077
geo_enabled             15077
verified                15077
entities                15077
statuses_count          15077
has_extended_profile    15077
followers_count         15077
friends_count           15077
favourites_count        15077
dtype: int64

In [15]:
# Preview the first two rows of the republic table
republic_df.iloc[:2]

Unnamed: 0.1,Unnamed: 0,id,party_type,name,screen_name,location,description,protected,geo_enabled,verified,entities,statuses_count,has_extended_profile,followers_count,friends_count,favourites_count
0,0,53240401,r,Erik Dobell,erikdobell,"Tennessee, USA",Gatlinburg based Mentalist. See more at - http...,False,True,False,True,20360.0,True,7557.0,7916.0,5908.0
1,1,33698579,r,SAB,HuntBeach12,,Seahawks and Huskies forever,False,False,False,False,6502.0,True,268.0,740.0,5318.0


In [16]:
# Non-null entries per column of the republic table
republic_df.notna().sum()

Unnamed: 0              15042
id                      15040
party_type              15040
name                    15039
screen_name             15040
location                10833
description             13424
protected               15038
geo_enabled             15038
verified                15038
entities                15036
statuses_count          15036
has_extended_profile    15036
followers_count         15036
friends_count           15036
favourites_count        15036
dtype: int64

In [41]:
# Count the users that have no description (democracy first, then republic)
for party_df in (democracy_df, republic_df):
    print(party_df['description'].isna().sum())

1281
1618


In [42]:
# Keep only the users whose description is present
democracy_df = democracy_df.dropna(subset=['description'])
republic_df = republic_df.dropna(subset=['description'])

## Text Processing

In [43]:
# Tokens to discard: single-character punctuation marks and NLTK's English stopwords
english_punctuations = list(',.:;?()[]&!*@#$%-')
english_stopwords = stopwords.words('english')

In [46]:
# Tokenize every description, producing one token list per user.
# Democracy descriptions
desc_d = democracy_df['description'].tolist()
desc_d_tokenized = list(map(word_tokenize, desc_d))
# Republic descriptions
desc_r = republic_df['description'].tolist()
desc_r_tokenized = list(map(word_tokenize, desc_r))

In [51]:
# Remove punctuation and stopwords in a single pass per party.
# NLTK's English stopword list is lower-case, so each token is lower-cased for
# the stopword comparison; a case-sensitive test lets capitalised stopwords
# such as "The", "I" or "My" slip through.
# Sets give constant-time membership tests instead of scanning the lists.
punctuation_set = set(english_punctuations)
stopword_set = set(english_stopwords)
desc_d_filtered = [
    [word for word in document
     if word not in punctuation_set and word.lower() not in stopword_set]
    for document in desc_d_tokenized
]
desc_r_filtered = [
    [word for word in document
     if word not in punctuation_set and word.lower() not in stopword_set]
    for document in desc_r_tokenized
]

In [59]:
# Reduce every remaining token to its Lancaster stem, document by document
st = LancasterStemmer()
desc_d_stemmed = [list(map(st.stem, document)) for document in desc_d_filtered]
desc_r_stemmed = [list(map(st.stem, document)) for document in desc_r_filtered]

In [65]:
# Remove low-frequency words: stems that occur exactly once in a party's corpus.
# The flat stem lists are built with a comprehension and counted once with
# Counter; `sum(lists, [])` and calling `list.count` for every unique stem are
# both quadratic in the corpus size and give the same result.
all_stems_d = [stem for document in desc_d_stemmed for stem in document]
all_stems_r = [stem for document in desc_r_stemmed for stem in document]
stems_once_d = {stem for stem, freq in Counter(all_stems_d).items() if freq == 1}
stems_once_r = {stem for stem, freq in Counter(all_stems_r).items() if freq == 1}
texts_d = [[stem for stem in text if stem not in stems_once_d] for text in desc_d_stemmed]
texts_r = [[stem for stem in text if stem not in stems_once_r] for text in desc_r_stemmed]

In [67]:
# Build the token <-> id dictionaries.
# Use the texts with the one-off stems removed (texts_d / texts_r); building
# from desc_*_stemmed would leave the low-frequency filtering without effect.
dic_d = corpora.Dictionary(texts_d)
dic_r = corpora.Dictionary(texts_r)

In [68]:
# Save the dictionaries to disk.
# NOTE(review): the democracy file was previously written as './dic_d.plk'
# (typo); any code that loads the old name must be updated to '.pkl'.
dic_d.save('./dic_d.pkl')
dic_r.save('./dic_r.pkl')

In [94]:
# Get the 100 stems with the highest document frequency per party.
# `dfs` maps token id -> number of documents containing the token, so this is
# a document frequency, not a term frequency.
top100_d_id = sorted(dic_d.dfs.items(), key=lambda x: x[1], reverse=True)[:100]
top100_r_id = sorted(dic_r.dfs.items(), key=lambda x: x[1], reverse=True)[:100]
# Resolve ids through the dictionary itself (`dic[token_id]`) rather than by
# position in `list(dic.token2id)`, which rebuilds the key list on every
# iteration and only works while insertion order matches the id sequence.
top100_d = [dic_d[token_id] for token_id, _ in top100_d_id]
top100_r = [dic_r[token_id] for token_id, _ in top100_r_id]

In [97]:
# Save the most popular stems of each party as the text form of a Python list.
# The `with` block closes the file on exit, so no explicit close() is needed.
# UTF-8 is requested explicitly so non-ASCII stems do not fail on platforms
# whose default encoding cannot represent them.
with open('top100_d.txt', 'w', encoding='utf-8') as f:
    f.write(str(top100_d))
with open('top100_r.txt', 'w', encoding='utf-8') as f:
    f.write(str(top100_r))