In [None]:
import heapq
import os
import pickle

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from wordcloud import WordCloud
#nltk.download()

In [None]:
# Shared NLP resources used by lemmatize() below.
wnl = WordNetLemmatizer()
# NLTK `words` corpus as a set for fast membership tests; lemmatize() drops
# any token/lemma that is not in this set.
correct_words = set(words.words())
# English stop words; lemmatize() skips these tokens entirely.
stop_words = set(stopwords.words('english'))

def get_pos_tag(nltk_tag):
    """Map a POS tag (as returned by ``nltk.pos_tag``) to a WordNet POS constant.

    The tag is matched on its leading letter: ``J`` -> adjective, ``V`` -> verb,
    ``N`` -> noun, ``R`` -> adverb. Any other tag yields ``None``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return None
    
def lemmatize(sentence):
    """Return a cleaned, lemmatized version of ``sentence`` as one string.

    The sentence is lower-cased, tokenized and POS-tagged. Stop words are
    dropped. Tokens whose tag maps to a WordNet POS are lemmatized with that
    POS; all other tokens are kept as-is. In both cases the resulting word is
    only kept if it is in ``correct_words``. Kept words are joined by single
    spaces (an empty string is returned when nothing survives).
    """
    kept = []
    for token, tag in pos_tag(word_tokenize(sentence.lower())):
        if token in stop_words:
            continue
        wordnet_pos = get_pos_tag(tag)
        if wordnet_pos is None:
            candidate = token
        else:
            candidate = wnl.lemmatize(token, pos=wordnet_pos)
        if candidate in correct_words:
            kept.append(candidate)
    return " ".join(kept)

# Quick sanity check of lemmatize() on a sample sentence; the returned string
# is shown as the cell output.
lemmatize("university hes the greatest, artist i have seen today")

In [None]:
# Load the corpus from a gzip-compressed pickle (presumably a DataFrame: later
# cells call df.iterrows() and read a 'documents' entry from each row).
# NOTE(security): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
df = pd.read_pickle('../data/reddit_corpus_balanced_filtered.gzip', compression='gzip')

In [None]:
# Theme -> list of subreddits belonging to that theme. Insertion order matters:
# later cells iterate this dict to line up rows of the TF-IDF matrix with themes.
theme_groups = {
    'SARS-CoV-2': [
        'r/CovidVaccinated', 'r/Masks4All', 'r/NoLockdownsNoMasks', 'r/EndTheLockdowns',
        'r/COVID19', 'r/COVID19positive', 'r/CoronavirusCanada', 'r/CoronavirusRecession',
        'r/CoronavirusUK', 'r/CoronavirusUS', 'r/Coronavirus', 'r/LockdownSkepticism',
        'r/NoNewNormal',
    ],
    'Vaccines': [
        'r/CovidVaccinated', 'r/VACCINES', 'r/vaxxhappened', 'r/AntiVaxxers', 'r/antivax',
        'r/TrueAntiVaccination', 'r/DebateVaccine', 'r/DebateVaccines',
    ],
    'Abortion': [
        'r/AskProchoice', 'r/prochoice', 'r/insaneprolife', 'r/prolife',
        'r/ProLifeLibertarians', 'r/Abortiondebate', 'r/abortion',
    ],
    'womens-and-mens-rights': [
        'r/Feminism', 'r/feminisms', 'r/RadicalFeminism', 'r/RadicalFeminismUSA',
        'r/MRActivism', 'r/MensRights', 'r/antifeminists', 'r/feminismformen',
        'r/masculism', 'r/GenderCritical', 'r/Egalitarianism',
    ],
    'Gun-control': [
        'r/Firearms', 'r/GunsAreCool', 'r/liberalgunowners', 'r/progun', 'r/guncontrol',
        'r/GunDebates', 'r/GunResearch', 'r/gunpolitics',
    ],
    'Climate-change': [
        'r/climateskeptics', 'r/GlobalClimateChange', 'r/climate', 'r/climatechange',
    ],
    '5G': ['r/5GDebate'],
    'general-political-debate': [
        'r/JoeBiden', 'r/LeftistsForMen', 'r/Liberal', 'r/LockdownCriticalLeft',
        'r/democrats', 'r/Conservative', 'r/ConservativesOnly', 'r/conservatives',
        'r/Republican', 'r/RepublicanValues', 'r/politics', 'r/uspolitics',
        'r/Impeach_Trump',
    ],
}

# Subreddit -> theme lookup. 'r/CovidVaccinated' is listed under two themes;
# the later theme ('Vaccines') overwrites the earlier one here.
inverse_theme_groups = {}
for theme, members in theme_groups.items():
    for sub in members:
        inverse_theme_groups[sub] = theme

In [None]:
# Gather the timestamp of every document in the corpus.
dates = []

for _, record in tqdm(df.iterrows(), total=len(df)):
    # Each entry of 'documents' unpacks into five fields; only the date is used.
    dates.extend(
        date.timestamp()
        for _doc_id, _text, date, _sub_reddit, _labels in record['documents']
    )

# Ascending array of all document timestamps.
times = np.array(sorted(dates))

In [None]:
# Span between the earliest and latest document timestamps (`times` is sorted
# ascending). Presumably in seconds, if `date.timestamp()` follows datetime
# semantics - TODO confirm.
times[-1] - times[0]

In [None]:
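# One-off cache write, kept commented out for reference. It presumably produced
# the 'lemma_post_dic.pkl' file that the next cell loads. The code that builds
# `sub_reddit_post_dic` is not visible in this notebook, so recreate it before
# re-enabling these lines.
# with open('lemma_post_dic.pkl', 'wb') as f:
#     pickle.dump(sub_reddit_post_dic, f)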

In [None]:
# Load the cached mapping that later cells index as
# sub_reddit_post_dic[subreddit] and join with spaces (i.e. an iterable of strings).
# NOTE(security): pickle.load can execute arbitrary code - only load trusted files.
with open('lemma_post_dic.pkl', 'rb') as f:
    sub_reddit_post_dic = pickle.load(f)

In [None]:
# One large text document per theme: all posts of every subreddit in the theme,
# joined by single spaces. The order of M follows the iteration order of theme_groups.
M = [
    " ".join(" ".join(sub_reddit_post_dic[sub]) for sub in members)
    for members in theme_groups.values()
]
len(M)

In [None]:
# Fit TF-IDF over the per-theme documents in M (one row of X per theme).
# Unigrams only; min_df=3 keeps only terms that occur in at least 3 of the
# theme documents; scikit-learn's built-in English stop-word list is removed.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=3, stop_words='english')
X = vectorizer.fit_transform(M)
# Vocabulary as an array so it can be boolean-masked per theme later.
X_words = np.array(vectorizer.get_feature_names_out())
print(len(X_words))
X

In [None]:
# Number of highest-weighted TF-IDF terms kept per theme.
k = 50

# theme -> {UPPER-CASED TERM: raw TF-IDF weight} for the k heaviest terms.
topic_words = {}

for ind, topic in enumerate(theme_groups):
    # Row `ind` of X belongs to this theme: M was built by iterating
    # theme_groups in the same order.
    row = X[ind].toarray()[0]
    nonzero = row != 0

    # Deliberately NOT named `words`: that name is the imported
    # nltk.corpus.words, and rebinding it here breaks re-running the setup
    # cell (`words.words()`) later in the same kernel.
    topic_vocab = X_words[nonzero]
    topic_weights = row[nonzero]

    # (position, weight) pairs of the k largest weights, heaviest first.
    top_k = heapq.nlargest(k, enumerate(topic_weights), key=lambda pair: pair[1])

    # Raw weights are stored, exactly as before. The previous version also
    # computed a sum-normalized copy of the weights but never used it, so that
    # dead computation has been removed.
    # Terms are upper-cased for a uniform appearance in the word clouds.
    topic_words[topic] = {
        topic_vocab[position].upper(): weight for position, weight in top_k
    }

In [None]:
# Render one word cloud per theme from its top-k term weights and save each
# figure as a PDF under ./categories/.

wc = WordCloud(width=1900,height=1000,relative_scaling=0.9,background_color='white',max_font_size = 2000)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('./categories', exist_ok=True)

for topic in theme_groups:
    cloud = wc.generate_from_frequencies(topic_words[topic])

    fig, ax = plt.subplots(figsize=(16, 9))
    ax.imshow(cloud)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(topic, fontsize=70)
    # Hide the axes frame. The earlier `plt.box = False` was an assignment, not
    # a call: it left the frame visible and replaced the pyplot `box` function
    # with `False` for the rest of the session.
    ax.set_frame_on(False)
    fig.savefig('./categories/' + topic + '_cloud.pdf', bbox_inches='tight')