In [None]:
import heapq

import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

import nltk
#nltk.download()

In [None]:
# Module-level WordNet lemmatizer instance; lemmatize() below calls it for each token.
wnl = WordNetLemmatizer()

def get_pos_tag(nltk_tag):
    """Translate an NLTK part-of-speech tag into a WordNet POS constant.

    The decision is based on the tag's first letter only:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Any other tag returns None.
    """
    # (tag prefix, attribute name on `wordnet`) pairs, checked in order.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, attr)
    return None
    
def lemmatize(sentence):
    """Tokenize, POS-tag and lemmatize a sentence, returning a lowercase string.

    Tokens shorter than two characters and tokens containing 'www.' are
    dropped. Tokens whose POS tag has no WordNet equivalent (get_pos_tag
    returns None) are kept unchanged; all others are lemmatized with `wnl`
    using the mapped POS. The surviving tokens are joined with single spaces
    and the whole result is lowercased.
    """
    kept = []
    for token, tag in pos_tag(word_tokenize(sentence)):
        # Skip one-character tokens and anything that looks like a web address.
        if len(token) < 2 or 'www.' in token:
            continue
        wn_pos = get_pos_tag(tag)
        if wn_pos is None:
            kept.append(token)
        else:
            kept.append(wnl.lemmatize(token, pos=wn_pos))
    return " ".join(kept).lower()

# Quick check: run lemmatize() on a sample sentence; the returned string is the cell's output.
lemmatize("Hes the greatest artrist I have seen today")

In [None]:
# Load the corpus DataFrame from a gzip-compressed pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df = pd.read_pickle('../data/reddit_corpus_balanced_filtered.gzip', compression='gzip')

In [None]:
# Theme name -> list of subreddits assigned to that theme.
theme_groups = {
    'SARS-CoV-2': [
        'r/CovidVaccinated', 'r/Masks4All', 'r/NoLockdownsNoMasks',
        'r/EndTheLockdowns', 'r/COVID19', 'r/COVID19positive',
        'r/CoronavirusCanada', 'r/CoronavirusRecession', 'r/CoronavirusUK',
        'r/CoronavirusUS', 'r/Coronavirus', 'r/LockdownSkepticism',
        'r/NoNewNormal',
    ],
    'Vaccines': [
        'r/CovidVaccinated', 'r/VACCINES', 'r/vaxxhappened', 'r/AntiVaxxers',
        'r/antivax', 'r/TrueAntiVaccination', 'r/DebateVaccine',
        'r/DebateVaccines',
    ],
    'Abortion': [
        'r/AskProchoice', 'r/prochoice', 'r/insaneprolife', 'r/prolife',
        'r/ProLifeLibertarians', 'r/Abortiondebate', 'r/abortion',
    ],
    'womens-and-mens-rights': [
        'r/Feminism', 'r/feminisms', 'r/RadicalFeminism',
        'r/RadicalFeminismUSA', 'r/MRActivism', 'r/MensRights',
        'r/antifeminists', 'r/feminismformen', 'r/masculism',
        'r/GenderCritical', 'r/Egalitarianism',
    ],
    'Gun-control': [
        'r/Firearms', 'r/GunsAreCool', 'r/liberalgunowners', 'r/progun',
        'r/guncontrol', 'r/GunDebates', 'r/GunResearch', 'r/gunpolitics',
    ],
    'Climate-change': [
        'r/climateskeptics', 'r/GlobalClimateChange', 'r/climate',
        'r/climatechange',
    ],
    '5G': ['r/5GDebate'],
    'general-political-debate': [
        'r/JoeBiden', 'r/LeftistsForMen', 'r/Liberal',
        'r/LockdownCriticalLeft', 'r/democrats', 'r/Conservative',
        'r/ConservativesOnly', 'r/conservatives', 'r/Republican',
        'r/RepublicanValues', 'r/politics', 'r/uspolitics',
        'r/Impeach_Trump',
    ],
}

# Subreddit -> theme lookup. 'r/CovidVaccinated' is listed under both
# 'SARS-CoV-2' and 'Vaccines'; the later theme overwrites the earlier one here.
inverse_theme_groups = {}
for theme, subs in theme_groups.items():
    for sub in subs:
        inverse_theme_groups[sub] = theme

In [None]:
# Subreddit name -> list of post texts, collected from every row's 'documents' entry.
sub_reddit_post_dic = {}

for _, row in tqdm(df.iterrows(), total=len(df)):
    # Each document is unpacked as a 5-tuple; only the text and subreddit are used.
    for _doc_id, text, _date, sub_reddit, _labels in row['documents']:
        sub_reddit_post_dic.setdefault(sub_reddit, []).append(text)

In [None]:
# One large document per theme: all posts of all subreddits in the theme,
# joined with single spaces, in the iteration order of theme_groups.
M = [
    " ".join(" ".join(sub_reddit_post_dic[sub]) for sub in theme_groups[topic])
    for topic in theme_groups
]
len(M)

In [None]:
# Fit a unigram-only (ngram_range=(1, 1)) TF-IDF model on the per-theme documents in M.
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X = vectorizer.fit_transform(M)
# Keep the feature names as a NumPy array so the next cell can index them with a mask.
X_words = np.array(vectorizer.get_feature_names_out())

In [None]:
# Number of highest-weighted words to keep. `k` was not defined anywhere before
# this cell; 50 is a placeholder value — NOTE(review): set to the intended size.
k = 50

# Densify the first row of X (the first document in M) into a flat 1-D array.
# np.array() on a SciPy sparse row does not densify it, so the `!= 0` masks
# below would not work; .toarray() is required.
row_weights = X[0].toarray().ravel()
nonzero_mask = row_weights != 0
words = X_words[nonzero_mask]
arr = row_weights[nonzero_mask]

# (index into `arr`/`words`, weight) pairs of the k largest weights, descending.
top_k_ind = heapq.nlargest(k, enumerate(arr), key=lambda x: x[1])

# separate the weights and normalize them so they sum to 1
top_k_w = [i[1] for i in top_k_ind]
top_k_w = list(np.array(top_k_w)/sum(top_k_w))

temp = {'words': [], 'weights': []}
for (word_idx, _), weight in zip(top_k_ind, top_k_w):
    # transform to uppercase for uniform appearance
    temp['words'].append(words[word_idx].upper())

    # round to 5 digits to save space in export
    temp['weights'].append(float(f'{weight:.5f}'))

temp