In [None]:
import numpy as np
import pandas as pd
import json

from tqdm.auto import tqdm

In [None]:
from nltk.stem.snowball import ItalianStemmer
from nltk import RegexpTokenizer
import nltk
# Make sure the NLTK stopword corpus is available locally, then build the
# Italian stopword list used by process_text, extended with "così".
nltk.download("stopwords")
stopwords = nltk.corpus.stopwords.words("italian") + ["così"]

def process_text(text):
  """Tokenize, filter and stem a piece of Italian text.

  The text is split into ``\\w+`` tokens. A token is dropped when it is a
  stopword (compared case-insensitively), is shorter than 4 characters, or
  is purely numeric. Every surviving token is stemmed with ItalianStemmer.

  Parameters
  ----------
  text : str
      Raw text to process.

  Returns
  -------
  tuple (str, dict)
      The stems joined by single spaces (an empty string when nothing
      survives the filters), and a dictionary mapping each stem to the
      lower-cased form of the last token that produced it.
  """
  # Build the helpers once per call instead of once per token.
  tokenizer = RegexpTokenizer(r"\w+")
  stemmer = ItalianStemmer()
  stopword_set = set(stopwords)

  stems = []
  stem_word_dictionary = {}
  for token in tokenizer.tokenize(text):
    lowered = token.lower()
    # The stopword list is lower-case, so compare the lower-cased token:
    # otherwise capitalised stopwords (e.g. at the start of a sentence)
    # would pass the filter and end up in the stemmed output.
    if (lowered in stopword_set or len(token) < 4 or token.isnumeric()):
      continue
    stemmed = stemmer.stem(token)
    stem_word_dictionary[stemmed] = lowered
    stems.append(stemmed)
  return (" ".join(stems), stem_word_dictionary)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Read the eight tab-separated comment dumps (Comments_Data_1.csv ...
# Comments_Data_8.csv) and stack them into one frame with a fresh 0..n-1 index.
# All frames are concatenated in a single call: calling pd.concat inside the
# loop would re-copy the accumulated frame on every iteration.
comment_frames = [
  pd.read_csv("Comments_Data_" + str(index) + ".csv", sep = "\t", encoding = "utf-8", index_col = 0)
  for index in range(1, 9)
]
data = pd.concat(comment_frames, ignore_index = True)

# Drop the per-file frames so only the concatenated copy stays in memory.
del comment_frames

In [None]:
# Display the concatenated comments frame (the notebook renders a truncated preview).
data

Unnamed: 0,Author,Submission ID,Text,Hour,Minute,Second,ID,Top Level,Parent ID,#Upvotes
0,Lord_TheJc,f7txjb,"Chiudo il thread, andate su quello nuovo\n\n/r...",12,25,40,fijqzau,True,f7txjb,1
1,EMOSCAMBIO,f7txjb,>Ore 19.30 - Il presunto «paziente 0» negativo...,18,52,58,figab7t,True,f7txjb,83
2,drstins_n,f7txjb,> Si allontana da Codogno e torna ad Avellino:...,21,40,54,fih8gxg,True,f7txjb,77
3,IceTea666,f7txjb,Dogana slovenia/croazia. Il poliziotto croato ...,22,17,8,fihft9k,True,f7txjb,75
4,panoramegamix,f7txjb,ho dato una veloce occhiata sui social \n\nNor...,17,52,2,fifwfbh,True,f7txjb,122
...,...,...,...,...,...,...,...,...,...,...
154651,magicobito,ggqrld,Grazie dello sbattone :),7,47,22,fq92i1z,False,fq7g6n9,1
154652,Jeremia19,ggqrld,Tamponi. Fonte: l’han fatto al padre di un mio...,16,41,53,fq6cl9k,False,fq6ci40,1
154653,frost_burg,ggqrld,"Tamponi, mi dicono.",18,39,33,fq6pgpr,False,fq6ci40,1
154654,Jkal91,ggqrld,"Ma sono le quattro del pomeriggio, che vai a d...",14,25,22,fq5ync4,False,fq5xxtp,4


In [None]:
# Run process_text over every comment: collect the stemmed text of each row
# (in row order) and merge the per-comment stem -> word mappings into one
# global dictionary (later comments overwrite earlier entries for a stem).
tokenized_data = []
stem_word_dictionary = {}

# Iterate the "Text" column directly; data.iloc[index]["Text"] would build a
# full row Series for every single comment just to read one cell.
for raw_text in tqdm(data["Text"], total = len(data)):
  text, dictionary = process_text(raw_text)
  stem_word_dictionary.update(dictionary)
  tokenized_data.append(text)

HBox(children=(FloatProgress(value=0.0, max=154656.0), HTML(value='')))




In [None]:
# Attach the stemmed text as a new "Processed Text" column (this mutates
# `data` in place) and display the frame to check the result.
data["Processed Text"] = tokenized_data
data

Unnamed: 0,Author,Submission ID,Text,Hour,Minute,Second,ID,Top Level,Parent ID,#Upvotes,Processed Text
0,Lord_TheJc,f7txjb,"Chiudo il thread, andate su quello nuovo\n\n/r...",12,25,40,fijqzau,True,f7txjb,1,chiud thread andat nuov italy comments f87rbo ...
1,EMOSCAMBIO,f7txjb,>Ore 19.30 - Il presunto «paziente 0» negativo...,18,52,58,figab7t,True,f7txjb,83,presunt pazient negat test coronavirus presunt...
2,drstins_n,f7txjb,> Si allontana da Codogno e torna ad Avellino:...,21,40,54,fih8gxg,True,f7txjb,77,allontan codogn torn avellin famigl quaranten ...
3,IceTea666,f7txjb,Dogana slovenia/croazia. Il poliziotto croato ...,22,17,8,fihft9k,True,f7txjb,75,dogan sloven croaz poliziott cro dov controll ...
4,panoramegamix,f7txjb,ho dato una veloce occhiata sui social \n\nNor...,17,52,2,fifwfbh,True,f7txjb,122,dat veloc occhi social nord mor centr mor aiut...
...,...,...,...,...,...,...,...,...,...,...,...
154651,magicobito,ggqrld,Grazie dello sbattone :),7,47,22,fq92i1z,False,fq7g6n9,1,graz sbatton
154652,Jeremia19,ggqrld,Tamponi. Fonte: l’han fatto al padre di un mio...,16,41,53,fq6cl9k,False,fq6ci40,1,tampon font fatt padr amic stat rifer
154653,frost_burg,ggqrld,"Tamponi, mi dicono.",18,39,33,fq6pgpr,False,fq6ci40,1,tampon dic
154654,Jkal91,ggqrld,"Ma sono le quattro del pomeriggio, che vai a d...",14,25,22,fq5ync4,False,fq5xxtp,4,quattr pomerigg dorm ospedal


In [None]:
# Persist the frame, including the "Processed Text" column, as a
# tab-separated UTF-8 file.
data.to_csv("Comments_Data_Final.csv", encoding = "utf-8", sep = "\t")

In [None]:
# Store the global stem -> word mapping as JSON so the evaluation cells can
# reload it later.
with open("stem_word_dictionary.json", "w+") as output_handle:
  output_handle.write(json.dumps(stem_word_dictionary))

In [None]:
# Keep only the comments whose processed text splits into more than 4
# space-separated pieces.
processed_token_counts = data["Processed Text"].str.split(" ").str.len()
reduced_data = data[processed_token_counts > 4]

# Latent Dirichlet Allocation

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from joblib import dump, load

# Fit one bag-of-words vocabulary per corpus: first on the reduced comments,
# then on the full set of comments.
reduced_CT, CT = (
  CountVectorizer().fit(corpus)
  for corpus in (reduced_data["Processed Text"], data["Processed Text"])
)

In [None]:
# Serialize both fitted vectorizers so the evaluation section can reload them
# without refitting. The cell output is the return value of the last dump call.
dump(reduced_CT, "Reduced_Count_Vectorizer.joblib")
dump(CT, "Count_Vectorizer.joblib")

['Count_Vectorizer.joblib']

In [None]:
# Fixed seed so that re-running the notebook produces the same topic models.
LDA_RANDOM_STATE = 42

# The document-term matrices do not depend on the number of topics, so they
# are computed once here instead of on every loop iteration.
reduced_documents_words_matrix = reduced_CT.transform(reduced_data["Processed Text"])
documents_words_matrix = CT.transform(data["Processed Text"])

# Fit and save one LDA model per topic count (2..7) for each corpus.
for n_topics in tqdm(range(2, 8)):
  reduced_LDA = LatentDirichletAllocation(n_components = n_topics, random_state = LDA_RANDOM_STATE).fit(reduced_documents_words_matrix)
  LDA = LatentDirichletAllocation(n_components = n_topics, random_state = LDA_RANDOM_STATE).fit(documents_words_matrix)
  dump(reduced_LDA, "Reduced_LDA_" + str(n_topics) + ".joblib")
  dump(LDA, "LDA_" + str(n_topics) + ".joblib")

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




# Evaluation

In [None]:
import heapq
import time
import requests
from bs4 import BeautifulSoup as BS
import random

In [None]:
# Reload the stem -> word mapping written by the preprocessing section.
with open("stem_word_dictionary.json", "r") as dictionary_handle:
    stem_word_dict = json.loads(dictionary_handle.read())

In [None]:
def get_top_words(lda_model, topic, N, inverse_vocab, stem_word_dict):
    """Return the N highest-weighted words of one topic.

    Parameters:
        lda_model: object exposing ``components_``, indexable by topic, where
            each entry is the sequence of per-term weights of that topic.
        topic: index of the topic inside ``lda_model.components_``.
        N: number of words to return.
        inverse_vocab: mapping from term index to stem.
        stem_word_dict: mapping from stem to a full word.

    Returns:
        A tuple ``(top_words_base, top_words_stemmed)``: the full words and
        the corresponding stems, both ordered by decreasing weight.
    """
    topic_weights = lda_model.components_[topic]
    # Pairs (term index, weight), ranked by weight.
    ranked_terms = heapq.nlargest(N, enumerate(topic_weights), key = lambda pair : pair[1])
    top_words_stemmed = [inverse_vocab[term_index] for term_index, _ in ranked_terms]
    top_words_base = list(map(stem_word_dict.__getitem__, top_words_stemmed))
    return (top_words_base, top_words_stemmed)

In [None]:
def get_google_score(top_words_base, top_words_stemmed, max_start = 210, page_step = 10, delay = 3, timeout = 30):
    """Score a topic by how often its stems appear in Google result headings.

    The full words are sent as a Google search query, one request per result
    page. Every ``h3`` element of each returned page (presumably the result
    titles) is run through ``process_text``, and the score grows by one for
    every entry of ``top_words_stemmed`` found among the heading's stems.

    Parameters:
        top_words_base: full words used to build the search query.
        top_words_stemmed: stems looked up in the processed headings.
        max_start: exclusive upper bound of the ``start`` offset (default 210).
        page_step: increment of the ``start`` offset between requests
            (default 10).
        delay: seconds to sleep after each request (default 3).
        timeout: seconds after which a stalled HTTP request is aborted
            (default 30); without it a single stuck connection would block
            the whole evaluation indefinitely.

    Returns:
        The total number of (heading, stem) matches as an int.

    Raises:
        Whatever ``requests.get`` raises on connection errors or timeouts.
        HTTP error statuses are not checked: such a page simply contributes
        whatever ``h3`` elements it contains.
    """
    to_return = 0
    # Passing the query through `params` lets requests handle URL encoding.
    # A "+" in a raw query string stands for a space, so the space-joined
    # word list is the same query the hand-built "+w1+w2" string expressed.
    query = " ".join(top_words_base)
    for index in range(0, max_start, page_step):
        page = requests.get("https://google.com/search", params = {"q" : query, "start" : index}, timeout = timeout)
        parser = BS(page.content, "html.parser")
        for title in parser.find_all("h3"):
            tokens = set(process_text(title.text)[0].split(" "))
            to_return = to_return + sum(1 for word in top_words_stemmed if word in tokens)
        # Pause between requests.
        time.sleep(delay)
    return(to_return)

In [None]:
def get_scores(lda_model, N, inverse_vocab, stem_word_dict):
    """Compute the Google score of every topic of an LDA model.

    For each topic index of ``lda_model.components_`` the N top words are
    extracted with ``get_top_words`` and scored with ``get_google_score``.

    Returns:
        A list with one score per topic, in topic order.
    """
    def score_topic(topic_index):
        base_words, stems = get_top_words(lda_model, topic_index, N, inverse_vocab, stem_word_dict)
        return get_google_score(base_words, stems)

    topic_count = len(lda_model.components_)
    return [score_topic(topic_index) for topic_index in range(topic_count)]

In [None]:
# Reload the vectorizers saved by the training section. joblib's load
# unpickles the file, so only point it at files this notebook produced.
CT = load("Count_Vectorizer.joblib")
Reduced_CT = load("Reduced_Count_Vectorizer.joblib")

# Invert each vocabulary (term -> column index) into column index -> term so
# that positions in an LDA topic row can be mapped back to stems.
inverse_vocabulary = dict(zip(CT.vocabulary_.values(), CT.vocabulary_.keys()))
reduced_inverse_vocabulary = dict(zip(Reduced_CT.vocabulary_.values(), Reduced_CT.vocabulary_.keys()))

In [None]:
# Score the LDA models trained on the full corpus (2..7 topics) and save the
# per-topic Google scores, keyed by topic count.
to_save = {}

for index in tqdm(range(2, 8)):
    # Load the full-corpus model: its topic rows are indexed by the columns
    # of CT, which is the vocabulary inverse_vocabulary was built from. A
    # "Reduced_LDA_*" model would be decoded with the wrong vocabulary here
    # and yield unrelated top words.
    LDA = load("LDA_" + str(index) + ".joblib")
    to_save[index] = get_scores(LDA, 10, inverse_vocabulary, stem_word_dict)

with open("TITLES_eval.json", "w+") as file:
    json.dump(to_save, file)

In [None]:
# Score the LDA models trained on the reduced corpus (2..7 topics) and save
# the per-topic Google scores, keyed by topic count.
to_save = {}

for index in tqdm(range(2, 8)):
    # Load the reduced-corpus model: its topic rows are indexed by the columns
    # of Reduced_CT, which is the vocabulary reduced_inverse_vocabulary was
    # built from. A full-corpus "LDA_*" model has a different set of columns,
    # so its term indices would not line up with this vocabulary.
    LDA = load("Reduced_LDA_" + str(index) + ".joblib")
    to_save[index] = get_scores(LDA, 10, reduced_inverse_vocabulary, stem_word_dict)

with open("Reduced_TITLES_eval.json", "w+") as file:
    json.dump(to_save, file)

In [None]:
def get_quiz(lda_model, inverse_vocabulary, stem_to_word_dict, dictionary,
             n_top_words = 40, n_questions = 3, n_correct = 5):
  """Build word-intrusion style quiz questions for every topic of a model.

  For each topic, ``n_questions * n_correct`` distinct positions are sampled
  from the topic's ``n_top_words`` top words and split into ``n_questions``
  groups; one word sampled from ``dictionary`` is appended to each group as
  the intruder.

  Parameters:
    lda_model: object exposing ``components_`` (one entry per topic).
    inverse_vocabulary: mapping from term index to stem.
    stem_to_word_dict: mapping from stem to full word.
    dictionary: iterable of candidate intruder words. Any iterable of
      hashable items is accepted (a dict contributes its keys).
    n_top_words: size of the top-word pool per topic (default 40).
    n_questions: number of questions per topic (default 3).
    n_correct: number of genuine topic words per question (default 5).

  Returns:
    A dict mapping each topic index to a list of ``n_questions`` questions.
    Each question is a list of ``n_correct`` ``(word, "T")`` tuples followed
    by one ``(word, "F")`` tuple.

  Raises:
    ValueError: if the topic has fewer top words than are needed, or fewer
      than ``n_questions`` intruder candidates remain (raised by
      ``random.sample``).
  """
  to_return = {}

  for topic in range(len(lda_model.components_)):
    top_words, top_stems = get_top_words(lda_model, topic, n_top_words, inverse_vocabulary, stem_to_word_dict)
    correct_choices = random.sample(top_words, n_questions * n_correct)
    # An intruder must not be one of the topic's own top words (in either
    # full or stemmed form), otherwise a genuine topic word gets labelled "F".
    excluded = set(top_words) | set(top_stems)
    # Materialise a list: random.sample needs a sequence, not a dict or set.
    candidates = [word for word in dictionary if word not in excluded]
    wrong_choices = random.sample(candidates, n_questions)
    to_return[topic] = []
    for index in range(n_questions):
      to_append = [(word, "T") for word in correct_choices[index * n_correct : (index + 1) * n_correct]]
      to_append.append((wrong_choices[index], "F"))
      to_return[topic].append(to_append)

  return (to_return)