# Textual Analysis

This analysis focuses on toxic comments, i.e. comments whose "toxicity" feature has a value >= 0.7.
<br>
<br>
A word cloud extracts the most frequent words from these comments, while TF-IDF identifies the most relevant ones.

In [None]:
import pandas as pd

#WordCloud
import nltk
from nltk.corpus import stopwords
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
nltk.download("popular")
stop_words = stopwords.words('english')

#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
#csv file
df = pd.read_csv("../.csv")

Definition of toxic comments

In [None]:

def toxic(x, threshold=0.7):
    """Convert a toxicity score into a binary toxic / non-toxic flag.

    Parameters
    ----------
    x : float
        Toxicity score of a single comment.
    threshold : float, optional
        Inclusive cut-off at or above which a score counts as toxic.
        Defaults to 0.7, the value used throughout this analysis.

    Returns
    -------
    int
        1 if ``x >= threshold``, 0 otherwise. A NaN score compares as
        False and is therefore reported as 0.
    """
    return 1 if x >= threshold else 0
    
#derive the binary label from the raw score: 1 marks a toxic comment, 0 a non-toxic one
df["toxic"] = df["toxicity"].map(toxic)

<h2>WordCloud</h2>

In [None]:
#keep only the comments flagged as toxic
is_toxic = df["toxic"] == 1
df_tox = df[is_toxic]

In [None]:
words_wc = []
#function that generates wordclouds, considering the top 150,000 toxic comments by default
def generate_word_cloud(col_name, n_docs=150000, min_weight=0.5):
    """Plot a word cloud of the toxic comments ranked highest on ``col_name``.

    The toxic comments in ``df_tox`` are sorted by ``col_name`` (descending),
    the text of the first ``n_docs`` rows is concatenated and lower-cased, and
    a word cloud is generated and displayed. As a side effect, every word
    whose weight in ``wc.words_`` is at least ``min_weight`` is appended to the
    module-level list ``words_wc``.

    Parameters
    ----------
    col_name : str
        Column of ``df_tox`` used to rank the comments; also used (with
        underscores replaced by spaces, title-cased) as the figure title.
    n_docs : int, optional
        Number of top-ranked comments to include. Defaults to 150000.
    min_weight : float, optional
        Minimum ``wc.words_`` weight for a word to be recorded in
        ``words_wc``. Defaults to 0.5.

    Returns
    -------
    None
    """
    ranked = df_tox.sort_values(by=col_name, ascending=False)
    # Take the top slice first, then drop missing texts: a NaN/None entry
    # would otherwise make str.join raise TypeError.
    top_texts = ranked["text"][:n_docs].dropna().astype(str)
    corpus = " ".join(top_texts).lower()
    wc = WordCloud(
        max_font_size = 100,
        max_words = 200,
        background_color = 'white',
        stopwords = stop_words,
        # font file is resolved relative to the working directory
        font_path = "16020_FUTURAM.ttf"
    ).generate(corpus)
    #biggest words
    words_wc.extend(word for word, weight in wc.words_.items() if weight >= min_weight)
    fig, ax = plt.subplots(1, 1, figsize = (10, 10))
    ax.set_title(f"{col_name.replace('_', ' ').title()}", fontsize = 20)
    ax.imshow(wc, interpolation = 'bilinear')
    ax.axis("off")
    plt.show()

In [None]:
generate_word_cloud("toxicity")

In [None]:
generate_word_cloud("severe_toxicity")

In [None]:
generate_word_cloud("obscene")

In [None]:
generate_word_cloud("threat")

In [None]:
generate_word_cloud("insult")

In [None]:
generate_word_cloud("identity_attack")

In [None]:
#set of toxic words
words_wc_set = set(words_wc)
len(words_wc_set)

<h2>TF-IDF</h2>

`toxicity` is the feature with the highest correlation values, so the dataset is sorted by it and the top toxic comments are taken.

In [None]:
words_tfidf = []
#Identification of relevant words using TF-IDF
def tf_idf(colname):
    documents = df_tox.sort_values(by=colname, ascending = False)
    documents_text = documents["text"][:150000]
    vectorizer = TfidfVectorizer(stop_words = "english")
    tfidf_matrix = vectorizer.fit_transform(documents_text)
    feature_names = vectorizer.get_feature_names_out()
    tfidf_array = tfidf_matrix.toarray()
    tfidf_df = pd.DataFrame(tfidf_array, columns=feature_names)
    mean_tfidf = tfidf_df.mean(axis=0)
    sorted_mean_tfidf = mean_tfidf.sort_values(ascending=False)
    top_terms = sorted_mean_tfidf.head(10)
    for k, v in top_terms.items():
        words_tfidf.append(k)

In [None]:
#score columns whose top-ranked toxic comments feed the TF-IDF vocabulary
toxicities = ["toxicity","obscene","insult","severe_toxicity","identity_attack","threat"]
for score_column in toxicities:
    tf_idf(score_column)

In [None]:
words_tfidf_set = set(words_tfidf)

In [None]:
words_union = words_tfidf_set | words_wc_set

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
#lemmatization of words
def lemmatize(w_set):
    """Return the set of lemmas of every token found in ``w_set``.

    Each entry of ``w_set`` is run through the module-level ``nlp`` pipeline;
    an entry that yields several tokens contributes one lemma per token.
    Duplicates collapse because the result is a set.
    """
    return {token.lemma_ for entry in w_set for token in nlp(entry)}
    
wc_lemmas = lemmatize(words_union)


In [None]:
#every word is inserted as binary feature (1 if the word is present in comment, 0 otherwise)
# Snapshot the text column and the existing column names before adding features,
# so a lemma that matches a column name (e.g. "text", "toxic", "insult")
# cannot overwrite original data.
comment_text = df["text"]
existing_columns = set(df.columns)
for word in sorted(wc_lemmas):
    if word in existing_columns:
        print(f"Skipping lemma {word!r}: a column with that name already exists")
        continue
    # regex=False: the lemma is matched literally, not as a regular expression.
    # case=False: the vocabulary was built from lower-cased text, the comments are raw.
    # na=False: a missing comment counts as "word absent" instead of breaking astype(int).
    # NOTE: this is still a substring match, so a short lemma also matches inside longer words.
    df[word] = comment_text.str.contains(word, case=False, regex=False, na=False).astype(int)

In [None]:
#export dataset
#df.to_csv("../file_csv/PIANO_comments.csv", index=False)