<a href="https://colab.research.google.com/github/MartaCampagnoli/HateSpeechDetection/blob/main/No%20Output%20Notebooks/WordClouds_KeyBert_BertTopic_English_NoOutput.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install keybert
!pip install keyphrase-vectorizers
!pip install bertopic

In [None]:
from bertopic import BERTopic
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
from google.colab import files
import io
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseTfidfVectorizer
import nltk
nltk.download('punkt')
# Recent NLTK releases read the sentence tokenizer data from the `punkt_tab`
# resource instead of `punkt`; fetch both so sent_tokenize works either way.
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

In [None]:
# Prompt for the input file; the lookup below expects it to be named cleandata.csv.
uploaded = files.upload()
# Fail with an actionable message (listing what was actually uploaded) rather
# than a bare KeyError when the file has a different name.
if 'cleandata.csv' not in uploaded:
  raise KeyError(f"Expected an upload named 'cleandata.csv', got: {sorted(uploaded)}")
df = pd.read_csv(io.BytesIO(uploaded['cleandata.csv']))

# WordClouds

In [None]:
# Stop-word set used by wordcloud50: the wordcloud package defaults plus a bare "s".
stopwords = set(STOPWORDS) | {"s"}

def wordcloud50(column, max_words=50):
  """Build a word-cloud image from a column of texts.

  Args:
    column: object exposing ``.values`` (the callers pass a pandas Series of
      texts). Missing entries are skipped and non-string entries are
      converted with ``str`` so the join below cannot raise ``TypeError``.
    max_words: maximum number of words drawn in the cloud (default 50).

  Returns:
    The image produced by ``WordCloud.to_image()``.
  """
  # Drop NaN/None cells and coerce the rest to str before concatenating.
  texts = [str(text) for text in column.values if not pd.isna(text)]
  long_string = ','.join(texts)
  # Uses the module-level `stopwords` set defined alongside this function.
  wordcloud = WordCloud(
    background_color="black",
    max_words=max_words,
    contour_width=3,
    contour_color='steelblue',
    width=600,
    height=300,
    stopwords=stopwords,
  )
  wordcloud.generate(long_string)
  return wordcloud.to_image()

In [None]:
# Word cloud over the texts whose label is 'hate'.
wordcloud50(df.loc[df['label'] == 'hate', 'text'])

In [None]:
# Word cloud over the texts whose label is 'nothate'.
wordcloud50(df.loc[df['label'] == 'nothate', 'text'])

In [None]:
# Word cloud over the texts whose target is 'wom'.
wordcloud50(df.loc[df['target'] == 'wom', 'text'])

In [None]:
# Word cloud over the texts whose target is 'bla'.
wordcloud50(df.loc[df['target'] == 'bla', 'text'])

In [None]:
# Word cloud over the texts whose target is 'jew'.
wordcloud50(df.loc[df['target'] == 'jew', 'text'])

In [None]:
# Word cloud over the texts whose target is 'mus'.
wordcloud50(df.loc[df['target'] == 'mus', 'text'])

In [None]:
# Word cloud over the texts whose target is 'trans'.
wordcloud50(df.loc[df['target'] == 'trans', 'text'])

In [None]:
# Word cloud over the texts whose target is 'gay'.
wordcloud50(df.loc[df['target'] == 'gay', 'text'])

# KeyBERT

In [None]:
def keybertextract(column):
  """Extract three sets of keywords from a column of texts with KeyBERT.

  The texts are concatenated into one comma-separated document and passed to
  the module-level ``kw_model`` three times.

  Args:
    column: object exposing ``.values`` (the callers pass a pandas Series of
      texts). Missing entries are skipped and non-string entries are
      converted with ``str`` so the join below cannot raise ``TypeError``.

  Returns:
    A tuple ``(keywords, bigrams, tfidf)`` holding the results of
    ``extract_keywords`` with n-gram range (1, 1), with n-gram range (1, 2),
    and with a ``KeyphraseTfidfVectorizer`` respectively.
  """
  # Drop NaN/None cells and coerce the rest to str before concatenating.
  texts = [str(text) for text in column.values if not pd.isna(text)]
  long_string = ','.join(texts)
  # NOTE(review): the whole column is embedded as a single document; confirm
  # the embedding model's maximum input length does not truncate it.
  keywords = kw_model.extract_keywords(long_string, keyphrase_ngram_range=(1, 1))
  bigrams = kw_model.extract_keywords(long_string, keyphrase_ngram_range=(1, 2))
  tfidf = kw_model.extract_keywords(long_string, vectorizer=KeyphraseTfidfVectorizer())
  return keywords, bigrams, tfidf

In [None]:
# KeyBERT instance built with the library's default settings; keybertextract
# reads this module-level name, so this cell must run before it is called.
kw_model = KeyBERT()

In [None]:
# KeyBERT keywords for the texts whose label is 'hate'.
hatekeywords, hatebigrams, tfidfhate = keybertextract(df.loc[df['label'] == 'hate', 'text'])
print("Unigram Keywords:", hatekeywords)
print("Bigram Keywords:", hatebigrams)
print("TfIdf Keywords:", tfidfhate)

In [None]:
# KeyBERT keywords for the texts whose label is 'nothate'.
nohatekeywords, nohatebigrams, tfidfnohate = keybertextract(df.loc[df['label'] == 'nothate', 'text'])
print("Unigram Keywords:", nohatekeywords)
print("Bigram Keywords:", nohatebigrams)
print("TfIdf Keywords:", tfidfnohate)

In [None]:
# KeyBERT keywords for the texts whose target is 'wom'.
womankeywords, womanbigrams, tfidfwom = keybertextract(df.loc[df['target'] == 'wom', 'text'])
print("Unigram Keywords:", womankeywords)
print("Bigram Keywords:", womanbigrams)
print("TfIdf Keywords:", tfidfwom)

In [None]:
# KeyBERT keywords for the texts whose target is 'bla'.
blackpeoplekeywords, blackpeoplebigram, tfidfblackpeople = keybertextract(df.loc[df['target'] == 'bla', 'text'])
print("Unigram Keywords:", blackpeoplekeywords)
print("Bigram Keywords:", blackpeoplebigram)
print("TfIdf Keywords:", tfidfblackpeople)

In [None]:
# KeyBERT keywords for the texts whose target is 'jew'.
jewishpeoplekeywords, jewishpeoplebigram, tfidfjewish = keybertextract(df.loc[df['target'] == 'jew', 'text'])
print("Unigram Keywords:", jewishpeoplekeywords)
print("Bigram Keywords:", jewishpeoplebigram)
print("TfIdf Keywords:", tfidfjewish)

In [None]:
# KeyBERT keywords for the texts whose target is 'mus'.
muslimpeoplekeywords, muslimpeoplebigram, tfidfmuslim = keybertextract(df.loc[df['target'] == 'mus', 'text'])
print("Unigram Keywords:", muslimpeoplekeywords)
print("Bigram Keywords:", muslimpeoplebigram)
print("TfIdf Keywords:", tfidfmuslim)

In [None]:
# KeyBERT keywords for the texts whose target is 'trans'.
transpeoplekeywords, transpeoplebigram, tfidftranspeople = keybertextract(df.loc[df['target'] == 'trans', 'text'])
print("Unigram Keywords:", transpeoplekeywords)
print("Bigram Keywords:", transpeoplebigram)
print("TfIdf Keywords:", tfidftranspeople)

In [None]:
# KeyBERT keywords for the texts whose target is 'gay'.
gaypeoplekeywords, gaypeoplebigram, tfidfgaypeople = keybertextract(df.loc[df['target'] == 'gay', 'text'])
print("Unigram Keywords:", gaypeoplekeywords)
print("Bigram Keywords:", gaypeoplebigram)
print("TfIdf Keywords:", tfidfgaypeople)

# BERTopic

In [None]:
# `hate` was not defined anywhere in this notebook, so this cell failed on a
# fresh kernel. NOTE(review): presumably it is the hate-labelled subset of
# `df` — confirm against how hatenew.csv was originally produced.
hate = df[df['label'] == 'hate']
# Resample proportionally to target class: draw 20% of the rows of every
# target group. The fixed seed makes the draw repeatable. (The result was
# saved locally.)
hatenew = hate.groupby('target').sample(frac=0.2, random_state=42)

In [None]:
# Prompt for the resampled file; the lookup below expects it to be named hatenew.csv.
uploaded = files.upload()
# Fail with an actionable message (listing what was actually uploaded) rather
# than a bare KeyError when the file has a different name.
if 'hatenew.csv' not in uploaded:
  raise KeyError(f"Expected an upload named 'hatenew.csv', got: {sorted(uploaded)}")
hatenew = pd.read_csv(io.BytesIO(uploaded['hatenew.csv']))

In [None]:
# Plain Python list of the texts in hatenew, in row order.
data = hatenew['text'].tolist()

In [None]:
# Split every text into sentences and flatten them into one list of documents.
sentences = [sentence for piece in data for sentence in sent_tokenize(piece)]

# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(sentences, show_progress_bar=True)

# Dimensionality-reduction model handed to BERTopic.
umap_model = UMAP(
  n_neighbors=15,
  n_components=5,
  min_dist=0.0,
  metric='cosine',
  random_state=42,
)

# Clustering model handed to BERTopic.
hdbscan_model = HDBSCAN(
  min_cluster_size=50,
  metric='euclidean',
  cluster_selection_method='eom',
  prediction_data=True,
)

In [None]:
# Assemble the topic model from the components prepared above.
topic_model_new = BERTopic(
  # Hyperparameters
  top_n_words=10,
  verbose=True,
  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=KeyphraseTfidfVectorizer(),
)

# Train model, reusing the pre-computed sentence embeddings.
topicsnew, probsnew = topic_model_new.fit_transform(documents=sentences, embeddings=embeddings)

In [None]:
# Show the fitted model's topic overview (displayed as this cell's output).
topic_model_new.get_topic_info()

In [None]:
# Generate one-word labels (no topic-number prefix) and register them on the
# model as its custom labels.
topic_labels = topic_model_new.generate_topic_labels(
  nr_words=1,
  topic_prefix=False,
  word_length=10,
  separator=", ",
)
topic_model_new.set_topic_labels(topic_labels)

In [None]:
# Bar charts of the top words for the first 14 topics, titled with the custom labels.
topic_model_new.visualize_barchart(
  top_n_topics=14,
  n_words=10,
  custom_labels=True,
  width=300,
  height=300,
)

In [None]:
# Render the model's topic visualisation with default settings.
topic_model_new.visualize_topics()

In [None]:
# Look up the five topics closest to the query "woman", then show the first one returned.
similar_topics, similarity = topic_model_new.find_topics(search_term="woman", top_n=5)
topic_model_new.get_topic(topic=similar_topics[0])

In [None]:
# Persist the fitted model under the name "my_model_3" using pickle serialization.
# NOTE(review): pickle files can execute arbitrary code when loaded — only
# load this artefact from a trusted source.
topic_model_new.save("my_model_3", serialization="pickle")