<a href="https://colab.research.google.com/github/MartaCampagnoli/HateSpeechDetection/blob/main/No%20Output%20Notebooks/KeyBert_BertTopic_French_NoOutput.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install keybert
!pip install keyphrase-vectorizers
!pip install bertopic

In [None]:
from bertopic import BERTopic
import pandas as pd
from wordcloud import WordCloud
from google.colab import files
import io
import re
import string
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseTfidfVectorizer
import nltk
# NLTK 3.8.2+ loads the Punkt tokenizer from the 'punkt_tab' resource, while
# older releases look for 'punkt'. Fetch both so word_tokenize/sent_tokenize
# work with whichever NLTK version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK stop-word corpus, then keep its French word list;
# preprocess() filters tokens against `stop_words`.
nltk.download('stopwords')
stop_words = stopwords.words('french')

In [None]:
uploaded = files.upload()  # upload fr_dataset.csv -- the next cell reads it under exactly that name

In [None]:
# Wrap the uploaded content stored under 'fr_dataset.csv' in an in-memory
# buffer and parse it as CSV.
df = pd.read_csv(io.BytesIO(uploaded['fr_dataset.csv']))

In [None]:
def clean_tweet(text):
    """Strip Twitter-specific noise from a raw tweet while keeping its words intact.

    Removed, in order: URLs, @mentions and '#' signs, HTML-like tags,
    square-bracketed spans, punctuation and other symbols (emoji included),
    and stand-alone retweet markers ("RT"/"rt"). Runs of whitespace are then
    collapsed to single spaces.

    Accented letters are preserved so French words stay readable and can
    still match accented stop words in later steps.

    Args:
        text (str): Raw tweet text.

    Returns:
        str: The cleaned tweet; may be the empty string.
    """
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Bracketed spans have to go before punctuation stripping; afterwards the
    # brackets no longer exist and the pattern could never match.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # Replace whatever symbols remain (emoji, typographic quotes, ...) with a
    # space. \w is Unicode-aware for str patterns, so accented letters survive.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Drop "RT"/"rt" only when it is a whole token; a bare substring match
    # would mangle words such as "sport" or "partir".
    text = re.sub(r'\b(?:RT|rt)\b', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def preprocess(text):
    """Lower-case a tweet, delete ASCII punctuation and drop stop words.

    Args:
        text (str): Tweet text.

    Returns:
        str: The tokens that are not in the module-level ``stop_words``,
        joined by single spaces.
    """
    # A translation table that deletes every character of string.punctuation.
    drop_punct = str.maketrans('', '', string.punctuation)
    stripped = text.lower().translate(drop_punct)

    kept = []
    for token in word_tokenize(stripped):
        if token not in stop_words:
            kept.append(token)
    return ' '.join(kept)


# astype(str) would turn a missing tweet into the literal string 'nan', which
# would then reach the models as a real token; blank missing values first.
df['tweet'] = (
    df['tweet']
    .fillna('')
    .astype(str)
    .apply(clean_tweet)   # strip URLs, mentions, tags, punctuation, RT markers
    .apply(preprocess)    # lower-case and remove stop words
)

# KeyBERT

In [None]:
def keybertextract(column):
  """Run three KeyBERT keyword extractions over a column of texts.

  Every value of ``column`` is joined with commas into one string, which is
  handed to the module-level ``kw_model`` three times: with an n-gram range
  of (1, 1), with an n-gram range of (1, 2), and with a fresh
  ``KeyphraseTfidfVectorizer`` as the vectorizer.

  Args:
      column: Object exposing ``.values`` that iterates over strings
          (the notebook passes pandas Series of tweets).

  Returns:
      tuple: ``(keywords, bigrams, tfidf)``, each being whatever
      ``kw_model.extract_keywords`` returned for that configuration.
  """
  corpus = ','.join(column.values)
  keywords, bigrams = [
      kw_model.extract_keywords(corpus, keyphrase_ngram_range=ngram_range)
      for ngram_range in ((1, 1), (1, 2))
  ]
  tfidf = kw_model.extract_keywords(corpus, vectorizer=KeyphraseTfidfVectorizer())
  return keywords, bigrams, tfidf

In [None]:
# KeyBERT() with no arguments loads an English sentence encoder. The tweets are
# French, so load the multilingual sentence-transformers model that the
# KeyBERT documentation recommends for non-English text.
kw_model = KeyBERT(model='paraphrase-multilingual-MiniLM-L12-v2')

In [None]:
# Keywords for the whole corpus (every row of the tweet column).
genkeywords, genbigrams, tfidfgen = keybertextract(df['tweet'])
print("Unigram Keywords:", genkeywords)
print("Bigram Keywords:", genbigrams)
print("TfIdf Keywords:", tfidfgen)

In [None]:
# Keywords restricted to rows whose group is 'individual'.
individual_tweets = df.loc[df['group'] == 'individual', 'tweet']
indkeywords, indbigram, tfidfind = keybertextract(individual_tweets)
print("Unigram Keywords:", indkeywords)
print("Bigram Keywords:", indbigram)
print("TfIdf Keywords:", tfidfind)

In [None]:
# Keywords restricted to rows whose group is 'other'.
other_tweets = df.loc[df['group'] == 'other', 'tweet']
othkeywords, othbigram, tfidfoth = keybertextract(other_tweets)
print("Unigram Keywords:", othkeywords)
print("Bigram Keywords:", othbigram)
print("TfIdf Keywords:", tfidfoth)

In [None]:
# Keywords restricted to rows whose group is 'african_descent'.
african_descent_tweets = df.loc[df['group'] == 'african_descent', 'tweet']
adkeywords, adbigram, tfidfad = keybertextract(african_descent_tweets)
print("Unigram Keywords:", adkeywords)
print("Bigram Keywords:", adbigram)
print("TfIdf Keywords:", tfidfad)

In [None]:
# Keywords restricted to rows whose group is 'arabs'.
arabs_tweets = df.loc[df['group'] == 'arabs', 'tweet']
arkeywords, arbigram, tfidfar = keybertextract(arabs_tweets)
print("Unigram Keywords:", arkeywords)
print("Bigram Keywords:", arbigram)
print("TfIdf Keywords:", tfidfar)

# BERTopic: suggested pipeline

In [None]:
# Plain Python list of the tweet column, in row order.
data = df['tweet'].tolist()

In [None]:
# Split every tweet into sentences and flatten the result into one list of
# documents (a tweet that yields no sentence contributes nothing).
sentences = [sentence for piece in data for sentence in sent_tokenize(piece)]

# Pre-calculate embeddings.
# The tweets are French, so encode them with a multilingual sentence-transformers
# model rather than the English-only all-MiniLM-L6-v2.
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
embeddings = embedding_model.encode(sentences, show_progress_bar=True)

# Dimensionality reduction and clustering components for BERTopic;
# random_state is fixed so the UMAP step is repeatable between runs.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [None]:
# Assemble the topic model from the components prepared in the previous cell.
# NOTE(review): BERTopic's docs describe `language` as unused when an explicit
# embedding_model is supplied -- confirm whether it has any effect here.
topic_model_new = BERTopic(
  # Hyperparameters
  language='french',
  top_n_words=10,
  verbose=True,

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=KeyphraseTfidfVectorizer(),
)

# Train the model, passing the embeddings computed earlier alongside the sentences.
topicsnew, probsnew = topic_model_new.fit_transform(sentences, embeddings=embeddings)

In [None]:
# Show topics: display what get_topic_info() reports for the fitted model.
topic_model_new.get_topic_info()

In [None]:
# Generate one-word labels (no topic-number prefix, words capped at 10
# characters) and register them on the model as its custom topic labels.
topic_labels = topic_model_new.generate_topic_labels(
    nr_words=1,
    topic_prefix=False,
    word_length=10,
    separator=", ",
)
topic_model_new.set_topic_labels(topic_labels)

In [None]:
# Bar charts for the first 11 topics, 8 words each, titled with the custom labels.
topic_model_new.visualize_barchart(
    top_n_topics=11,
    n_words=8,
    custom_labels=True,
    width=500,
    height=500,
)

In [None]:
# Render the model's topic visualisation as the cell output.
topic_model_new.visualize_topics()

In [None]:
# Ask the model for up to 5 topics matching the query term, then show the
# words of the first topic returned.
# NOTE(review): the query is English while the stop-word list and dataset name
# indicate French tweets -- confirm "woman" is the intended search term.
similar_topics, similarity = topic_model_new.find_topics("woman", top_n=5)
topic_model_new.get_topic(similar_topics[0])