<a href="https://colab.research.google.com/github/MartaCampagnoli/HateSpeechDetection/blob/main/Output%20Notebooks/KeyBert_BertTopic_French.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install keybert
!pip install keyphrase-vectorizers
!pip install bertopic

In [None]:
from bertopic import BERTopic
import pandas as pd
from wordcloud import WordCloud
from google.colab import files
import io
import re
import string
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseTfidfVectorizer
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
nltk.download('stopwords')
stop_words = stopwords.words('french')

In [None]:
uploaded = files.upload() #fr_dataset.csv

In [None]:
df = pd.read_csv(io.BytesIO(uploaded['fr_dataset.csv']))

In [None]:
def clean_tweet(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\[.*?\]', '', text)
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = re.sub(' RT ', '', text)
    text = re.sub('RT', '', text)
    text = re.sub('rt', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)


df['tweet'] = df['tweet'].astype(str).apply(clean_tweet)
df['tweet'] = df['tweet'].astype(str).apply(preprocess)

#Keybert

In [None]:
def keybertextract(column):
  long_string = ','.join(list(column.values))
  keywords = kw_model.extract_keywords(long_string, keyphrase_ngram_range=(1, 1))
  bigrams = kw_model.extract_keywords(long_string, keyphrase_ngram_range=(1, 2))
  tfidf = kw_model.extract_keywords(long_string, vectorizer=KeyphraseTfidfVectorizer())
  return keywords, bigrams, tfidf

In [None]:
kw_model = KeyBERT()

In [None]:
genkeywords, genbigrams, tfidfgen = keybertextract(df['tweet'])
print(f"Unigram Keywords:", genkeywords)
print(f"Bigram Keywords:", genbigrams)
print(f"TfIdf Keywords:", tfidfgen)

Unigram Keywords: [('mongolarrive', 0.4856), ('tunisiens', 0.4672), ('arabiesaoudite', 0.4595), ('tunisienne', 0.4473), ('arabesmerdes', 0.4452)]
Bigram Keywords: [('vraiment arabe', 0.5595), ('mongol vraiment', 0.5485), ('vraiment mongol', 0.5474), ('mongol entre', 0.5383), ('juste mongol', 0.532)]
TfIdf Keywords: [('rien voir juste probleme cadre mongol', 0.6123), ('oui pck tmavais pris mongol', 0.6084), ('adversaire dplace espece mongol apres', 0.6026), ('vraiment voix mongol', 0.5937), ('tont exclu parce quon voyait trop oreilles mongol', 0.5933)]


In [None]:
indkeywords, indbigram, tfidfind = keybertextract(df[df['group'] == 'individual']['tweet'])
print(f"Unigram Keywords:", indkeywords)
print(f"Bigram Keywords:", indbigram)
print(f"TfIdf Keywords:", tfidfind)

Unigram Keywords: [('mongole', 0.5606), ('mongolito', 0.5311), ('mongolece', 0.5294), ('mongol', 0.5274), ('mongolien', 0.5223)]
Bigram Keywords: [('doit mongol', 0.6572), ('mongol connaissait', 0.6559), ('traite mongol', 0.6455), ('mongol tveu', 0.6421), ('juste mongol', 0.6391)]
TfIdf Keywords: [('pense mongol non stephanois', 0.6747), ('lis tweet espce mongol avant ouvrir vieille gueule', 0.672), ('frere arrete faire mongol clasico con quoi', 0.6699), ('dit mongolien mongol tte noeud', 0.6632), ('dguisement chien mongol', 0.6597)]


In [None]:
othkeywords, othbigram, tfidfoth = keybertextract(df[df['group'] == 'other']['tweet'])
print(f"Unigram Keywords:", othkeywords)
print(f"Bigram Keywords:", othbigram)
print(f"TfIdf Keywords:", tfidfoth)

Unigram Keywords: [('vrais', 0.4201), ('politiques', 0.4183), ('militantiste', 0.413), ('communiste', 0.4047), ('communisme', 0.3944)]
Bigram Keywords: [('lavoement racistes', 0.5672), ('franaise terrorisme', 0.5443), ('odieux ngationnistes', 0.5285), ('macron terrorisme', 0.5246), ('moins terrorisme', 0.5221)]
TfIdf Keywords: [('hommage terroriste anti franais vraiment dbile profond', 0.5909), ('quand fait choc civilisations base lutte contre terrorisme', 0.5744), ('offense terrorisme voir dirigeants font rien quant monte', 0.5572), ('vraiment voix mongol', 0.5395), ('terrorisme intellectuel encore beaux jours devant navrant', 0.537)]


In [None]:
adkeywords, adbigram, tfidfad = keybertextract(df[df['group'] == 'african_descent']['tweet'])
print(f"Unigram Keywords:", adkeywords)
print(f"Bigram Keywords:", adbigram)
print(f"TfIdf Keywords:", tfidfad)

Unigram Keywords: [('renois', 0.5324), ('renoistu', 0.5208), ('renoi', 0.4924), ('mongolarrive', 0.3983), ('saiment', 0.387)]
Bigram Keywords: [('renois vraiment', 0.6559), ('vraiment renois', 0.6558), ('renois saiment', 0.6302), ('renois fait', 0.6193), ('renois maiment', 0.6182)]
TfIdf Keywords: [('renois saiment rellement', 0.6451), ('aimer trop courir quand rigolez renois', 0.6423), ('tout renois dise', 0.6241), ('pute haine renois plains quand renoi traite', 0.6155), ('renois tout renois', 0.6148)]


In [None]:
arkeywords, arbigram, tfidfar = keybertextract(df[df['group'] == 'arabs']['tweet'])
print(f"Unigram Keywords:", arkeywords)
print(f"Bigram Keywords:", arbigram)
print(f"TfIdf Keywords:", tfidfar)

Unigram Keywords: [('arabe', 0.5637), ('arabes', 0.498), ('arabi', 0.4879), ('arabie', 0.4772), ('arabo', 0.47)]
Bigram Keywords: [('sale arabe', 0.6801), ('arabe sale', 0.6684), ('sales arabe', 0.6651), ('arabes sale', 0.6541), ('arabe relve', 0.6347)]
TfIdf Keywords: [('repasse vtements sale arabe fais', 0.7032), ('exemple sale arabe merde', 0.7018), ('mddrrrr parle mieux sale arabe', 0.7013), ('dire sale arabe', 0.6957), ('rue vient traiter sale arabe arabe', 0.6956)]


#BerTopic: suggested pipeline

In [None]:
data = df['tweet'].values.tolist()

In [None]:
sentences = [sent_tokenize(piece) for piece in data]
sentences = [sentence for doc in sentences for sentence in doc]
# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(sentences, show_progress_bar=True)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [None]:
topic_model_new = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=KeyphraseTfidfVectorizer(),

  # Hyperparameters
  top_n_words=10,
  verbose=True,
  language = 'french'
)

# Train model
topicsnew, probsnew = topic_model_new.fit_transform(sentences, embeddings)

In [None]:
# Show topics
topic_model_new.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,654,-1_ching chong_renois_rebeus_attard,"[ching chong, renois, rebeus, attard, font, ga...","[mdr jrigolai chinoi ching chong, passage mari..."
1,0,663,0_attard_attarde_cet attard_contre,"[attard, attarde, cet attard, contre, tre atta...","[attard moroni, mathilde normal bien tte intel..."
2,1,636,1_mongol_mongol mongol_gros_fait,"[mongol, mongol mongol, gros, fait, comme, vra...","[mongol, rpond propre tweet mongol, voit gres ..."
3,2,473,2_arabe_sale_sale arabe_arabes,"[arabe, sale, sale arabe, arabes, noirs, noir,...",[zizou sale arabe dteste cest possible prends ...
4,3,415,3_gauchiste_gauchistes_gauche_pauvres,"[gauchiste, gauchistes, gauche, pauvres, etre,...","[frise totalitarisme syndicat gauchiste quoi, ..."
5,4,322,4_renois_tous renois_meufs_ya,"[renois, tous renois, meufs, ya, aiment trop, ...","[grace dieu finie verra renois rebeux blonds, ..."
6,5,277,5_terrorisme_islam_terroristes_gauchiste,"[terrorisme, islam, terroristes, gauchiste, fr...","[islam gt islamisme gt terrorisme, islam inter..."
7,6,244,6_migrants_migrants africains_rfugis migrants_...,"[migrants, migrants africains, rfugis migrants...",[appelle cela vague dferlantes 2 migrants clan...
8,7,141,7_violence_peu_trop violence_lutter contre vio...,"[violence, peu, trop violence, lutter contre v...",[cole vandalise marseille marseille ecole viol...
9,8,135,8_rebeus_tous rebeus_jai_faire diffrence cultu...,"[rebeus, tous rebeus, jai, faire diffrence cul...","[heuuuu copines rebeus grandes pinces, sinon b..."


In [None]:
topic_labels = topic_model_new.generate_topic_labels(nr_words=1,topic_prefix=False,word_length=10,separator=", ")
topic_model_new.set_topic_labels(topic_labels)

In [None]:
topic_model_new.visualize_barchart(n_words=8, width=500, height=500, top_n_topics=11, custom_labels= True)

In [None]:
topic_model_new.visualize_topics()

In [None]:
similar_topics, similarity = topic_model_new.find_topics("woman", top_n=5)
topic_model_new.get_topic(similar_topics[0])

[('ching chong', 0.05878491649272215),
 ('renois', 0.04935253985199288),
 ('rebeus', 0.04558022422204156),
 ('attard', 0.04476479412080349),
 ('font', 0.036026691417741966),
 ('gauchiste', 0.03239211709427859),
 ('contre', 0.03024194674380739),
 ('contre avoement', 0.025924646282354774),
 ('adolescent attard', 0.02431384417182657),
 ('comme', 0.023137339011500536)]