# TP Text Mining 

## Charger le jeu de données

In [1]:
import pandas as pd

df = pd.read_csv('youtoxic_english_1000.csv')
df.head()
print(df["Text"])

0      If only people would just take a step back and...
1      Law enforcement is not trained to shoot to app...
2      \nDont you reckon them 'black lives matter' ba...
3      There are a very large number of people who do...
4      The Arab dude is absolutely right, he should h...
                             ...                        
995    I remember that they sent in the national defe...
996    Stats don`t represent the problem. Race baitin...
997    The quote from the mother... Wow that hit hard...
998                              this video is so racist
999        God, the narrator has such an annoying lisp. 
Name: Text, Length: 1000, dtype: object


# Nettoyage du texte

## suppression de ponctuation 


In [2]:
import pandas as pd
import string

def remove_punctuation(text):
    text = ''.join([char for char in text if char not in string.punctuation])
    return text

text= df['Text'].apply(remove_punctuation)
print(text)

0      If only people would just take a step back and...
1      Law enforcement is not trained to shoot to app...
2      \nDont you reckon them black lives matter bann...
3      There are a very large number of people who do...
4      The Arab dude is absolutely right he should ha...
                             ...                        
995    I remember that they sent in the national defe...
996    Stats dont represent the problem Race baiting ...
997    The quote from the mother Wow that hit hard Ve...
998                              this video is so racist
999          God the narrator has such an annoying lisp 
Name: Text, Length: 1000, dtype: object


## Convertir en minuscules

In [3]:
import pandas as pd

def convert_to_lowercase(text):
    return text.lower()

text1 = text.apply(convert_to_lowercase)
print(text1)

0      if only people would just take a step back and...
1      law enforcement is not trained to shoot to app...
2      \ndont you reckon them black lives matter bann...
3      there are a very large number of people who do...
4      the arab dude is absolutely right he should ha...
                             ...                        
995    i remember that they sent in the national defe...
996    stats dont represent the problem race baiting ...
997    the quote from the mother wow that hit hard ve...
998                              this video is so racist
999          god the narrator has such an annoying lisp 
Name: Text, Length: 1000, dtype: object


## Suppression des mots vides

Suppression des mots vides à l'aide d'une liste de mots vides standard disponible dans NLTK pour l'anglais

In [4]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

text2 = text1.apply(remove_stopwords)
print(text2)

0      people would take step back make case wasnt an...
1      law enforcement trained shoot apprehend traine...
2      dont reckon black lives matter banners held wh...
3      large number people like police officers calle...
4      arab dude absolutely right shot 6 extra time s...
                             ...                        
995                       remember sent national defence
996    stats dont represent problem race baiting atti...
997                   quote mother wow hit hard accurate
998                                         video racist
999                           god narrator annoying lisp
Name: Text, Length: 1000, dtype: object


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\x1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
## supprimer les liens 

In [6]:
import pandas as pd
import re

def remove_urls(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    return text

text3 = text2.apply(remove_urls)
print(text3)

0      people would take step back make case wasnt an...
1      law enforcement trained shoot apprehend traine...
2      dont reckon black lives matter banners held wh...
3      large number people like police officers calle...
4      arab dude absolutely right shot 6 extra time s...
                             ...                        
995                       remember sent national defence
996    stats dont represent problem race baiting atti...
997                   quote mother wow hit hard accurate
998                                         video racist
999                           god narrator annoying lisp
Name: Text, Length: 1000, dtype: object


## supprimer les nombres 

In [7]:
import re

def remove_numbers(text):
    text = re.sub(r'\d+', '', text)
    return text

text4=text3.apply(remove_numbers)
print(text4)

0      people would take step back make case wasnt an...
1      law enforcement trained shoot apprehend traine...
2      dont reckon black lives matter banners held wh...
3      large number people like police officers calle...
4      arab dude absolutely right shot  extra time sh...
                             ...                        
995                       remember sent national defence
996    stats dont represent problem race baiting atti...
997                   quote mother wow hit hard accurate
998                                         video racist
999                           god narrator annoying lisp
Name: Text, Length: 1000, dtype: object


 ## Lemmatisation

In [8]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

text5= text4.apply(lemmatize_text)
print(text5)

0      people would take step back make case wasnt an...
1      law enforcement trained shoot apprehend traine...
2      dont reckon black life matter banner held whit...
3      large number people like police officer called...
4      arab dude absolutely right shot extra time sho...
                             ...                        
995                       remember sent national defence
996    stats dont represent problem race baiting atti...
997                   quote mother wow hit hard accurate
998                                         video racist
999                           god narrator annoying lisp
Name: Text, Length: 1000, dtype: object


##  Tokenisation

In [9]:
import pandas as pd
import nltk

nltk.download('punkt')

def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    return tokens

text5 = text4.apply(tokenize_text)
print(text5)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\x1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0      [people, would, take, step, back, make, case, ...
1      [law, enforcement, trained, shoot, apprehend, ...
2      [dont, reckon, black, lives, matter, banners, ...
3      [large, number, people, like, police, officers...
4      [arab, dude, absolutely, right, shot, extra, t...
                             ...                        
995                  [remember, sent, national, defence]
996    [stats, dont, represent, problem, race, baitin...
997            [quote, mother, wow, hit, hard, accurate]
998                                      [video, racist]
999                      [god, narrator, annoying, lisp]
Name: Text, Length: 1000, dtype: object


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Convertir les tokens en une seule chaîne de caractères
text6 = text5.apply(lambda tokens: ' '.join(tokens))

# Créer une instance de TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Appliquer TF-IDF sur la colonne 'text5'
tfidf_matrix = tfidf_vectorizer.fit_transform(text6)

# Afficher les résultats
print(tfidf_matrix)

  (0, 1236)	0.09997292753143733
  (0, 214)	0.07287618202013336
  (0, 2741)	0.05527932215498991
  (0, 2643)	0.058381340759877866
  (0, 31)	0.08979752328021806
  (0, 19)	0.09574975319685995
  (0, 1801)	0.08753459049071222
  (0, 2998)	0.1059251574480792
  (0, 2408)	0.06827168740872576
  (0, 3699)	0.06477181233235305
  (0, 2199)	0.05727288949247191
  (0, 2134)	0.066668641229135
  (0, 3336)	0.07539894469442145
  (0, 3119)	0.09247400469155617
  (0, 2123)	0.09574975319685995
  (0, 398)	0.09247400469155617
  (0, 4516)	0.07287618202013336
  (0, 1309)	0.06123155207163181
  (0, 3086)	0.09574975319685995
  (0, 3542)	0.08753459049071222
  (0, 37)	0.07539894469442145
  (0, 130)	0.0671837819882737
  (0, 2742)	0.0736698891123569
  (0, 4007)	0.06433357067651978
  (0, 114)	0.06827168740872576
  :	:
  (996, 303)	0.22388811980923648
  (996, 2312)	0.17630257415418826
  (996, 380)	0.20467890755753648
  (996, 3725)	0.18342994388860634
  (996, 4128)	0.19605236002467874
  (996, 3900)	0.17851790506098084
  (996

In [11]:
# Accéder à la deuxième ligne de la matrice TF-IDF
row_indices = tfidf_matrix.indices[tfidf_matrix.indptr[1]:tfidf_matrix.indptr[2]]
row_values = tfidf_matrix.data[tfidf_matrix.indptr[1]:tfidf_matrix.indptr[2]]

# Afficher les indices et les valeurs correspondantes
for index, value in zip(row_indices, row_values):
    print(f"({index}, {value})")

(399, 0.24043170073906378)
(3128, 0.27205677164213876)
(2183, 0.23433928649000713)
(4420, 0.2219322201781172)
(2180, 0.2219322201781172)
(188, 0.27205677164213876)
(3580, 0.402620496829083)
(4118, 0.5748168589393047)
(1262, 0.25143479987856304)
(2233, 0.21218783074046524)
(4007, 0.1999481970310058)


In [12]:
from sklearn.cluster import KMeans

# Créer une instance de K-means
kmeans = KMeans(2)

# Ajuster le modèle aux données TF-IDF
kmeans.fit(tfidf_matrix)

# Prédire les clusters pour les données TF-IDF
cluster_labels = kmeans.predict(tfidf_matrix)

# Afficher les étiquettes de cluster attribuées à chaque document
for idx, label in enumerate(cluster_labels):
    print(f"Commantaire {idx}: Cluster {label}")


Commantaire 0: Cluster 0
Commantaire 1: Cluster 0
Commantaire 2: Cluster 0
Commantaire 3: Cluster 0
Commantaire 4: Cluster 0
Commantaire 5: Cluster 0
Commantaire 6: Cluster 0
Commantaire 7: Cluster 0
Commantaire 8: Cluster 0
Commantaire 9: Cluster 0
Commantaire 10: Cluster 0
Commantaire 11: Cluster 0
Commantaire 12: Cluster 0
Commantaire 13: Cluster 0
Commantaire 14: Cluster 0
Commantaire 15: Cluster 0
Commantaire 16: Cluster 0
Commantaire 17: Cluster 0
Commantaire 18: Cluster 0
Commantaire 19: Cluster 0
Commantaire 20: Cluster 0
Commantaire 21: Cluster 0
Commantaire 22: Cluster 0
Commantaire 23: Cluster 0
Commantaire 24: Cluster 0
Commantaire 25: Cluster 0
Commantaire 26: Cluster 0
Commantaire 27: Cluster 0
Commantaire 28: Cluster 0
Commantaire 29: Cluster 0
Commantaire 30: Cluster 0
Commantaire 31: Cluster 0
Commantaire 32: Cluster 0
Commantaire 33: Cluster 0
Commantaire 34: Cluster 0
Commantaire 35: Cluster 0
Commantaire 36: Cluster 0
Commantaire 37: Cluster 0
Commantaire 38: Cluste

In [16]:
print(df["IsRacist"])

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996     True
997    False
998    False
999    False
Name: IsRacist, Length: 1000, dtype: bool


In [17]:
import numpy as np 

#convertir False to 0 et True to 1 
Racist=[]
for i in range(len(df["IsRacist"])):
    if df["IsRacist"][i]==False:
        Racist.append(1)
    else:
        Racist.append(0)
        
#comparer le result et la colone isRacist
comp=0
for i in range(len(Racist)):
    if toxic[i]!=cluster_labels[i]:
        comp=comp+1

print("Le nombre de commentaires racistes : ", len(Racist))
print("Le nombre de similarités : ", comp)

Le nombre de commentaires racistes :  1000
Le nombre de similarités :  854


In [15]:
racist_percentage = (comp/len(Racist)) * 100
print("On a", racist_percentage, "% de commentaires racistes.")

On a 85.39999999999999 % de commentaires racistes.
