# TP Text Mining 

## Charger le jeu de données

In [166]:
import pandas as pd

df = pd.read_csv('youtoxic_english_1000.csv')
df.head()
print(df["Text"])

0      If only people would just take a step back and...
1      Law enforcement is not trained to shoot to app...
2      \nDont you reckon them 'black lives matter' ba...
3      There are a very large number of people who do...
4      The Arab dude is absolutely right, he should h...
                             ...                        
995    I remember that they sent in the national defe...
996    Stats don`t represent the problem. Race baitin...
997    The quote from the mother... Wow that hit hard...
998                              this video is so racist
999        God, the narrator has such an annoying lisp. 
Name: Text, Length: 1000, dtype: object


# Nettoyage du texte

## suppression de ponctuation 


In [167]:
import pandas as pd
import string

def remove_punctuation(text):
    text = ''.join([char for char in text if char not in string.punctuation])
    return text

text= df['Text'].apply(remove_punctuation)
print(text1)

0      if only people would just take a step back and...
1      law enforcement is not trained to shoot to app...
2      \ndont you reckon them black lives matter bann...
3      there are a very large number of people who do...
4      the arab dude is absolutely right he should ha...
                             ...                        
995    i remember that they sent in the national defe...
996    stats dont represent the problem race baiting ...
997    the quote from the mother wow that hit hard ve...
998                              this video is so racist
999          god the narrator has such an annoying lisp 
Name: Text, Length: 1000, dtype: object


## Convertir en minuscules

In [168]:
import pandas as pd

def convert_to_lowercase(text):
    return text.lower()

text1 = text.apply(convert_to_lowercase)
print(text1)

0      if only people would just take a step back and...
1      law enforcement is not trained to shoot to app...
2      \ndont you reckon them black lives matter bann...
3      there are a very large number of people who do...
4      the arab dude is absolutely right he should ha...
                             ...                        
995    i remember that they sent in the national defe...
996    stats dont represent the problem race baiting ...
997    the quote from the mother wow that hit hard ve...
998                              this video is so racist
999          god the narrator has such an annoying lisp 
Name: Text, Length: 1000, dtype: object


## Suppression des mots vides

Suppression des mots vides à l'aide d'une liste de mots vides standard disponible dans NLTK pour l'anglais

In [169]:
from nltk.corpus import stopwords
nltk.download('stopwords')

def remove_stopwords(text):
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

text2 = text1.apply(remove_stopwords)
print(text2)

0      people would take step back make case wasnt an...
1      law enforcement trained shoot apprehend traine...
2      dont reckon black lives matter banners held wh...
3      large number people like police officers calle...
4      arab dude absolutely right shot 6 extra time s...
                             ...                        
995                       remember sent national defence
996    stats dont represent problem race baiting atti...
997                   quote mother wow hit hard accurate
998                                         video racist
999                           god narrator annoying lisp
Name: Text, Length: 1000, dtype: object


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\x1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## supprimer les liens 

In [170]:
import pandas as pd
import re

def remove_urls(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    return text

text3 = text2.apply(remove_urls)
print(text3)

0      people would take step back make case wasnt an...
1      law enforcement trained shoot apprehend traine...
2      dont reckon black lives matter banners held wh...
3      large number people like police officers calle...
4      arab dude absolutely right shot 6 extra time s...
                             ...                        
995                       remember sent national defence
996    stats dont represent problem race baiting atti...
997                   quote mother wow hit hard accurate
998                                         video racist
999                           god narrator annoying lisp
Name: Text, Length: 1000, dtype: object


## supprimer les nombres 

In [171]:
import re

def remove_numbers(text):
    text = re.sub(r'\d+', '', text)
    return text

text4=text3.apply(remove_numbers)
print(text4)

0      people would take step back make case wasnt an...
1      law enforcement trained shoot apprehend traine...
2      dont reckon black lives matter banners held wh...
3      large number people like police officers calle...
4      arab dude absolutely right shot  extra time sh...
                             ...                        
995                       remember sent national defence
996    stats dont represent problem race baiting atti...
997                   quote mother wow hit hard accurate
998                                         video racist
999                           god narrator annoying lisp
Name: Text, Length: 1000, dtype: object


 ## Lemmatisation

In [172]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

text5 = text4.apply(lemmatize_text)
print(text5)

0      people would take step back make case wasnt an...
1      law enforcement trained shoot apprehend traine...
2      dont reckon black life matter banner held whit...
3      large number people like police officer called...
4      arab dude absolutely right shot extra time sho...
                             ...                        
995                       remember sent national defence
996    stats dont represent problem race baiting atti...
997                   quote mother wow hit hard accurate
998                                         video racist
999                           god narrator annoying lisp
Name: Text, Length: 1000, dtype: object


##  Tokenisation

In [178]:
import pandas as pd
import nltk

nltk.download('punkt')

def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    return tokens

text5 = text4.apply(tokenize_text)
print(text5)

0      [people, would, take, step, back, make, case, ...
1      [law, enforcement, trained, shoot, apprehend, ...
2      [dont, reckon, black, lives, matter, banners, ...
3      [large, number, people, like, police, officers...
4      [arab, dude, absolutely, right, shot, extra, t...
                             ...                        
995                  [remember, sent, national, defence]
996    [stats, dont, represent, problem, race, baitin...
997            [quote, mother, wow, hit, hard, accurate]
998                                      [video, racist]
999                      [god, narrator, annoying, lisp]
Name: Text, Length: 1000, dtype: object


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\x1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
