In [22]:
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer



In [33]:
def supprimer_les_non_alphabet(words: list[str]) -> list[str]:
    """Keep only the tokens that contain at least one ASCII letter.

    Args:
        words: Tokens to filter.

    Returns:
        A new list, in the original order, with every token in which the
        pattern ``[a-zA-Z]`` matches somewhere. Tokens without any such
        character (digits only, punctuation only, empty strings) are dropped.
    """
    return [token for token in words if re.search(r'[a-zA-Z]', token) is not None]


In [34]:
def remove_stop_words(words: list[str]) -> list[str]:
    """Drop English stop words from a sequence of tokens.

    Each element is compared, by exact string equality, with the entries of
    ``stopwords.words('english')``; elements that are not in that list are kept.

    Args:
        words: Tokens to filter.

    Returns:
        A new list with the remaining tokens, in their original order.
    """
    # Build the lookup once per call instead of calling stopwords.words() for
    # every token; a set also makes each membership test constant-time.
    english_stop_words = set(stopwords.words('english'))
    return [word for word in words if word not in english_stop_words]


In [35]:
def supprimer_les_junk(comments: list[list[str]]) -> list[list[str]]:
    """Remove stop words and letter-free tokens from every tokenized comment.

    For each comment, ``remove_stop_words`` is applied first and
    ``supprimer_les_non_alphabet`` is applied to its result.

    Args:
        comments: One list of tokens per comment.

    Returns:
        A new list with one filtered token list per input comment, in the
        same order. A comment whose tokens are all removed yields ``[]``.
    """
    # The previous body called `remove_non_alphabetical_words`, a name that is
    # not defined in this notebook; the defined helper is used instead.
    return [
        supprimer_les_non_alphabet(remove_stop_words(comment))
        for comment in comments
    ]


In [36]:
def tokenize_comments(comments: list[str]) -> list[list[str]]:
    """Tokenize each comment with ``word_tokenize``.

    Args:
        comments: Comment texts.

    Returns:
        A list with, for every comment and in the same order, the token list
        that ``word_tokenize`` returns for it.
    """
    return [word_tokenize(text) for text in comments]

In [37]:
def lower_texte(comments: list[str]) -> list[str]:
    """Lower-case every comment.

    Args:
        comments: Comment texts.

    Returns:
        A new list holding ``comment.lower()`` for each input comment, in the
        same order.
    """
    return [text.lower() for text in comments]

In [38]:
def comments_preprocessing(comments: pd.Series) -> list[list[str]]:
    """Returns list of words for each comment.

    The comments are lower-cased (``lower_texte``), tokenized
    (``tokenize_comments``) and then filtered (``supprimer_les_junk``).

    Args:
        comments: Series of comment texts.

    Returns:
        One list of remaining tokens per comment, in the Series' order.
    """
    # The previous body called `lower_comments` and `remove_junk_from_comments`,
    # names that are not defined in this notebook; the helpers that are
    # defined here are used instead.
    raw_comments = comments.values.tolist()
    lowercase_comments = lower_texte(raw_comments)
    tokenized_texts = tokenize_comments(lowercase_comments)
    return supprimer_les_junk(tokenized_texts)

In [39]:
 df = pd.read_csv('youtoxic_english_1000.csv')
comment=df['Text']
comment

0      If only people would just take a step back and...
1      Law enforcement is not trained to shoot to app...
2      \nDont you reckon them 'black lives matter' ba...
3      There are a very large number of people who do...
4      The Arab dude is absolutely right, he should h...
                             ...                        
995    I remember that they sent in the national defe...
996    Stats don`t represent the problem. Race baitin...
997    The quote from the mother... Wow that hit hard...
998                              this video is so racist
999        God, the narrator has such an annoying lisp. 
Name: Text, Length: 1000, dtype: object

In [40]:
# NOTE(review): remove_stop_words compares each element with the stop-word
# list, and each element of `comment` is a whole comment string, so only a
# comment that is exactly one stop word would be dropped here — confirm that
# this step is intended before tokenization.
r = remove_stop_words(comment)

In [41]:
# Preview only the first few entries instead of dumping the whole list.
r[:5]

["If only people would just take a step back and not make this case about them, because it wasn't about anyone except the two people in that situation.\xa0 To lump yourself into this mess and take matters into your own hands makes these kinds of protests selfish and without rational thought and investigation.\xa0 The guy in this video is heavily emotional and hyped up and wants to be heard, and when he gets heard he just presses more and more.\xa0 He was never out to have a reasonable discussion.\xa0 Kudos to the Smerconish for keeping level the whole time and letting Masri make himself out to be a fool.\xa0 How dare he and those that tore that city down in protest make this about themselves and to dishonor the entire incident with their own hate.\xa0 By the way, since when did police brutality become an epidemic?\xa0 I wish everyone would just stop pretending like they were there and they knew EXACTLY what was going on, because there's no measurable amount of people that honestly witn

In [42]:
import re

In [43]:
# NOTE(review): the elements of `r` are whole comment strings, so this keeps
# every comment containing at least one ASCII letter and drops the others —
# confirm that dropping whole comments is intended.
ri = supprimer_les_non_alphabet(r)

In [44]:
# Preview only the first few entries instead of dumping the whole list.
ri[:5]

["If only people would just take a step back and not make this case about them, because it wasn't about anyone except the two people in that situation.\xa0 To lump yourself into this mess and take matters into your own hands makes these kinds of protests selfish and without rational thought and investigation.\xa0 The guy in this video is heavily emotional and hyped up and wants to be heard, and when he gets heard he just presses more and more.\xa0 He was never out to have a reasonable discussion.\xa0 Kudos to the Smerconish for keeping level the whole time and letting Masri make himself out to be a fool.\xa0 How dare he and those that tore that city down in protest make this about themselves and to dishonor the entire incident with their own hate.\xa0 By the way, since when did police brutality become an epidemic?\xa0 I wish everyone would just stop pretending like they were there and they knew EXACTLY what was going on, because there's no measurable amount of people that honestly witn

In [45]:
# Lower-case every remaining comment.
l = lower_texte(ri)

In [46]:
# Preview only the first few entries instead of dumping the whole list.
l[:5]

["if only people would just take a step back and not make this case about them, because it wasn't about anyone except the two people in that situation.\xa0 to lump yourself into this mess and take matters into your own hands makes these kinds of protests selfish and without rational thought and investigation.\xa0 the guy in this video is heavily emotional and hyped up and wants to be heard, and when he gets heard he just presses more and more.\xa0 he was never out to have a reasonable discussion.\xa0 kudos to the smerconish for keeping level the whole time and letting masri make himself out to be a fool.\xa0 how dare he and those that tore that city down in protest make this about themselves and to dishonor the entire incident with their own hate.\xa0 by the way, since when did police brutality become an epidemic?\xa0 i wish everyone would just stop pretending like they were there and they knew exactly what was going on, because there's no measurable amount of people that honestly witn

In [47]:
# Download NLTK's 'punkt' package via the NLTK downloader.
# NOTE(review): presumably required by word_tokenize in the next cells — confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [48]:
# Split every lower-cased comment into a list of tokens.
ju = tokenize_comments(l)

In [49]:
# Preview only the first two tokenized comments instead of dumping all of them.
ju[:2]

[['if',
  'only',
  'people',
  'would',
  'just',
  'take',
  'a',
  'step',
  'back',
  'and',
  'not',
  'make',
  'this',
  'case',
  'about',
  'them',
  ',',
  'because',
  'it',
  'was',
  "n't",
  'about',
  'anyone',
  'except',
  'the',
  'two',
  'people',
  'in',
  'that',
  'situation',
  '.',
  'to',
  'lump',
  'yourself',
  'into',
  'this',
  'mess',
  'and',
  'take',
  'matters',
  'into',
  'your',
  'own',
  'hands',
  'makes',
  'these',
  'kinds',
  'of',
  'protests',
  'selfish',
  'and',
  'without',
  'rational',
  'thought',
  'and',
  'investigation',
  '.',
  'the',
  'guy',
  'in',
  'this',
  'video',
  'is',
  'heavily',
  'emotional',
  'and',
  'hyped',
  'up',
  'and',
  'wants',
  'to',
  'be',
  'heard',
  ',',
  'and',
  'when',
  'he',
  'gets',
  'heard',
  'he',
  'just',
  'presses',
  'more',
  'and',
  'more',
  '.',
  'he',
  'was',
  'never',
  'out',
  'to',
  'have',
  'a',
  'reasonable',
  'discussion',
  '.',
  'kudos',
  'to',
  'the

In [50]:
# Filter each tokenized comment with supprimer_les_junk.
junk = supprimer_les_junk(ju)

In [51]:
# Preview only the first two filtered comments instead of dumping all of them.
junk[:2]

[['people',
  'would',
  'take',
  'step',
  'back',
  'make',
  'case',
  "n't",
  'anyone',
  'except',
  'two',
  'people',
  'situation',
  'lump',
  'mess',
  'take',
  'matters',
  'hands',
  'makes',
  'kinds',
  'protests',
  'selfish',
  'without',
  'rational',
  'thought',
  'investigation',
  'guy',
  'video',
  'heavily',
  'emotional',
  'hyped',
  'wants',
  'heard',
  'gets',
  'heard',
  'presses',
  'never',
  'reasonable',
  'discussion',
  'kudos',
  'smerconish',
  'keeping',
  'level',
  'whole',
  'time',
  'letting',
  'masri',
  'make',
  'fool',
  'dare',
  'tore',
  'city',
  'protest',
  'make',
  'dishonor',
  'entire',
  'incident',
  'hate',
  'way',
  'since',
  'police',
  'brutality',
  'become',
  'epidemic',
  'wish',
  'everyone',
  'would',
  'stop',
  'pretending',
  'like',
  'knew',
  'exactly',
  'going',
  "'s",
  'measurable',
  'amount',
  'people',
  'honestly',
  'witnessed',
  'incident',
  'none',
  'us',
  'clue',
  'way',
  'whole',
  