- **Importation**

In [1]:
import os
import pandas as pd
import re
import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
stop_words = set(stopwords.words('english'))

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to C:\Users\Imad Eddine
[nltk_data]     Hajjane\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Imad Eddine
[nltk_data]     Hajjane\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Imad Eddine
[nltk_data]     Hajjane\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Imad Eddine
[nltk_data]     Hajjane\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
pd.set_option('display.max_colwidth', None)

In [4]:
CONTENT_PATH_CSV = os.path.join("content", "clean_tweet.csv")
RESULT_PATH_CSV = os.path.join("..", "result", "clean_tweet.csv")

In [7]:
def removeSpeCara(s:str):
    return re.sub(r"[^a-zA-Z]", "", s) 

#retirer les url
def remove_url(s:str)->str:
    url_pattern = re.compile(r"http?://\S+|https?://\S+|www\.\S+|//S+")
    return url_pattern.sub("r", s)

#retirer les html 
def remove_html(s:str)->str:
    html_pattern = re.compile(r"<.*?>")
    return html_pattern.sub("r", s)

# retirer les emojies
def remove_emoji(s:str)->str:
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF" 
        u"\U0001F680-\U0001F6FF"  
        u"\U0001F1E0-\U0001F1FF"  
        u"\U00002500-\U00002BEF"  
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
                      "]+", re.UNICODE)
    return emoji_pattern.sub("r", s)

def clean_and_lemmatize_string(s:str):
    l = []
    s_ = " ".join([remove_html(remove_url(word)) for word in s.split()])
    for word in word_tokenize(s_):
        word_ = removeSpeCara((remove_emoji(word)))
        if not word_ in stop_words:
            if len(word_) > 2:
                l.append(word_.lower())
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    lemma_function = WordNetLemmatizer()
    return " ".join([lemma_function.lemmatize(token, tag_map[tag[0]]) for token, tag in pos_tag(l)])

- **Préparation : Recueillir les données, vérifier la qualité des données, s'assurer que les données sont compatibles avec le logiciel d'indexation utilisé**.

In [8]:
dataframe = pd.read_csv(os.path.join("..", "tweets_01-08-2021.csv"))[["id", "text"]]
dataframe

Unnamed: 0,id,text
0,98454970654916608,Republicans and Democrats have both created our economic problems.
1,1234653427789070336,"I was thrilled to be back in the Great city of Charlotte, North Carolina with thousands of hardworking American Patriots who love our Country, cherish our values, respect our laws, and always put AMERICA FIRST! Thank you for a wonderful evening!! #KAG2020 https://t.co/dNJZfRsl9y"
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance court obtained by CBS News questions where there will be further disciplinary action and cho…
3,1304875170860015617,"The Unsolicited Mail In Ballot Scam is a major threat to our Democracy, &amp; the Democrats know it. Almost all recent elections using this system, even though much smaller &amp; with far fewer Ballots to count, have ended up being a disaster. Large numbers of missing Ballots &amp; Fraud!"
4,1218159531554897920,RT @MZHemingway: Very friendly telling of events here about Comey's apparent leaking to compliant media. If you read those articles and tho…
...,...,...
56566,1319485303363571714,RT @RandPaul: I don’t know why @JoeBiden thinks he can continue to lie about this. \n\nHe wants to ban fracking and end all fossil fuels like…
56567,1319484210101379072,RT @EliseStefanik: President @realDonaldTrump excels at communicating directly to the American people. \n\nJoe Biden communicates to the DC B…
56568,1319444420861829121,RT @TeamTrump: LIVE: Presidential Debate #Debates2020\n\nText VOTE to 88022 https://t.co/UeQOquVxR2
56569,1319384118849949702,Just signed an order to support the workers of Delphi Corporation and make sure that we protect the pensions of all American workers! Obama-Biden FAILED American workers and FAILED the workers of Delphi. I ALWAYS put American workers FIRST!


- **Suppression des urls et htmls**
- **Tokenisation : Division  des documents en unités de traitement.**
- **Nettoyage des données : Supprimer les données dupliquées, corriger les erreurs de saisie,supprimer les caractères spéciaux, normaliser les données (uniformisation de la casse)**.
- **Segmentation et Lemmatisation : Réduire d'un mot à sa forme de base ou racine.**

In [9]:
%%time
tqdm.pandas()
dataframe["text"] = dataframe["text"].progress_apply(lambda s : clean_and_lemmatize_string(s))
dataframe = dataframe[dataframe["text"].str.len() >= 3]

100%|██████████| 56571/56571 [00:45<00:00, 1249.08it/s]

CPU times: total: 42.2 s
Wall time: 45.3 s





In [10]:
dataframe

Unnamed: 0,id,text
0,98454970654916608,republican democrat create economic problem
1,1234653427789070336,thrill back great city charlotte north carolina thousand hardworking american patriot love country cherish value respect law always put america first thank wonderful evening kag
2,1218010753434820614,cbsherridge read letter surveillance court obtain cbs news question disciplinary action cho
3,1304875170860015617,the unsolicited mail ballot scam major threat democracy amp democrat know almost recent election use system even though much small amp far few ballots count end disaster large number miss ballot amp fraud
4,1218159531554897920,mzhemingway very friendly tell event comey apparent leaking compliant medium read article tho
...,...,...
56566,1319485303363571714,randpaul know joebiden think continue lie want ban fracking end fossil fuel like
56567,1319484210101379072,elisestefanik president realdonaldtrump excels communicate directly american people joe biden communicates
56568,1319444420861829121,teamtrump live presidential debate debate text vote
56569,1319384118849949702,just sign order support worker delphi corporation make sure protect pension american worker obamabiden fail american worker fail worker delphi always put american worker first


In [11]:
dataframe.to_csv(CONTENT_PATH_CSV, index=False)
dataframe.to_csv(RESULT_PATH_CSV, index=False)