# Preprocessing

L'installation du package "contractions" peut poser problème. En particulier, "!pip install contractions" n'a pas fonctionné ici (l'installation conjointe du package pyahocorasick posant problème).

La solution qui a fonctionné est la suivante : dans le terminal Anaconda Prompt, effectuer l'installation de pyahocorasick via "pip install pyahocorasick", puis installer contractions via "pip install contractions".


In [12]:
import os
import re
import tempfile
import time
import zipfile

import contractions
import nltk
import pandas as pd
import requests
from IPython import get_ipython
from IPython.display import display
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Let pandas display up to 400 characters per column so that tweet texts are
# not truncated when DataFrames are shown.
pd.set_option('max_colwidth', 400)

# Fetch the NLTK resources needed by the tokenizer, POS tagger, lemmatizer and
# stop-word list used in this file. As the cell output shows, NLTK reports
# packages that are already up to date instead of downloading them again.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
def get_wordnet_pos(tag):
    """Return the WordNet part-of-speech constant matching a POS tag.

    The decision is made on the first letter of ``tag``:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.
    Any other tag (including an empty string) falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unknown tag families are treated as nouns.
    return wordnet.NOUN

In [14]:
# Negation words: they are kept when stop words are removed and are merged
# with the following word by negate_sequence.
NEGATION_SET = {"not", "no"}

In [15]:
def negate_sequence(word_list):
    """Attach each negation word to the word that follows it.

    Every token found in ``NEGATION_SET`` (``no`` or ``not``) is dropped and
    the token right after it is replaced by ``"not_<token>"``. The token that
    follows a negation is consumed as-is, even when it is itself a negation
    word (``["not", "not", "old"]`` gives ``["not_not", "old"]``). A negation
    word in last position has nothing to attach to and is kept unchanged.

    The scan is iterative, so arbitrarily long token lists are handled without
    hitting Python's recursion limit.

    Parameters
    ----------
    word_list : sequence of str
        Tokens of one text, in order.

    Returns
    -------
    list of str
        A new list; ``word_list`` is not modified.
    """
    result = []
    index = 0
    last_index = len(word_list) - 1
    # Stop one token before the end: a negation needs a following word.
    while index < last_index:
        if word_list[index] in NEGATION_SET:
            result.append(f"not_{word_list[index + 1]}")
            index += 2
        else:
            result.append(word_list[index])
            index += 1
    # At most one token is left here (none if the last pair was merged).
    result.extend(word_list[index:])
    return result

In [16]:
# Quick check on a sentence containing a double negation: the first "not" of
# the pair consumes the second one, giving 'not_not' followed by a plain 'old'.
word_list = ["I", "am", "not", "young", "but", "not", "not", "old", "either"]
negate_sequence(word_list)

['I', 'am', 'not_young', 'but', 'not_not', 'old', 'either']

In [17]:
def text_preprocessing(serie, stop_words, lemmatization, negation):
    """
    Apply the whole preprocessing pipeline to a pandas Series of texts.

    Steps always applied, in this order: lowercasing, replacement of line
    breaks by a space, removal of @mentions, removal of http(s) URLs,
    expansion of contractions (i'm -> i am), replacement of every character
    that is not an ASCII letter by a space, tokenization.

    Optional steps, applied in this order when the matching flag is true:
    stop-word removal, lemmatization, negation linking. The tokens are finally
    joined back with single spaces.

    Parameters
    ----------
    serie : pandas.Series of str
        Raw texts.
    stop_words : bool
        Remove English stop words, except those in ``NEGATION_SET``.
    lemmatization : bool
        POS-tag the tokens and replace each one by its WordNet lemma
        (feet -> foot).
    negation : bool
        Merge each negation word with the next token (see ``negate_sequence``).

    Returns
    -------
    pandas.Series of str
        Cleaned texts, same index as ``serie``. A text may end up empty.
    """
    # Only \r and \n are line breaks; a '|' inside the character class would
    # be matched literally and would split @mentions/URLs containing a pipe
    # before they get removed.
    newline_pattern = re.compile(r'[\r\n]+')
    mention_pattern = re.compile(r'@\S+')
    url_pattern = re.compile(r'https?://\S+')
    non_letter_pattern = re.compile(r'[^a-zA-Z]')

    # lowercase
    serie = serie.map(lambda x: x.lower())

    # line breaks and carriage returns become a single space
    serie = serie.map(lambda x: newline_pattern.sub(' ', x))

    # drop @mentions
    serie = serie.map(lambda x: mention_pattern.sub('', x))

    # drop URLs
    serie = serie.map(lambda x: url_pattern.sub('', x))

    # expand contractions (i'm -> i am); lowercase again after the expansion
    serie = serie.map(lambda x: contractions.fix(x).lower())

    # anything that is not an ASCII letter (digits, '#', punctuation, ...)
    # becomes a space
    serie = serie.map(lambda x: non_letter_pattern.sub(' ', x))

    # tokenization
    serie = serie.map(word_tokenize)

    if stop_words:
        # stop-word removal (i like reading, so i read -> like reading read);
        # negation words are kept. A separate name is used so the boolean
        # parameter is not overwritten by the set.
        stop_word_set = set(stopwords.words('english')).difference(NEGATION_SET)
        serie = serie.map(lambda x: [word for word in x if word not in stop_word_set])

    if lemmatization:
        # NOTE(review): when stop_words is true the POS tagger only sees the
        # tokens that survived stop-word removal, not the full sentence.
        wordnet_lemmatizer = WordNetLemmatizer()
        serie = serie.map(nltk.tag.pos_tag)
        serie = serie.map(
            lambda x: [
                wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(pos_tag))
                for (word, pos_tag) in x
            ]
        )

    if negation:
        serie = serie.map(negate_sequence)

    # back from token lists to plain strings
    serie = serie.map(' '.join)

    return serie

In [18]:
# Run one sample sentence through the eight combinations of the three options
# and print each result.
text = "i don't like it. this was aweful. this movie should definitely not be seen by children."
for stop_words in (False, True):
    for lemmatization in (False, True):
        for negation in (False, True):
            print(
                text_preprocessing(
                    pd.Series([text]),
                    stop_words=stop_words,
                    lemmatization=lemmatization,
                    negation=negation,
                )
            )

0    i do not like it this was aweful this movie should definitely not be seen by children
dtype: object
0    i do not_like it this was aweful this movie should definitely not_be seen by children
dtype: object
0    i do not like it this be aweful this movie should definitely not be see by child
dtype: object
0    i do not_like it this be aweful this movie should definitely not_be see by child
dtype: object
0    not like aweful movie definitely not seen children
dtype: object
0    not_like aweful movie definitely not_seen children
dtype: object
0    not like aweful movie definitely not see child
dtype: object
0    not_like aweful movie definitely not_see child
dtype: object


# Récupération de la base Sentiment140

In [19]:
# Address of the zip archive to download.
url = 'http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip'
# Base name used for the downloaded archive and for its extraction folder.
destname = 'sentiment140'
# Directory that receives the archive and the extracted files.
temporary_location = "temp"

def download_unzip(url, dirname=tempfile.gettempdir(), destname="file"):
    """Download a zip archive and extract it.

    The archive is saved as ``<dirname>/<destname>.zip`` and its content is
    extracted into ``<dirname>/<destname>/``. ``dirname`` is created when it
    does not exist.

    Parameters
    ----------
    url : str
        Address of the zip archive.
    dirname : str
        Working directory; defaults to the system temporary directory
        (evaluated once, when the function is defined).
    destname : str
        Base name of the saved archive and of the extraction folder.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    zipfile.BadZipFile
        If the downloaded content is not a valid zip archive.
    """
    response = requests.get(url)
    # Fail on HTTP errors instead of saving an error page as a ".zip".
    response.raise_for_status()

    os.makedirs(dirname, exist_ok=True)
    archive_path = os.path.join(dirname, destname + '.zip')

    # The context manager guarantees the archive is flushed and closed before
    # it is reopened for extraction.
    with open(archive_path, 'wb') as archive:
        archive.write(response.content)

    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(os.path.join(dirname, destname))

In [20]:
# Save the archive under temporary_location and extract it into
# temporary_location/destname.
download_unzip(url, dirname=temporary_location, destname=destname)

In [21]:
# Names given to the six columns of the training CSV when it is loaded.
columns = ['sentiment', 'id', 'date', 'query_string', 'user', 'text']

# Path of the training CSV inside the extraction folder.
trainfile = os.path.join(
    temporary_location,
    destname,
    "training.1600000.processed.noemoticon.csv",
)

In [22]:
# The file is read without a header row (header=None), so the column names are
# supplied explicitly; it is decoded as latin-1.
df = pd.read_csv(trainfile, header=None, names=columns, encoding='latin-1')

# Preview of the first rows.
df.head(50)

Unnamed: 0,sentiment,id,date,query_string,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,"@LOLTrish hey long time no see! Yes.. Rains a bit ,only a bit LOL , I'm fine thanks , how's you ?"
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


# Preprocessing de la base Sentiment 140

Ce code n'a été (et ne doit être) exécuté qu'une seule fois, pour construire et sauvegarder les différents états de la base 
Sentiment 140, selon que les fonctions stop_words, lemmatization et negation lui ont été appliquées ou non.

Si toutefois il fallait exécuter à nouveau ce code, il faudrait alors faire attention à bien spécifier l'endroit où les fichiers créés doivent être sauvegardés, en modifiant la variable path.

In [26]:
# Directory where the generated files are written. It is machine-specific:
# change it before running this cell.
path = "C:/Users/Utilisateur/Desktop/ENSAE/2A/Projet Python/Python_2A/Data/Sentiment140"

# Check the destination before starting: each preprocessing run takes minutes
# to hours, and a missing directory would otherwise only be reported by
# to_pickle once the first run is finished.
if not os.path.isdir(path):
    raise FileNotFoundError(f"Output directory does not exist: {path}")

# Build and save one version of the dataset per combination of the three
# preprocessing options (8 files in total).
for stop_words in [False, True]:
    for lemmatization in [False, True]:
        for negation in [False, True]:
            # Work on a copy so that every combination starts from the raw texts.
            dfcopy = df.copy()
            a = time.time()  # start time of this preprocessing run
            dfcopy['text'] = text_preprocessing(dfcopy['text'], stop_words=stop_words, lemmatization=lemmatization, negation=negation)
            # Drop the rows whose text is empty after preprocessing.
            dfcopy = dfcopy[dfcopy['text'] != '']

            print(f"stop_words: {stop_words}, lemmatization: {lemmatization}, negation: {negation}, time: {time.time() - a}")
            display(dfcopy[['text']].head(10))

            # The file name records which options were applied:
            # train[_stop][_lemm][_neg]
            file = "train"
            if stop_words:
                file += "_stop"
            if lemmatization:
                file += "_lemm"
            if negation:
                file += "_neg"

            # Pickle the DataFrame into the output directory.
            dfcopy.to_pickle(os.path.join(path, file + ".bz2"))

stop_words: False, lemmatization: False, negation: False, time: 264.8257255554199


Unnamed: 0,text
0,awww that is a bummer you shoulda got david carr of third day to do it d
1,is upset that he can not update his facebook by texting it and might cry as a result school today also blah
2,i dived many times for the ball managed to save the rest go out of bounds
3,my whole body feels itchy and like its on fire
4,no it is not behaving at all i am mad why am i here because i can not see you all over there
5,not the whole crew
6,need a hug
7,hey long time no see yes rains a bit only a bit lol i am fine thanks how is you
8,nope they did not have it
9,que me muera


stop_words: False, lemmatization: False, negation: True, time: 278.1080057621002


Unnamed: 0,text
0,awww that is a bummer you shoulda got david carr of third day to do it d
1,is upset that he can not_update his facebook by texting it and might cry as a result school today also blah
2,i dived many times for the ball managed to save the rest go out of bounds
3,my whole body feels itchy and like its on fire
4,not_it is not_behaving at all i am mad why am i here because i can not_see you all over there
5,not_the whole crew
6,need a hug
7,hey long time not_see yes rains a bit only a bit lol i am fine thanks how is you
8,nope they did not_have it
9,que me muera


stop_words: False, lemmatization: True, negation: False, time: 24471.20914697647


Unnamed: 0,text
0,awww that be a bummer you shoulda get david carr of third day to do it d
1,be upset that he can not update his facebook by texting it and might cry a a result school today also blah
2,i dive many time for the ball manage to save the rest go out of bound
3,my whole body feel itchy and like it on fire
4,no it be not behave at all i be mad why be i here because i can not see you all over there
5,not the whole crew
6,need a hug
7,hey long time no see yes rain a bit only a bit lol i be fine thanks how be you
8,nope they do not have it
9,que me muera


stop_words: False, lemmatization: True, negation: True, time: 3187.194974422455


Unnamed: 0,text
0,awww that be a bummer you shoulda get david carr of third day to do it d
1,be upset that he can not_update his facebook by texting it and might cry a a result school today also blah
2,i dive many time for the ball manage to save the rest go out of bound
3,my whole body feel itchy and like it on fire
4,not_it be not_behave at all i be mad why be i here because i can not_see you all over there
5,not_the whole crew
6,need a hug
7,hey long time not_see yes rain a bit only a bit lol i be fine thanks how be you
8,nope they do not_have it
9,que me muera


stop_words: True, lemmatization: False, negation: False, time: 295.50067806243896


Unnamed: 0,text
0,awww bummer shoulda got david carr third day
1,upset not update facebook texting might cry result school today also blah
2,dived many times ball managed save rest go bounds
3,whole body feels itchy like fire
4,no not behaving mad not see
5,not whole crew
6,need hug
7,hey long time no see yes rains bit bit lol fine thanks
8,nope not
9,que muera


stop_words: True, lemmatization: False, negation: True, time: 298.23095655441284


Unnamed: 0,text
0,awww bummer shoulda got david carr third day
1,upset not_update facebook texting might cry result school today also blah
2,dived many times ball managed save rest go bounds
3,whole body feels itchy like fire
4,not_not behaving mad not_see
5,not_whole crew
6,need hug
7,hey long time not_see yes rains bit bit lol fine thanks
8,nope not
9,que muera


stop_words: True, lemmatization: True, negation: False, time: 2778.2597138881683


Unnamed: 0,text
0,awww bummer shoulda get david carr third day
1,upset not update facebook texting might cry result school today also blah
2,dive many time ball manage save rest go bound
3,whole body feel itchy like fire
4,no not behave mad not see
5,not whole crew
6,need hug
7,hey long time no see yes rain bite bit lol fine thanks
8,nope not
9,que muera


stop_words: True, lemmatization: True, negation: True, time: 2821.2929928302765


Unnamed: 0,text
0,awww bummer shoulda get david carr third day
1,upset not_update facebook texting might cry result school today also blah
2,dive many time ball manage save rest go bound
3,whole body feel itchy like fire
4,not_not behave mad not_see
5,not_whole crew
6,need hug
7,hey long time not_see yes rain bite bit lol fine thanks
8,nope not
9,que muera
