# 1 - Fonctions de Preprocessing

L'installation du package "contractions" peut poser problème. En particulier, "!pip install contractions" n'a pas fonctionné ici (l'installation conjointe du package pyahocorasick posant problème).

La solution qui a fonctionné est la suivante : dans le terminal Anaconda Prompt, effectuer l'installation de pyahocorasick via "pip install pyahocorasick", puis installer contractions via "pip install contractions".


In [2]:
!pip install wordcloud

import os
import re
import tempfile
import time
import zipfile

#import contractions
import nltk
import pandas as pd
import requests
from wordcloud import WordCloud
from IPython import get_ipython
from IPython.display import display
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

pd.set_option('max_colwidth', 400)

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
def get_wordnet_pos(tag):
    """Fonction qui associe une classe à chaque mot"""
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [4]:
NEGATION_SET = {'no', 'not'}

In [5]:
def negate_sequence(word_list):
    """Fonction qui lie un mot négatif (no ou not) au mot auquel il s'applique"""
    if len(word_list) > 1:
        if word_list[0] in NEGATION_SET:
            return [f"not_{word_list[1]}"] + negate_sequence(word_list[2:])
        else:
            return [word_list[0]] + negate_sequence(word_list[1:])
    else:
        return word_list

In [6]:
word_list = ["I", "am", "not", "young", "but", "not", "not", "old", "either"]
negate_sequence(word_list)

['I', 'am', 'not_young', 'but', 'not_not', 'old', 'either']

In [7]:
def text_preprocessing(serie, stop_words, lemmatization, negation):
    """
    Fonction qui réalise toutes les opérations appliquées au texte au cours de la phase de preprocessing
    stop_words, lemmatization et negation sont des paramètres booléens, qui permettent de choisir si l'on 
    applique les fonctions pareillement nommées au texte.
    """
    
    # passage en minuscule
    serie = serie.map(lambda x: x.lower())   

    # suppression des sauts de lignes et des retours à la ligne
    serie = serie.map(lambda x: re.sub(r'[\r|\n|\r\n]+', ' ', x))

    # suppression des @tag
    serie = serie.map(lambda x: re.sub(r'@[\S]+', '', x))

    # suppression de l'URL
    serie = serie.map(lambda x: re.sub('https?://[\S]+', '', x))
    
    # réécriture des contractions ( I'm -> i am)
    serie = serie.map(lambda x: contractions.fix(x).lower())
    
    # suppression des hashtags et des nombres
    serie = serie.map(lambda x: re.sub("[^a-zA-Z]", " ", x))

    # tokenisation
    serie = serie.map(word_tokenize)

    if stop_words:        
        # suppression des mots vides ( i like reading, so i read -> like reading read)
        stop_words = set(stopwords.words('english')).difference(NEGATION_SET)
        serie = serie.map(lambda x: [word for word in x if word not in stop_words])
    
    if lemmatization:
        # lemmatisation, simplification des mots ( feet -> foot)    
        serie = serie.map(nltk.tag.pos_tag)
        serie = serie.map(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
        wordnet_lemmatizer = WordNetLemmatizer()
        serie = serie.map(lambda x: [wordnet_lemmatizer.lemmatize(word, tag) for (word, tag) in x])
    
    if negation:
        serie = serie.map(lambda x: negate_sequence(x))

    serie = serie.map(lambda x: ' '.join(word for word in x))

    return serie

In [8]:
text = "i don't like it. this was aweful. this movie should definitely not be seen by children."
for stop_words in [False, True]:
    for lemmatization in [False, True]:
        for negation in [False, True]:
            print(text_preprocessing(pd.Series([text]), stop_words=stop_words, lemmatization=lemmatization, negation=negation))

NameError: ignored

# 2 - Base Sentiment140

## 2.1 - Récupération de la base Sentiment140

In [9]:
url, destname = 'http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip', 'sentiment140'
temporary_location = "temp"

def download_unzip(url, dirname=tempfile.gettempdir(), destname="file"):
    myfile = requests.get(url)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    open(os.path.join(dirname, destname + '.zip'), 'wb').write(myfile.content)
    with zipfile.ZipFile(os.path.join(dirname, destname + '.zip'), 'r') as zip_ref:
        zip_ref.extractall(os.path.join(dirname, destname))

In [10]:
download_unzip(url, dirname=temporary_location, destname=destname)

In [11]:
trainfile = os.path.join(temporary_location, destname, "training.1600000.processed.noemoticon.csv")

columns = ['sentiment', 'id', 'date', 'query_string', 'user', 'text']

In [12]:
df = pd.read_csv(trainfile, header=None, names=columns, encoding='latin-1')

df.head(50)

Unnamed: 0,sentiment,id,date,query_string,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,"@LOLTrish hey long time no see! Yes.. Rains a bit ,only a bit LOL , I'm fine thanks , how's you ?"
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


# 2.2 - Preprocessing de la base Sentiment 140

Ce code n'a été (et ne doit être) éxécuté qu'une seule fois, pour construire et sauvegarder les différents états de la base 
Sentiment 140, selon que les fonctions stop_words, lemmatization et negation lui ont été appliquées ou non.

Si toutefois il fallait éxécuter à nouveau ce code, il faudrait alors faire attention à bien spécifier "l'endroit où les fichiers créés doivent être sauvegardés", en modifiant la variable path.

In [13]:
for stop_words in [False, True]:
    for lemmatization in [False, True]:
        for negation in [False, True]:
            dfcopy = df.copy()
            a = time.time()
            dfcopy.text = text_preprocessing(dfcopy.text, stop_words=stop_words, lemmatization=lemmatization, negation=negation)
            dfcopy = dfcopy[dfcopy.text != '']

            print(f"stop_words: {stop_words}, lemmatization: {lemmatization}, negation: {negation}, time: {time.time() - a}")
            display(dfcopy[['text']].head(10))

            file = "train"  #Le nom du fichier créé dépend des fonctions (stop_words, lemmatization, negation) qui ont été appliquées à la base Sentiment 140
            if stop_words:
                file += "_stop"
            if lemmatization:
                file += "_lemm"
            if negation:
                file += "_neg"
            
            path = "C:/Users/Utilisateur/Desktop/ENSAE/2A/Projet Python/Python_2A/data/Sentiment140"
            dfcopy.to_pickle(os.path.join(path, file+".bz2")) #Création du fichier à l'adresse souhaitée

NameError: ignored

# 3 - Base webscrapée

## 3.1 - Récupération de la base webscrapée

In [15]:
df = pd.read_pickle(os.path.join("data", "web", "web.csv"))
df

FileNotFoundError: ignored

# 3.2 - Visualisation avant préprocessing

In [14]:
tweets_df_dune = tweets_df[tweets_df['Film'] == 'dune']
tweets_df_space_jam = tweets_df[tweets_df['Film'] == 'space jam']

NameError: ignored

On réalise un wordcloud pour les tweets concernant le film Dune

In [None]:
text = tweets_df_dune.Text.values
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'white',
    stopwords = None).generate(str(text))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

On réalise un deuxième wordcloud, pour les tweets concernant le film Space Jam 2

In [None]:
text = tweets_df_space_jam.Text.values
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'white',
    stopwords = None).generate(str(text))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

## 3.2 - Préprocessing de la base websrapée

In [None]:
for stop_words in [False, True]:
    for lemmatization in [False, True]:
        for negation in [False, True]:
            dfcopy = df.copy()
            a = time.time()
            dfcopy.Text = text_preprocessing(dfcopy.Text, stop_words=stop_words, lemmatization=lemmatization, negation=negation)
            dfcopy = dfcopy[dfcopy.Text != '']

            print(f"stop_words: {stop_words}, lemmatization: {lemmatization}, negation: {negation}, time: {time.time() - a}")
            display(dfcopy[['Text']].head(10))
            text = dfcopy.Text.values
            wordcloud = WordCloud(
              width = 3000,
              height = 2000,
              background_color = 'white',
              stopwords = None).generate(str(text))
            fig = plt.figure(
              figsize = (40, 30),
              facecolor = 'k',
              edgecolor = 'k')
            plt.imshow(wordcloud, interpolation = 'bilinear')
            plt.axis('off')
            plt.tight_layout(pad=0)
            plt.show()

            file = "web"
            if stop_words:
                file += "_stop"
            if lemmatization:
                file += "_lemm"
            if negation:
                file += "_neg"
            dfcopy.to_pickle(os.path.join("data", "web", file + ".csv"))

stop_words: False, lemmatization: False, negation: False, time: 7.831490516662598


Unnamed: 0,Text
0,i want to see dune but the lack of muslim and mena actors when the novel supposedly references the middle east and islam is disappointing would have been a real good bit of representation for once especially in a big hollywood movie
1,i might be the only person that knows fuckall about dune but damn that trailer looks great and i love denis villenueve that dude has not made a bad movie that i have seen and i have seen almost all of them
2,when people try to claim dune is some higher level intelligent adult sci fi and i wonder if i watched the same movie dune
3,i never read dune but i am v glad you all that did are getting a good movie
4,the mix of styles is really strange the music is not matching the clip or the ambiance almost disgusting me also the casting is not that great i am going to for a movie with such ambition i have that feeling telling me it is going to be an okay movie and not more
5,this movie looks amazing can not wait to watch dunemovie
6,i guess with the dune movie coming out i might need to actually get around to reading dune
7,in honor of the dunetrailer release today dunemovie dune
8,the new trailer it hits some unexpected notes but how to bring it to a new generation of fans pink floyd can not wait for the release
9,king ilysm


stop_words: False, lemmatization: False, negation: True, time: 8.42314887046814


Unnamed: 0,Text
0,i want to see dune but the lack of muslim and mena actors when the novel supposedly references the middle east and islam is disappointing would have been a real good bit of representation for once especially in a big hollywood movie
1,i might be the only person that knows fuckall about dune but damn that trailer looks great and i love denis villenueve that dude has not_made a bad movie that i have seen and i have seen almost all of them
2,when people try to claim dune is some higher level intelligent adult sci fi and i wonder if i watched the same movie dune
3,i never read dune but i am v glad you all that did are getting a good movie
4,the mix of styles is really strange the music is not_matching the clip or the ambiance almost disgusting me also the casting is not_that great i am going to for a movie with such ambition i have that feeling telling me it is going to be an okay movie and not_more
5,this movie looks amazing can not_wait to watch dunemovie
6,i guess with the dune movie coming out i might need to actually get around to reading dune
7,in honor of the dunetrailer release today dunemovie dune
8,the new trailer it hits some unexpected notes but how to bring it to a new generation of fans pink floyd can not_wait for the release
9,king ilysm


stop_words: False, lemmatization: True, negation: False, time: 78.88456273078918


Unnamed: 0,Text
0,i want to see dune but the lack of muslim and mena actor when the novel supposedly reference the middle east and islam be disappoint would have be a real good bit of representation for once especially in a big hollywood movie
1,i might be the only person that know fuckall about dune but damn that trailer look great and i love denis villenueve that dude have not make a bad movie that i have see and i have see almost all of them
2,when people try to claim dune be some high level intelligent adult sci fi and i wonder if i watch the same movie dune
3,i never read dune but i be v glad you all that do be get a good movie
4,the mix of style be really strange the music be not match the clip or the ambiance almost disgust me also the casting be not that great i be go to for a movie with such ambition i have that feeling tell me it be go to be an okay movie and not more
5,this movie look amaze can not wait to watch dunemovie
6,i guess with the dune movie come out i might need to actually get around to read dune
7,in honor of the dunetrailer release today dunemovie dune
8,the new trailer it hit some unexpected note but how to bring it to a new generation of fan pink floyd can not wait for the release
9,king ilysm


stop_words: False, lemmatization: True, negation: True, time: 78.74864482879639


Unnamed: 0,Text
0,i want to see dune but the lack of muslim and mena actor when the novel supposedly reference the middle east and islam be disappoint would have be a real good bit of representation for once especially in a big hollywood movie
1,i might be the only person that know fuckall about dune but damn that trailer look great and i love denis villenueve that dude have not_make a bad movie that i have see and i have see almost all of them
2,when people try to claim dune be some high level intelligent adult sci fi and i wonder if i watch the same movie dune
3,i never read dune but i be v glad you all that do be get a good movie
4,the mix of style be really strange the music be not_match the clip or the ambiance almost disgust me also the casting be not_that great i be go to for a movie with such ambition i have that feeling tell me it be go to be an okay movie and not_more
5,this movie look amaze can not_wait to watch dunemovie
6,i guess with the dune movie come out i might need to actually get around to read dune
7,in honor of the dunetrailer release today dunemovie dune
8,the new trailer it hit some unexpected note but how to bring it to a new generation of fan pink floyd can not_wait for the release
9,king ilysm


stop_words: True, lemmatization: False, negation: False, time: 7.608641147613525


Unnamed: 0,Text
0,want see dune lack muslim mena actors novel supposedly references middle east islam disappointing would real good bit representation especially big hollywood movie
1,might person knows fuckall dune damn trailer looks great love denis villenueve dude not made bad movie seen seen almost
2,people try claim dune higher level intelligent adult sci fi wonder watched movie dune
3,never read dune v glad getting good movie
4,mix styles really strange music not matching clip ambiance almost disgusting also casting not great going movie ambition feeling telling going okay movie not
5,movie looks amazing not wait watch dunemovie
6,guess dune movie coming might need actually get around reading dune
7,honor dunetrailer release today dunemovie dune
8,new trailer hits unexpected notes bring new generation fans pink floyd not wait release
9,king ilysm


stop_words: True, lemmatization: False, negation: True, time: 8.20029592514038


Unnamed: 0,Text
0,want see dune lack muslim mena actors novel supposedly references middle east islam disappointing would real good bit representation especially big hollywood movie
1,might person knows fuckall dune damn trailer looks great love denis villenueve dude not_made bad movie seen seen almost
2,people try claim dune higher level intelligent adult sci fi wonder watched movie dune
3,never read dune v glad getting good movie
4,mix styles really strange music not_matching clip ambiance almost disgusting also casting not_great going movie ambition feeling telling going okay movie not
5,movie looks amazing not_wait watch dunemovie
6,guess dune movie coming might need actually get around reading dune
7,honor dunetrailer release today dunemovie dune
8,new trailer hits unexpected notes bring new generation fans pink floyd not_wait release
9,king ilysm


stop_words: True, lemmatization: True, negation: False, time: 61.89637470245361


Unnamed: 0,Text
0,want see dune lack muslim mena actor novel supposedly reference middle east islam disappoint would real good bit representation especially big hollywood movie
1,might person know fuckall dune damn trailer look great love denis villenueve dude not make bad movie see see almost
2,people try claim dune high level intelligent adult sci fi wonder watch movie dune
3,never read dune v glad get good movie
4,mix style really strange music not match clip ambiance almost disgust also cast not great go movie ambition feel tell go okay movie not
5,movie look amaze not wait watch dunemovie
6,guess dune movie come might need actually get around read dune
7,honor dunetrailer release today dunemovie dune
8,new trailer hit unexpected note bring new generation fan pink floyd not wait release
9,king ilysm


stop_words: True, lemmatization: True, negation: True, time: 64.32966113090515


Unnamed: 0,Text
0,want see dune lack muslim mena actor novel supposedly reference middle east islam disappoint would real good bit representation especially big hollywood movie
1,might person know fuckall dune damn trailer look great love denis villenueve dude not_make bad movie see see almost
2,people try claim dune high level intelligent adult sci fi wonder watch movie dune
3,never read dune v glad get good movie
4,mix style really strange music not_match clip ambiance almost disgust also cast not_great go movie ambition feel tell go okay movie not
5,movie look amaze not_wait watch dunemovie
6,guess dune movie come might need actually get around read dune
7,honor dunetrailer release today dunemovie dune
8,new trailer hit unexpected note bring new generation fan pink floyd not_wait release
9,king ilysm


La comparaison des différents wordclouds obtenus permet de mesurer visuellement l'efficacité du préprocessing et de prendre conscience des limites de certaines des méthodes appliquées.