# P7 : Détectez les Bad Buzz grâce au Deep Learning

## Traitement du contenu des tweets (pour modèle word2vec)

In [1]:
import os
import pandas as pd
df_dep = pd.read_csv('P7_03_fichiercsv_tweet.csv')
df_dep.head()

Unnamed: 0,label,date_du_poste,utilisateur,tweet
0,0,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...
1,0,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,my whole body feels itchy and like its on fire
3,0,Mon Apr 06 22:19:57 PDT 2009,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,Mon Apr 06 22:20:00 PDT 2009,joy_wolf,@Kwesidei not the whole crew


In [2]:
df_dep.label.value_counts()

4    800000
0    799999
Name: label, dtype: int64

In [3]:
df_dep['polarite'] = df_dep['label'].map({
    4 : 'positive',
    0 : 'negative'
})

In [4]:
df_dep.polarite.value_counts()

positive    800000
negative    799999
Name: polarite, dtype: int64

In [5]:
# Extrait des données
n_sample = 20000
df_neg= df_dep[df_dep['polarite'] == 'negative'].sample(n=n_sample)
df_pos = df_dep[df_dep['polarite'] == 'positive'].sample(n=n_sample)

data = pd.concat([df_neg, df_pos], ignore_index=True)

In [6]:
data.polarite.value_counts()

negative    20000
positive    20000
Name: polarite, dtype: int64

## Nettoyage des tweets

In [7]:
#*****************************************
# Préparation des librairie et des outils
# pour le nettoyage des tweets
#*****************************************

# Expressions régulières
import re

# Pontuations
import string
ponctuations = list(string.punctuation)
print(ponctuations)

# Tokénisation
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Lemmatisation
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()

# Charger les stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
mots_vides = stopwords.words('english')
print('\n')
print(mots_vides)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'wh

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JK253\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JK253\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JK253\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Nettoyer le contenu des tweets

In [8]:
# fonction pour nettoyage de chaque document
# tweet = corps du tweet = document
# ponctuations : liste des ponctuations
# stopwords : liste des stopwords à retirer
# lem : fonction pour la lemmatisation des termes

def clean_tweet(tweet, ponctuations, stopwords, lem):
    # Harmonisation de la casse
    temp = tweet.lower()
    # retier les contractions en anglais
    temp = re.sub("'", "", temp)
    # retrait des @
    temp = re.sub("@[A-Za-z0-9_]+", "", temp)
    # retrait des #
    temp = re.sub("#[A-Za-z0-9_]+", "", temp)
    # retrait des liens web (http et https)
    temp = re.sub(r'http\S+','', temp)
    # retrait des ponctuations
    temp = "".join([char for char in list(temp) if not (char in ponctuations)])
    # retrait des nombres
    temp = re.sub("[0-9]", "", temp)
    # tokénisation
    temp = word_tokenize(temp)
    # lemmatisation des termes
    temp = [lem.lemmatize(mot) for mot in temp]
    # retrait des stopwords
    temp = [mot for mot in temp if not mot in stopwords]
    # retirer les tokens de moins de 3 caractères
    temp = [mot for mot in temp if len(mot) >= 3]
    return temp

In [9]:
# Appliquer le nettoyage au corpus
corpus = list(data.tweet)
corpus = [clean_tweet(doc, ponctuations, mots_vides, lem) for doc in corpus]

In [10]:
corpus[0]

['nooo',
 'theyre',
 'together',
 'kstew',
 'boyf',
 'like',
 'year',
 'sth',
 'man',
 'like',
 'see',
 'together',
 'though']

In [11]:
# Nouvelle dataframe
df_1 = pd.DataFrame({'label': data.polarite ,'tweet': corpus})
print("avant:.........")
data.head(10)

avant:.........


Unnamed: 0,label,date_du_poste,utilisateur,tweet,polarite
0,0,Sat Jun 20 23:20:52 PDT 2009,hmtangx,@ali_mwahxx nooo they're not together! kstew h...,negative
1,0,Sun May 31 10:49:22 PDT 2009,semipenguin,http://twitpic.com/6cflk - People would rather...,negative
2,0,Fri Jun 19 04:19:37 PDT 2009,merlinc,Yurgh. My code smells really badly today. Defi...,negative
3,0,Wed Jun 17 08:53:53 PDT 2009,sleepingqueen,@minnaeii thats very true na! people cant come...,negative
4,0,Mon Jun 22 18:58:21 PDT 2009,Vaes_mama,Has a bad headache...hasn't felt right in two ...,negative
5,0,Thu Jun 18 22:04:29 PDT 2009,tRAS,Day turning out to be disastrous. Lost bid in ...,negative
6,0,Sun May 31 19:12:14 PDT 2009,MrsHardy,Where is the preview,negative
7,0,Mon May 18 04:25:18 PDT 2009,BabyVanessa093,I'm siiick Well at least no school today xD,negative
8,0,Sat Apr 18 08:56:38 PDT 2009,JasonpPDX,Saturday Class is no good,negative
9,0,Mon Jun 15 11:38:30 PDT 2009,originalsteven,Back in Paisley,negative


In [12]:
print("après:.........")
df_1.head(10)

après:.........


Unnamed: 0,label,tweet
0,negative,"[nooo, theyre, together, kstew, boyf, like, ye..."
1,negative,"[people, would, rather, watch, death, wish, na..."
2,negative,"[yurgh, code, smell, really, badly, today, def..."
3,negative,"[thats, true, people, cant, come, peace]"
4,negative,"[bad, headachehasnt, felt, right, two, day, fi..."
5,negative,"[day, turning, disastrous, lost, bid, ebay]"
6,negative,[preview]
7,negative,"[siiick, well, least, school, today]"
8,negative,"[saturday, class, good]"
9,negative,"[back, paisley]"


In [13]:
# des tweets vides après nettoyage?
print(df_1.loc[df_1.tweet==""].shape[0])

0


In [14]:
# retrait des tweets correspondants
df_ok = df_1.loc[df_1.tweet != ""]
print(df_ok.shape)

(40000, 2)


## MODELE WOR2VEC

## Charger une représentation pré-entraînée des termes

In [15]:
path = '../P7/'

In [16]:
#chargement
import gensim
import gensim.models.keyedvectors as word2vec
trained = word2vec.KeyedVectors.load_word2vec_format(path+"enwiki_20180420_100d.txt.bz2",binary=False,unicode_errors='ignore')

In [17]:
#dimension
print(trained.vectors.shape)

(4530030, 100)


In [18]:
#coordonnées de mcenroe(joueur de tenis)
print(trained['mcenroe'])

[ 0.5573  0.4365  0.1881  0.3147  0.6119  0.2175  0.6588  0.2651  0.4745
 -0.5842  0.368   0.7477  0.3323 -1.0548 -0.1766 -0.2134 -0.0887  0.1165
  0.4452  0.7476  0.1865  0.1122 -0.6044 -0.6743 -1.1116 -0.4325  0.0572
 -0.2212  0.2282  1.3615  1.151  -0.1707  0.0887 -0.0732  0.4425 -0.2659
 -0.4304  0.6612  0.4771  0.12    0.4184  0.6925 -0.5642  0.1899  0.3655
  0.4986 -0.2736 -0.2063 -0.332   0.1967  0.8136  0.1608 -0.337  -0.5333
  0.2754 -0.0651 -0.2192  1.0819  0.3567 -0.1346  0.1839 -0.7201 -0.1903
  0.3891  1.6501 -0.0998 -0.1689 -0.4305 -0.5942  0.0181  0.2077 -0.7044
 -0.2112 -0.3843 -1.4317  0.002   0.0768 -0.5053 -0.6565 -0.3063  0.1394
 -0.1958 -0.0881 -0.6662 -0.0591  0.1755  0.3093  0.2115  0.3597 -0.2962
  0.4297 -0.0404  0.3238  0.4451  0.8552 -0.4519 -0.3047  0.0202  0.2848
 -0.1223]


In [19]:
#similarité avec mcenroe(joueur de tennis)
print(trained.most_similar(['mcenroe']))

[('ENTITY/John_McEnroe', 0.8751851916313171), ('gerulaitis', 0.8309396505355835), ('wilander', 0.8211947679519653), ('ENTITY/Wojciech_Fibak', 0.8087114691734314), ('kafelnikov', 0.8011326193809509), ('ENTITY/Stefan_Edberg', 0.8009007573127747), ('lendl', 0.8008507490158081), ('sampras', 0.7997835874557495), ('ENTITY/Peter_Fleming_(tennis)', 0.7964615225791931), ('ENTITY/Jim_Grabb', 0.7897493839263916)]


In [20]:
#l'exemple emblématique de word2vec
print(trained.most_similar(positive=['king','woman'],negative=['man']))

[('queen', 0.8306491374969482), ('monarch', 0.7416261434555054), ('ENTITY/Queen_consort', 0.7348717451095581), ('laungshe', 0.7347309589385986), ('regnant', 0.7243735194206238), ('chelna', 0.7236213684082031), ('consort', 0.720160722732544), ('indlovukati', 0.7181541919708252), ('kamamalu', 0.7178552150726318), ('indlovukazi', 0.714848518371582)]


## Coordonnées des documents à partir des termes

Représentation d'un document : moyenne des vecteurs des termes qui le composent.

In [21]:
#librairie numpy
import numpy

#fonction pour transformer un document en vecteur
#à partir des tokens qui le composent
#entrée : doc à traiter
#         modèle préentrainé
#sortie : vecteur représentant le document
def my_doc_2_vec(doc,trained):
    #dimension de représentation
    p = trained.vectors.shape[1]
    #initialiser le vecteur
    vec = numpy.zeros(p)
    #nombre de tokens trouvés
    nb = 0
    #traitement de chaque token du document
    for tk in doc:
        #ne traiter que les tokens reconnus
        try:
            values = trained[tk]
            vec = vec + values
            nb = nb + 1.0
        except:
            pass
    #faire la moyenne des valeurs
    #uniquement si on a trové des tokens reconnus bien sûr
    if (nb > 0.0):
        vec = vec/nb
    #renvoyer le vecteur
    #si aucun token trouvé, on a un vecteur de valeurs nulles
    return vec

In [22]:
#traiter les documents du corpus corpus
docsVec = list()
#pour chaque document du corpus nettoyé
for doc in df_ok['tweet']:
    #calcul de son vecteur
    vec = my_doc_2_vec(doc,trained)
    #ajouter dans la liste
    docsVec.append(vec)
#transformer en matrice numpy
matVec = numpy.array(docsVec)
print(matVec.shape)

(40000, 100)


In [23]:
#transformer en data frame
df = pd.DataFrame(matVec,columns=["v"+str(i+1) for i in range(matVec.shape[1])])
df.head()

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,...,v91,v92,v93,v94,v95,v96,v97,v98,v99,v100
0,0.165436,0.239482,0.227591,0.059136,-0.088518,-0.198773,0.136136,0.154564,-0.170955,-0.266309,...,0.045909,0.246582,0.218609,-0.064755,-0.060573,-0.297136,0.115518,0.303364,-0.395736,0.089327
1,0.065757,0.414686,-0.008686,0.255243,-0.008914,-0.310071,0.269971,0.158771,0.101829,-0.003029,...,0.169314,0.249471,0.2202,-0.203343,0.205729,-0.526671,0.035886,0.333843,-0.307271,0.055514
2,0.148278,0.328567,0.040078,0.066744,-0.007978,-0.202856,0.140478,-0.029811,-0.054144,-0.129011,...,-0.046856,0.198922,0.241233,-0.060744,0.233622,-0.289878,0.008967,0.331367,-0.3961,-0.037822
3,0.119733,0.337883,0.209933,0.1741,-0.227033,-0.17605,0.2725,-0.02225,-0.175967,0.217267,...,-0.051933,0.41165,0.320783,-0.10265,0.105333,-0.282683,0.142717,0.144983,-0.406317,0.0613
4,0.102327,0.495418,0.105991,0.052755,-0.0911,-0.122655,0.006882,0.083227,0.053082,-0.185927,...,-0.001836,0.186536,0.352745,-0.090218,0.047673,-0.274909,0.202773,0.320991,-0.369436,0.052345


In [24]:
#ajouter la classe
df['label'] = df_ok.label
df.head()

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,...,v92,v93,v94,v95,v96,v97,v98,v99,v100,label
0,0.165436,0.239482,0.227591,0.059136,-0.088518,-0.198773,0.136136,0.154564,-0.170955,-0.266309,...,0.246582,0.218609,-0.064755,-0.060573,-0.297136,0.115518,0.303364,-0.395736,0.089327,negative
1,0.065757,0.414686,-0.008686,0.255243,-0.008914,-0.310071,0.269971,0.158771,0.101829,-0.003029,...,0.249471,0.2202,-0.203343,0.205729,-0.526671,0.035886,0.333843,-0.307271,0.055514,negative
2,0.148278,0.328567,0.040078,0.066744,-0.007978,-0.202856,0.140478,-0.029811,-0.054144,-0.129011,...,0.198922,0.241233,-0.060744,0.233622,-0.289878,0.008967,0.331367,-0.3961,-0.037822,negative
3,0.119733,0.337883,0.209933,0.1741,-0.227033,-0.17605,0.2725,-0.02225,-0.175967,0.217267,...,0.41165,0.320783,-0.10265,0.105333,-0.282683,0.142717,0.144983,-0.406317,0.0613,negative
4,0.102327,0.495418,0.105991,0.052755,-0.0911,-0.122655,0.006882,0.083227,0.053082,-0.185927,...,0.186536,0.352745,-0.090218,0.047673,-0.274909,0.202773,0.320991,-0.369436,0.052345,negative


## Prédiction et évaluation en test

In [25]:
#partition apprentissage test
from sklearn.model_selection import train_test_split
dfTrain, dfTest = train_test_split(df,train_size=0.7,stratify=df.label,random_state=0)
print(dfTrain.shape)
print(dfTest.shape)

(28000, 101)
(12000, 101)


Je mets un échantillonage stratifié pour avoir les mêmes proportion de classe

In [26]:
#SVM avec un noyau RBF par défaut
from sklearn.svm import SVC
clf = SVC(random_state=0)
clf.fit(dfTrain[dfTrain.columns[:-1]],dfTrain.label)

SVC(random_state=0)

In [27]:
#prédiction en test
pred = clf.predict(dfTest[dfTest.columns[:-1]])
print(pred.shape)

(12000,)


In [28]:
#évaluation des performances
from sklearn import metrics
print(metrics.classification_report(dfTest.label,pred))

              precision    recall  f1-score   support

    negative       0.70      0.70      0.70      6000
    positive       0.70      0.70      0.70      6000

    accuracy                           0.70     12000
   macro avg       0.70      0.70      0.70     12000
weighted avg       0.70      0.70      0.70     12000



## Déploiement sur un document supplémentaire

In [29]:
#document à traiter
blog_avion = "My next plane ticket has just been booked! I fly in March to the Philippines. I am currently in the process of preparing for my next trip: what are the places I would like to discover, which itinerary to choose and in which cities to sleep, which domestic flights I will book to move between the islands etc. A trip by plane is an action that is anticipated and prepared a minimum. We can also face hazards, and it is important to know how to react. I indicate in this article various and varied tips on the theme of air travel. During the flight, the time may seem long, especially depending on the duration of your trip. Take something to take care of, and what not to be embarrassed (Quies balls can be useful). Also, feel free to get up from time to time if you need to stretch your legs. Be aware that the toilet is unavailable during take-off and landing, take precautions. A tip, always carry a photocopy of your passport, it can help in case of loss or theft that would occur during the trip. We also often talk about flight delay or cancellation. The wait seems endless in these cases, and travelers too often lack information. It is therefore necessary to try to take his evil in patience. Delayed plane, cancelled flight,companies like Indemnflight can help you get compensated. Note that the delay must be more than 3 hours for compensation to be possible. The advantage of going through Indemnflight is that experts take care of the administrative procedures for you, and they know the rights of passengers well. Note that they are paid only when the compensation is validated. You therefore delegate the management of the request, and then pay them in return a part of the compensation. I personally have only 2h30 maximum of delay to my credit, and have never tested the claim. Finally, the loss of luggage is also a thorny subject! If you do not find it at the exit of the plane, go to a counter available at the airport to report it and find a solution. The price of a plane ticket varies a lot depending on many factors. The level of demand, the time of year, the day and time of booking and the booking time before the departure date are very important elements to take into account if you have in mind to make a good deal when booking your flight. To learn more, I advise you to read this article from my travel blog: How to find a cheap plane ticket? Follow these 11 tips! When booking your flight, check that the formalities of entry into the desired country are feasible on time (example: a visa application must be made in advance), and that your identity card and/or passport will still be valid. Note that for many destinations, the passport must often be valid for a period of 6 months from the date of arrival. It is advisable to book flights well in advance. You rarely get a good deal when you buy a plane ticket a few days before departure. And in this case we take the risk of no longer having a place. In the case of my next trip to the Philippines, it is for these reasons that I will soon book the 2 domestic flights. Especially since I am going to travel with the national airline, and I have the impression that the weight of the luggage allowed is less than the 23 kg allowed on my international flight. I must therefore take this into account. Finally, it is necessary to pay attention to the information provided during the reservation. An error on the last name, or a time on the dates can be expensive, there is in this case a risk of paying extra to make this type of change, or even having to cancel your trip. First of all, find out a little in advance about how you will reach the airport (car + parking, bus, train...). When packing your suitcase, try to think of everything you will need on the spot. Also be aware that some products are prohibited in cabin baggage (liquid products greater than 100ml, knives etc.). Check the airline's website for an exhaustive list, and avoid having certain products confiscated. It is advisable to arrive 1h30 to 2h in advance for short or medium haul flight. 2h to 3h are recommended for international flights. This delay varies according to several elements: is your plane ticket ready or are you going to do the formalities at the airport? Do you have luggage to drop off at the counter? How big is the airport and in which area will you board? To help you and go further, discover the guide: practical tips for flying the first time: the steps to follow at the airport. Do you have any other tips to share when flying? Or adventures to share? Feel free to indicate them in the comments."

In [30]:
# nettoyage
my_clean = clean_tweet(blog_avion, ponctuations, mots_vides,lem)
print(my_clean)

['next', 'plane', 'ticket', 'booked', 'fly', 'march', 'philippine', 'currently', 'process', 'preparing', 'next', 'trip', 'place', 'would', 'like', 'discover', 'itinerary', 'choose', 'city', 'sleep', 'domestic', 'flight', 'book', 'move', 'island', 'etc', 'trip', 'plane', 'action', 'anticipated', 'prepared', 'minimum', 'also', 'face', 'hazard', 'important', 'know', 'react', 'indicate', 'article', 'various', 'varied', 'tip', 'theme', 'air', 'travel', 'flight', 'time', 'may', 'seem', 'long', 'especially', 'depending', 'duration', 'trip', 'take', 'something', 'take', 'care', 'embarrassed', 'quies', 'ball', 'useful', 'also', 'feel', 'free', 'get', 'time', 'time', 'need', 'stretch', 'leg', 'aware', 'toilet', 'unavailable', 'takeoff', 'landing', 'take', 'precaution', 'tip', 'always', 'carry', 'photocopy', 'passport', 'help', 'case', 'loss', 'theft', 'would', 'occur', 'trip', 'also', 'often', 'talk', 'flight', 'delay', 'cancellation', 'wait', 'seems', 'endless', 'case', 'traveler', 'often', 'la

### Transformation du document en vecteur et prédiction

In [31]:
#avec le modèle pré-entrainé
#et avec la fonction ci-dessus
my_vec = my_doc_2_vec(my_clean,trained)
print(my_vec)

[-0.0066924   0.24769519  0.01260582  0.09944025 -0.0945843  -0.12585595
  0.10553544  0.06743696  0.12733747 -0.19930025 -0.1025438  -0.31723949
  0.23384025 -0.11691544  0.16146203 -0.28021165 -0.07882709 -0.13041165
  0.03096835 -0.08232962  0.05536278  0.1366281  -0.21849342  0.15471544
 -0.28031291 -0.0930243  -0.11302886 -0.24987747  0.21124785  0.48269671
  0.34228278 -0.10007367  0.02435519  0.28576127 -0.07939013 -0.03959949
 -0.1009843   0.04338355  0.15884658 -0.03122405 -0.09728861 -0.01106532
 -0.23192152  0.14099443 -0.17758    -0.19375215 -0.04172987  0.1744562
 -0.25844582 -0.00165392  0.03838709  0.16964937  0.06252785  0.0953357
  0.07928051  0.10612152 -0.13520861  0.04745443  0.03035924  0.1231081
 -0.15309443 -0.20351089 -0.05071848  0.09582506  0.53716279 -0.29640557
 -0.21854937 -0.26774304 -0.1368038  -0.35458279 -0.05029798  0.18931063
 -0.08680304  0.30487038 -0.26326861 -0.03460506 -0.26305165  0.14672835
 -0.14239139 -0.14594759 -0.01068709 -0.36625392  0.23

In [32]:
#prédiction avec le SVM
pred_my_doc = clf.predict(my_vec.reshape(1,-1))

#le commentaire est de nature positive
print(pred_my_doc)

['negative']


## Sauvegarder le modèle word2vec

In [36]:
import joblib

In [37]:
# maintenant je l’enregistre dans un fichier
joblib.dump(clf, 'P7_01_modèlesurmesure_03_embedding_WOR2VEC.pkl')

['P7_01_modèlesurmesure_03_embedding_WOR2VEC.pkl']