In [1]:
import tensorflow as tf
# Report how many GPU and CPU devices TensorFlow detects on this machine.
for device_type in ('GPU', 'CPU'):
    print(f"Num {device_type}s Available: ", len(tf.config.list_physical_devices(device_type)))

Num GPUs Available:  0
Num CPUs Available:  1


## Modèle avancé

À la différence du notebook précédent, les deux étapes servant à faire la prédiction vont ici être remplacées :
- La traduction des mots en vecteurs (word embedding) ne sera plus assurée par un CountVectorizer (qui ne faisait que compter les mots dans la phrase) mais par un modèle déjà entraîné qui place chaque mot comme un point dans un espace de grande dimension (word2vec ou GloVe).
- L'étape de prédiction, qui était faite avec une régression logistique, sera assurée par un réseau de neurones placé en bout de chaîne, à la suite du modèle de word embedding.

## Context
This is the Sentiment140 dataset. It contains 1,600,000 tweets extracted using the Twitter API. The tweets have been annotated (0 = negative, 4 = positive) and can be used to detect sentiment.

In [13]:
import pandas as pd

# Load the raw tweets CSV; column names are passed explicitly through `names`.
raw_data = pd.read_csv(
    'data/training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    names=['target', 'ids', 'date', 'flag', 'user', 'text'],
)
raw_data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Content
It contains the following 6 fields:
1. target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive). Note that this training file only contains the values 0 and 4 (800,000 tweets each).
2. ids: The id of the tweet
3. date: the date of the tweet
4. flag: The query. If there is no query, then this value is NO_QUERY.
5. user: the user that tweeted
6. text: the text of the tweet

In [14]:
# Work on a deep copy so that raw_data itself is never modified.
data = raw_data.copy(deep=True)

# Binary label: True where the original polarity equals 4, False otherwise.
data['target'] = data['target'].eq(4)

# Cast every remaining column to str. Dates are kept as text for now and can
# be parsed into real datetimes later if that turns out to be useful.
for column_name in ('ids', 'date', 'flag', 'user', 'text'):
    data[column_name] = data[column_name].astype('str')

In [15]:
# Quick per-column overview: dtype, first rows and value frequencies.
for column_name, column in data.items():
    print(column_name, column.dtype)
    print(column.head())
    print(column.value_counts())
    print('\n')

target bool
0    False
1    False
2    False
3    False
4    False
Name: target, dtype: bool
False    800000
True     800000
Name: target, dtype: int64


ids object
0    1467810369
1    1467810672
2    1467810917
3    1467811184
4    1467811193
Name: ids, dtype: object
2190457769    2
1972193428    2
1989776729    2
1989776908    2
1564543229    2
             ..
2197311196    1
2197311146    1
2197310899    1
2197310477    1
2193602129    1
Name: ids, Length: 1598315, dtype: int64


date object
0    Mon Apr 06 22:19:45 PDT 2009
1    Mon Apr 06 22:19:49 PDT 2009
2    Mon Apr 06 22:19:53 PDT 2009
3    Mon Apr 06 22:19:57 PDT 2009
4    Mon Apr 06 22:19:57 PDT 2009
Name: date, dtype: object
Mon Jun 15 12:53:14 PDT 2009    20
Fri May 29 13:40:04 PDT 2009    17
Fri May 22 05:10:17 PDT 2009    17
Mon Jun 15 13:39:50 PDT 2009    17
Fri Jun 05 14:13:07 PDT 2009    16
                                ..
Sun Jun 07 12:36:07 PDT 2009     1
Sun Jun 07 12:36:04 PDT 2009     1
Sun Jun 07 12:36:03 PDT

In [16]:
# Import the corpus reader under an alias so that the `stopwords` list built
# below does not overwrite it (the cell can then be re-run safely).
from nltk.corpus import stopwords as nltk_stopwords

# Negation words (and their contractions) carry sentiment information, so they
# must be kept in the tweets instead of being filtered out as stop words.
NEGATION_WORDS = {
    't', 'against', 'no', 'nor', 'not', 'don', "don't", 'ain', 'aren', "aren't",
    'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't",
    'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't",
    'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
    "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
    'wouldn', "wouldn't",
}

english_stopwords = nltk_stopwords.words('english')
print(f"{len(english_stopwords)} stopwords in the original list")

# Filtering (rather than calling list.remove for each word) keeps the original
# order and does not raise if a negation word is absent from the installed
# NLTK stop-word list.
stopwords = [word for word in english_stopwords if word not in NEGATION_WORDS]
print(f"{len(stopwords)} stopwords after removing the negative words")

179 stopwords in the original list
137 stopwords after removing the negative words


In [22]:
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
import re

# TODO: reproduce the preprocessing script published with the GloVe Twitter vectors (https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb) so that our tokens match the ones the embeddings were trained on.

def preprocess(textdata):
    """Clean an iterable of raw tweets and return them as a list of strings.

    Each tweet goes through the following steps, in order:

    1. lower-casing;
    2. URLs (``http://``, ``https://`` or `` www.`` prefixes) become `` URL``;
    3. ``@mentions`` become `` USER``;
    4. every character that is not an ASCII letter or digit becomes a space;
    5. any character repeated three or more times in a row is cut to two;
    6. tokens found in the module-level ``stopwords`` collection are dropped
       and the remaining tokens are lemmatized with ``WordNetLemmatizer``.

    Args:
        textdata: iterable of ``str`` (for example a pandas Series of tweets).

    Returns:
        list[str]: one cleaned string per input tweet, in the same order.
        Tokens are separated by a single space and every non-empty result
        ends with a trailing space.
    """
    lemmatizer = WordNetLemmatizer()

    # Compile the patterns once instead of re-parsing them for every tweet.
    # Raw strings are used throughout so that `\s`, `\.` and `\1` reach the
    # regex engine untouched (a non-raw '\s' is an invalid string escape).
    url_re = re.compile(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)")
    user_re = re.compile(r"@[^\s]+")
    non_alnum_re = re.compile(r"[^a-zA-Z0-9]")
    repeat_re = re.compile(r"(.)\1\1+")

    # Snapshot the global stop-word collection as a set: membership tests are
    # performed once per token, so constant-time lookup matters here.
    stopword_set = set(stopwords)

    processedText = []
    for tweet in textdata:
        tweet = tweet.lower()

        # Placeholders are inserted AFTER lower-casing, so they stay upper-case.
        tweet = url_re.sub(' URL', tweet)
        tweet = user_re.sub(' USER', tweet)
        # Punctuation and any other non-alphanumeric character become spaces.
        tweet = non_alnum_re.sub(' ', tweet)
        # "sooooo" -> "soo": keep at most two consecutive identical characters.
        tweet = repeat_re.sub(r"\1\1", tweet)

        kept_words = [
            lemmatizer.lemmatize(word)
            for word in tweet.split()
            if word not in stopword_set
        ]
        # Each kept word is followed by one space (trailing space included),
        # which is the exact format the original implementation produced.
        processedText.append(''.join(word + ' ' for word in kept_words))

    return processedText

In [50]:
import time

corpus = data['text']

# Time the full cleaning pass over every tweet.
t = time.time()
X = preprocess(corpus)
elapsed_seconds = time.time() - t
print(f"Preprocessing took {elapsed_seconds} seconds")

Preprocessing took 107.09063744544983 seconds


In [51]:
from sklearn.model_selection import train_test_split

# Labels aligned with the cleaned tweets in X.
Y = data['target']

# Hold out 20% of the tweets; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

Now that the tweets are cleaned, we build a dictionary that maps every word to an integer index, and we replace each word in the tweets by its index.
Then we pad the resulting sequences so that they all have the same length.

In [53]:
# Fit a word -> integer-index vocabulary on the cleaned tweets.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X)
word_index = tokenizer.word_index

# One extra slot on top of the vocabulary: index 0 is never assigned to a word.
vocab_size = 1 + len(word_index)
print("Vocabulary Size :", vocab_size)

Vocabulary Size : 248879


In [54]:
MAX_SEQUENCE_LENGTH = 180


def _to_padded_ids(texts):
    """Convert cleaned tweets to integer sequences of length MAX_SEQUENCE_LENGTH."""
    return tf.keras.utils.pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_SEQUENCE_LENGTH)


# Replace the text splits by their padded integer encodings.
X_train = _to_padded_ids(X_train)
X_test = _to_padded_ids(X_test)

print("Training X Shape:",X_train.shape)
print("Testing X Shape:",X_test.shape)

Training X Shape: (1280000, 180)
Testing X Shape: (320000, 180)


In [55]:
from sklearn.model_selection import train_test_split
import time
# Word2vec
import gensim.downloader

# We could train our own embeddings, but to save time we load vectors that were
# already trained on 2B tweets (see https://github.com/RaRe-Technologies/gensim-data).
glove_vectors = gensim.downloader.load('glove-twitter-25') # Already 104MB
VECTORS_DIM = 25 # because twitter-25

# Sanity check on the loaded vectors. The query is run once and printed; the
# previous bare `most_similar` call was not the last expression of the cell,
# so its result was computed and then silently discarded.
print(f"Most similar to 'twitter': {glove_vectors.most_similar('twitter')}")

Most similar to 'twitter': [('facebook', 0.948005199432373), ('tweet', 0.9403423070907593), ('fb', 0.9342358708381653), ('instagram', 0.9104824066162109), ('chat', 0.8964964747428894), ('hashtag', 0.8885937333106995), ('tweets', 0.8878158330917358), ('tl', 0.8778461217880249), ('link', 0.8778210878372192), ('internet', 0.8753897547721863)]


On crée la liste des mots connus de notre tokenizer et on associe à chacun le vecteur trouvé dans le modèle que l'on vient de charger. La couche Embedding servira donc uniquement à associer les mots (leurs numéros) à leurs vecteurs.

In [62]:
import numpy as np

# Build a (vocab_size, VECTORS_DIM) lookup table: row i holds the pre-trained
# vector of the word whose tokenizer index is i.
embedding_matrix = np.zeros((vocab_size, VECTORS_DIM))
for token, row in word_index.items():
    if not glove_vectors.has_index_for(token):
        # Words unknown to the pre-trained model keep their all-zero row.
        continue
    embedding_matrix[row] = glove_vectors.get_vector(token)

In [70]:
# Frozen lookup layer initialised with the pre-trained vectors.
# The matrix is passed through `embeddings_initializer` (the documented way to
# load pre-trained embeddings) instead of the legacy `weights=[...]`
# constructor argument, which some Keras 3 releases reject as an unknown kwarg.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=VECTORS_DIM,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [82]:
# Re-check which devices TensorFlow sees before building the model.
for device_type in ('GPU', 'CPU'):
    print(f"Num {device_type}s Available: ", len(tf.config.list_physical_devices(device_type)))

Num GPUs Available:  0
Num CPUs Available:  1


In [78]:
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Input, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.callbacks import ModelCheckpoint

# Padded sequences of token ids in, a single sigmoid probability out.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_sequences = embedding_layer(sequence_input)

# Layers applied, in this order, on top of the embedding output.
hidden_layers = [
    SpatialDropout1D(0.2),
    Conv1D(64, 5, activation='relu'),
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(512, activation='relu'),
]

x = embedding_sequences
for layer in hidden_layers:
    x = layer(x)

outputs = Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(sequence_input, outputs)

In [79]:
# List the GPU devices visible to TensorFlow (an empty list means none).
tf.config.list_physical_devices(device_type='GPU')

[]

In [77]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau


model.compile(optimizer=Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])

# Divide the learning rate by 10 when val_loss stops improving.
# - min_lr must be BELOW the initial rate (1e-3): with the previous value of
#   0.01 the callback could never lower the learning rate.
# - The instance gets its own name so it no longer shadows the imported
#   ReduceLROnPlateau class.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, min_lr=1e-5, verbose=1)

# NOTE(review): the test split doubles as validation data here, so it also
# drives the learning-rate schedule; consider carving out a separate
# validation split if an unbiased test score is needed.
history = model.fit(
    X_train,
    Y_train,
    batch_size=1024,
    epochs=10,
    validation_data=(X_test, Y_test),
    callbacks=[reduce_lr],
)

Epoch 1/10


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc, confusion_matrix



def evaluate(model, Y_test, Y_pred, threshold=0.5):
    """Plot the confusion matrix and the ROC curve of a binary classifier.

    The confusion matrix needs hard class labels while the ROC curve needs
    continuous scores, so ``Y_pred`` is used in two forms: thresholded for the
    matrix, raw for the curve. Passing predictions that are already 0/1 (or
    boolean) labels still works, since thresholding at 0.5 leaves them
    unchanged.

    Args:
        model: unused; kept so existing calls keep working.
        Y_test: true binary labels (bool or 0/1), any array-like.
        Y_pred: predicted scores/probabilities or hard labels, any array-like;
            a column vector of shape (n, 1) is flattened.
        threshold: score at or above which a sample is counted as positive in
            the confusion matrix. Defaults to 0.5.

    Returns:
        None. Two matplotlib figures are shown.
    """
    import numpy as np  # local import keeps this cell self-contained

    y_true = np.asarray(Y_test).astype(int).ravel()
    y_score = np.asarray(Y_pred, dtype=float).ravel()
    y_label = (y_score >= threshold).astype(int)

    # Confusion matrix on the thresholded predictions.
    cnf_matrix = confusion_matrix(y_true, y_label)
    ax = sns.heatmap(cnf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion matrix')
    plt.gcf().set_facecolor('white')
    plt.show()

    # ROC curve on the raw scores, so the curve has more than a single
    # operating point when probabilities are supplied.
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line = performance of a random classifier.
    plt.plot([0, 1], [0, 1], color='navy',linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.gcf().set_facecolor('white')
    plt.show()