In [1]:
# manipulation des données
import numpy as np
import pandas as pd

# matplotlib et seaborn pour les représentations graphiques
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# sklearn preprocessing pour le traiter les variables catégorielles
from sklearn.preprocessing import LabelEncoder

# Gestion du système de fichiers
import os

# Suppression des alertes
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("../Data/2.sample_dataset.csv")
df

Unnamed: 0,target,id,date,flag,user,text,words
0,0,2054665466,Sat Jun 06 07:50:24 PDT 2009,NO_QUERY,boring_alice,ive had awesome day but the sun is missing wan...,ive awesome day sun miss want well weather
1,1,1823190427,Sat May 16 21:20:00 PDT 2009,NO_QUERY,yassychan,you will do great saw kevin teaching you,great saw kevin teach
2,0,1826975071,Sun May 17 09:43:20 PDT 2009,NO_QUERY,PRNCSmuriel3,its cold in md too,cold md
3,0,2202406793,Tue Jun 16 21:44:37 PDT 2009,NO_QUERY,mariazimmerman,does anyone know the girl that died of swine f...,anyone know girl die swine flu maybe go ucsd f...
4,0,2242106714,Fri Jun 19 11:46:38 PDT 2009,NO_QUERY,katrinachelsea,watching amelie and wishing was french,watch amelie wish french
...,...,...,...,...,...,...,...
999995,1,1679383266,Sat May 02 09:13:22 PDT 2009,NO_QUERY,Shawna1976,goodmorning jordan needs to talk to you,goodmorning jordan need talk
999996,1,2053651702,Sat Jun 06 05:16:26 PDT 2009,NO_QUERY,harlequinxgirl,good mprninh to you too,good mprninh
999997,0,1563449159,Sun Apr 19 22:52:58 PDT 2009,NO_QUERY,Cynthi_ocho,cant believe spring break is coming to an end,cant believe spring break come end
999998,0,1751244042,Sat May 09 19:00:02 PDT 2009,NO_QUERY,Ericanderson09,everyone have texted in the last hour complete...,everyone texted last hour completely ignore im...


In [5]:
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras import backend as K

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import metrics as kmetrics
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
import gensim

In [15]:
df = df.dropna(subset=['words'])

In [16]:
# Création et entraînement du modèle Word2Vec
w2v_size=300
w2v_window=5
w2v_min_count=1
w2v_epochs=100

#df_test = df.sample(100)
sentences = df['words'].to_list()

print("Build & train Word2Vec model ...")
w2v_model = gensim.models.Word2Vec(min_count=w2v_min_count, window=w2v_window,
                                                vector_size=w2v_size,
                                                seed=42,
                                                workers=1)
#                                                workers=multiprocessing.cpu_count())
w2v_model.build_vocab(sentences)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_epochs)
model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key
print("Vocabulary size: %i" % len(w2v_words))
print("Word2Vec trained")

Build & train Word2Vec model ...
Vocabulary size: 61
Word2Vec trained


In [17]:
# Préparation des sentences (tokenization)
maxlen = 24 # adapt to length of sentences

print("Fit Tokenizer ...")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
x_sentences = pad_sequences(tokenizer.texts_to_sequences(sentences),
                                                     maxlen=maxlen,
                                                     padding='post') 
                                                   
num_words = len(tokenizer.word_index) + 1
print("Number of unique words: %i" % num_words)

Fit Tokenizer ...
Number of unique words: 292434


In [18]:
# Création de la matrice d'embedding

print("Create Embedding matrix ...")
w2v_size = 300
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, w2v_size))
i=0
j=0
    
for word, idx in word_index.items():
    i +=1
    if word in w2v_words:
        j +=1
        embedding_vector = model_vectors[word]
        if embedding_vector is not None:
            embedding_matrix[idx] = model_vectors[word]
            
word_rate = np.round(j/i,4)
print("Word embedding rate : ", word_rate)
print("Embedding matrix: %s" % str(embedding_matrix.shape))

Create Embedding matrix ...
Word embedding rate :  0.0001
Embedding matrix: (292434, 300)


In [19]:
# Création du modèle

input=Input(shape=(len(x_sentences),maxlen),dtype='float64')
word_input=Input(shape=(maxlen,),dtype='float64')  
word_embedding=Embedding(input_dim=vocab_size,
                         output_dim=w2v_size,
                         weights = [embedding_matrix],
                         input_length=maxlen)(word_input)
word_vec=GlobalAveragePooling1D()(word_embedding)  
embed_model = Model([word_input],word_vec)

embed_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 24)]              0         
                                                                 
 embedding (Embedding)       (None, 24, 300)           87730200  
                                                                 
 global_average_pooling1d (  (None, 300)               0         
 GlobalAveragePooling1D)                                         
                                                                 
Total params: 87730200 (334.66 MB)
Trainable params: 87730200 (334.66 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
embeddings = embed_model.predict(x_sentences)
embeddings.shape



(995491, 300)

In [21]:
embeddings

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [25]:
from keras.layers import Embedding, LSTM, Dense, Input, Attention
from keras.models import Model

In [26]:
input_layer = Input(shape=(maxlen,))
embedding_layer = Embedding(input_dim=vocab_size,
                            output_dim=w2v_size,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False)(input_layer)
lstm_layer = LSTM(100, return_sequences=True)(embedding_layer)
attention_layer = Attention()([lstm_layer, lstm_layer])
attention_output = GlobalAveragePooling1D()(attention_layer)
output_layer = Dense(1, activation='sigmoid')(attention_output)

# Créer le modèle
model = Model(inputs=input_layer, outputs=output_layer)

# Compiler le modèle
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [27]:
model.fit(x_sentences, df['target'], epochs=1, batch_size=32, validation_split=0.2)

W0000 00:00:1706179015.836174       1 op_level_cost_estimator.cc:699] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "110" frequency: 1400 num_cores: 8 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 6291456 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }




W0000 00:00:1706179438.806258       1 op_level_cost_estimator.cc:699] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "110" frequency: 1400 num_cores: 8 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 6291456 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


KeyboardInterrupt: 