1. Importation des bibliothèques

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [5]:
# Charger le dataset
dataset_path = "sentiment100k.csv"
data = pd.read_csv(dataset_path, header=None)

In [7]:
# Renommer les colonnes pour faciliter l'accès
data.columns = ["polarity", "id", "date", "query", "user", "text"]

# Garder uniquement les colonnes nécessaires
data = data[["polarity", "text"]]

# Convertir les labels : 0 (négatif) et 4 (positif) -> 0 et 1
data.loc[:, "polarity"] = data["polarity"].replace({4: 1})

# Nettoyer les doublons et les valeurs manquantes
data = data.drop_duplicates()
data = data.dropna()


In [9]:
# Vérifier les données
print(data.head())

   polarity                                               text
0         0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1         0  is upset that he can't update his Facebook by ...
2         0  @Kenichan I dived many times for the ball. Man...
3         0    my whole body feels itchy and like its on fire 
4         0  @nationwideclass no, it's not behaving at all....


2. Prétraitement des Textes

In [12]:
import re

In [14]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Charger les stop words en anglais
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mouma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"[^a-zA-Z\s]", '', text)
    text = re.sub(r"\s+", ' ', text).strip()
    text = re.sub(r'\@w+|\#', '', text)
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

In [18]:
# Appliquer le nettoyage aux tweets
data["text"] = data["text"].apply(clean_text)

In [19]:
# Vérifier les modifications
print(data.head())

   polarity                                               text
0         0  switchfoot thats bummer shoulda got david carr...
1         0  upset cant update facebook texting might cry r...
2         0  kenichan dived many times ball managed save re...
3         0                   whole body feels itchy like fire
4         0           nationwideclass behaving im mad cant see


3. Tokenisation et Padding

In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [25]:
# Paramètres de tokenisation
max_words = 5000  # Nombre maximal de mots dans le vocabulaire
max_len = 50  # Longueur maximale des séquences

# Initialiser le tokenizer
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(data["text"])

In [27]:
# Convertir le texte en séquences
sequences = tokenizer.texts_to_sequences(data["text"])

In [29]:
# Appliquer le padding
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")

In [31]:
# Vérification
print(padded_sequences[:5])

[[   1   44 1215 3781   15  660    1 1884    4    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]
 [ 663   12  488  407 2206  208  472 1966   64    9  170 1020    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]
 [   1    1  204  259 1460 1483  732  345    7    1    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]
 [ 317  689  340 2602    6 1027    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0 

4. Séparation des Données

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [50]:
# Diviser les données
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, data["polarity"].values, test_size=0.2, random_state=42)

In [63]:
print(data["polarity"])

0         0
1         0
2         0
3         0
4         0
         ..
199995    1
199996    1
199997    1
199998    1
199999    1
Name: polarity, Length: 198671, dtype: int64


Construction du Modèle

In [65]:
print(data["polarity"].values)

[0 0 0 ... 1 1 1]


In [67]:
# Construction du modèle
model = Sequential([
    Embedding(input_dim=max_words, output_dim=64, input_length=50),
    LSTM(64, return_sequences=False),
    Dense(1, activation='sigmoid')
])



In [68]:
# Compilation du modèle
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [71]:
# Résumé du modèle
model.summary()

5. Entraînement et Évaluation du Modèle

In [74]:
from sklearn.metrics import classification_report

In [76]:
# Entraînement
history = model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test), batch_size=32, verbose=1)

Epoch 1/5
[1m4967/4967[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 45ms/step - accuracy: 0.5031 - loss: 0.6938 - val_accuracy: 0.4993 - val_loss: 0.6932
Epoch 2/5
[1m4967/4967[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 29ms/step - accuracy: 0.5272 - loss: 0.6819 - val_accuracy: 0.7621 - val_loss: 0.4916
Epoch 3/5
[1m4967/4967[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 25ms/step - accuracy: 0.7725 - loss: 0.4749 - val_accuracy: 0.7768 - val_loss: 0.4659
Epoch 4/5
[1m4967/4967[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 26ms/step - accuracy: 0.7921 - loss: 0.4397 - val_accuracy: 0.7788 - val_loss: 0.4680
Epoch 5/5
[1m4967/4967[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 30ms/step - accuracy: 0.8041 - loss: 0.4167 - val_accuracy: 0.7771 - val_loss: 0.4743


In [78]:
# Évaluation des performances
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))

[1m1242/1242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step
              precision    recall  f1-score   support

           0       0.79      0.75      0.77     19895
           1       0.76      0.81      0.78     19840

    accuracy                           0.78     39735
   macro avg       0.78      0.78      0.78     39735
weighted avg       0.78      0.78      0.78     39735



6. Prédiction de Sentiment pour de Nouveaux Tweets

In [81]:
def predict_sentiment(model, tokenizer, text):
    # Nettoyer le texte
    cleaned_text = clean_text(text)
    # Tokeniser et appliquer le padding
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding="post", truncating="post")
    # Prédire le sentiment
    prediction = model.predict(padded_sequence)
    return "Positive" if prediction[0] > 0.5 else "Negative"

In [83]:
# Exemple d'utilisation
tweet = "I love this product! It's amazing."
print(predict_sentiment(model, tokenizer, tweet))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
Positive


In [87]:
# Exemple d'utilisation
tweet = "I hate this product! It's amazing."
print(predict_sentiment(model, tokenizer, tweet))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step
Negative
