<a href="https://colab.research.google.com/github/MouniriOuma/TAR-Feeling-Analysis-on-IMDB-Dataset-using-LSTM/blob/main/TAR10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Load the dataset from the CSV file (path is relative to the working directory)
csv_path = "IMDB.csv"
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows. `head` has to be *called*: printing the bare
# attribute only shows the bound-method repr followed by the whole frame.
print(data.head())

<bound method NDFrame.head of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>


In [None]:
# 3. Preprocess the Data
# Use consistent column names; assignment is positional (first column -> "text",
# second column -> "sentiment")
new_column_names = ["text", "sentiment"]
data.columns = new_column_names

In [None]:
# Convert sentiment labels to binary: positive -> 1, negative -> 0.
# The mapping is guarded so that re-running this cell is a no-op: pushing an
# already-numeric column through the dict would turn every label into NaN, and
# a later dropna() would then silently remove every row.
label_to_int = {"positive": 1, "negative": 0}
if not pd.api.types.is_numeric_dtype(data["sentiment"]):
    data["sentiment"] = data["sentiment"].map(label_to_int)


In [None]:
# Remove exact duplicate rows first, then any row that still has a missing value
data = data.drop_duplicates().dropna()

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
# Make sure the NLTK stopwords corpus is available locally
corpus_id = 'stopwords'
nltk.download(corpus_id)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mouma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# 4. Text Cleaning
# English stopwords, held in a set so membership checks during cleaning are cheap
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

In [None]:
import re

In [None]:
def clean_text(text, stopword_set=None):
    """Normalize a raw review into a lowercase, space-separated string of words.

    Steps, in order: lowercase, drop HTML tags, drop URLs, drop @mentions,
    strip every remaining non-letter character, then remove stopwords.

    Args:
        text: Raw review text (a ``str``).
        stopword_set: Optional collection of words to discard. When omitted,
            the notebook-level ``stop_words`` set is used.

    Returns:
        The cleaned text; an empty string when nothing survives cleaning.
    """
    if stopword_set is None:
        stopword_set = stop_words

    text = text.lower()
    # HTML tags such as "<br />" become a space; otherwise stripping punctuation
    # leaves stray "br" tokens and glues neighbouring words ("wonderfulbr").
    text = re.sub(r"<\s*/?\s*[a-z][^>]*>", ' ', text)
    # URLs: require "http(s)://" or a leading "www." so ordinary words that
    # merely contain those letters (e.g. "awwww") are left alone.
    text = re.sub(r"https?://\S+|\bwww\.\S+", ' ', text)
    # Mentions have to be removed while the "@" is still present. The previous
    # pattern r'\@w+' ran after punctuation stripping and matched literal "w"s,
    # so it never removed anything.
    text = re.sub(r"@\w+", ' ', text)
    # Drop everything that is not a letter or whitespace (this also removes "#").
    text = re.sub(r"[^a-z\s]", '', text)
    # split() with no argument already collapses runs of whitespace.
    return ' '.join(word for word in text.split() if word not in stopword_set)

In [None]:
# Run every review through clean_text and store the result back in the text column
cleaned_reviews = data["text"].apply(clean_text)
data["text"] = cleaned_reviews

In [None]:
# 5. Tokenization and Padding
# max_words: maximum number of words in the vocabulary
# max_len:   maximum length of sequences (adjust based on dataset)
max_words, max_len = 5000, 100

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
# Build the vocabulary from the cleaned reviews; out-of-vocabulary words map to "<OOV>"
# NOTE(review): the tokenizer is fitted on the full corpus, before the train/test
# split — confirm that is acceptable for this evaluation.
review_texts = data["text"]
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(review_texts)

# Turn each review into a list of integer word indices
sequences = tokenizer.texts_to_sequences(review_texts)

# Pad / truncate at the end so every sequence has exactly max_len entries
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_len,
    padding="post",
    truncating="post",
)

In [None]:
# 6. Split the Data
# Hold out 20% of the samples for testing; the fixed random_state makes the split repeatable
labels = data["sentiment"].values
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 7. Build the Model
# Embedding -> LSTM -> single sigmoid unit for binary sentiment classification.
# The Embedding `input_length` argument is deprecated in Keras 3, so the input
# shape is declared through an explicit build() call instead. Building up-front
# also lets summary() report output shapes and parameter counts.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=64),
    LSTM(64, return_sequences=False),
    Dense(1, activation='sigmoid')
])
model.build(input_shape=(None, max_len))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()



In [None]:
# 8. Train the Model
# Five epochs in mini-batches of 32; the held-out split is passed as validation
# data so per-epoch validation metrics are reported alongside the training ones
fit_options = dict(epochs=5, batch_size=32, verbose=1)
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    **fit_options,
)

Epoch 1/5
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 153ms/step - accuracy: 0.5565 - loss: 0.6773 - val_accuracy: 0.7600 - val_loss: 0.5425
Epoch 2/5
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 156ms/step - accuracy: 0.6562 - loss: 0.6239 - val_accuracy: 0.6006 - val_loss: 0.6475
Epoch 3/5
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 156ms/step - accuracy: 0.7214 - loss: 0.5573 - val_accuracy: 0.8365 - val_loss: 0.3765
Epoch 4/5
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 155ms/step - accuracy: 0.8663 - loss: 0.3276 - val_accuracy: 0.8620 - val_loss: 0.3277
Epoch 5/5
[1m1240/1240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 143ms/step - accuracy: 0.8973 - loss: 0.2612 - val_accuracy: 0.8523 - val_loss: 0.3533


In [None]:
from sklearn.metrics import classification_report

In [None]:
# 9. Evaluate the Model
# Threshold the model outputs at 0.5 to get hard 0/1 predictions, then report
# per-class precision / recall / F1 on the held-out split
predicted_scores = model.predict(X_test)
y_pred = (predicted_scores > 0.5).astype("int32")
report = classification_report(y_test, y_pred)
print(report)

[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step
              precision    recall  f1-score   support

           0       0.81      0.91      0.86      4939
           1       0.90      0.79      0.84      4978

    accuracy                           0.85      9917
   macro avg       0.86      0.85      0.85      9917
weighted avg       0.86      0.85      0.85      9917



In [None]:
# 10. Predict Sentiment for New Text
def predict_sentiment(model, tokenizer, text, threshold=0.5):
    """Classify a single raw review as "Positive" or "Negative".

    The text goes through the same pipeline used for the training data:
    ``clean_text``, the fitted tokenizer, then post-padding/truncation to the
    notebook-level ``max_len``.

    Args:
        model: Trained model exposing ``predict``.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        text: Raw review text.
        threshold: Score above which the review is labelled positive
            (defaults to 0.5).

    Returns:
        "Positive" when the predicted score is strictly greater than
        ``threshold``, otherwise "Negative".
    """
    cleaned_text = clean_text(text)
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding="post", truncating="post")
    prediction = model.predict(padded_sequence)
    # Pull out the single score as a plain float rather than truth-testing a
    # one-element array; ravel() copes with both (1, 1) and (1,) outputs.
    score = float(np.asarray(prediction).ravel()[0])
    return "Positive" if score > threshold else "Negative"

In [None]:
# Example usage: a clearly positive review
new_review = "This movie was fantastic! I loved every moment of it."
predicted_label = predict_sentiment(model, tokenizer, new_review)
print(predicted_label)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Positive


In [None]:
# Example usage: a clearly negative review
new_review = "This movie was bad! I hated every moment of it."
predicted_label = predict_sentiment(model, tokenizer, new_review)
print(predicted_label)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
Negative
