In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import sklearn
import warnings
import transformers
import tensorflow as ts
import numpy as np
import seaborn as sns
import tensorflow as tf 

In [None]:
# Silence every warning category for the rest of the session. This also hides
# deprecation notices emitted by the libraries imported above.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("data.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Remove the 'model' column before further processing.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: a second execution no longer raises KeyError
# because the column has already been removed.
data = data.drop(columns=['model'], errors='ignore')

In [None]:
# Preview the last rows of the frame after the 'model' column was dropped.
data.tail()

In [None]:
# Count the rows that carry no sentiment label.
sentiment_is_missing = data["sentiment"].isna()
missing_labels = sentiment_is_missing.sum()

In [None]:
# Report how many rows lack a sentiment label.
print(f'number of missing labels: {missing_labels}')

In [None]:
# Collect the rows whose sentiment label is missing so they can be inspected.
no_label_mask = data['sentiment'].isna()
missing_rows = data.loc[no_label_mask]

In [None]:
# Show the unlabeled rows as a plain-text dump of the frame.
print(missing_rows)

In [None]:
# Discard rows without a sentiment label and renumber the index from 0.
data = (
    data
    .dropna(subset=['sentiment'])
    .reset_index(drop=True)
)

In [None]:
# Sanity check: number of rows still missing a sentiment label after the
# dropna on that column.
print(data['sentiment'].isnull().sum())

In [None]:
# Count missing review texts. These rows are only counted here; nothing in
# this cell removes or fills them.
print(data['review'].isnull().sum())

In [None]:
# Encode sentiment labels as integers
# `le` keeps the fitted classes (le.classes_), so integer codes can be mapped
# back to the original labels later.
# NOTE(review): the 'sentiment' column is overwritten with its own encoding,
# so re-running this cell refits `le` on the integer codes rather than on the
# original labels.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['sentiment'] = le.fit_transform(data['sentiment'])

In [None]:
# Spot-check five random rows (no random_state is given, so the rows differ
# on every run).
data.sample(5)

In [None]:
# Targets: the integer-encoded sentiment column as a NumPy array.
y = data['sentiment'].values

# Features: the raw review texts.
# A missing review is a float NaN, and the Keras Tokenizer fails on it when it
# tries to lower-case the text. Missing reviews are therefore replaced by empty
# strings and every entry is coerced to str. No rows are added or removed, so
# `x` stays aligned with `y`.
x = data['review'].fillna('').astype(str).values

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Convert text to sequences of integers (word IDs).
# Only the 20000 most frequent words are kept; every other word maps to "<OOV>".
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(x)

# `sequences` must exist before its lengths can be measured; without this line
# the cell raises NameError on a fresh kernel.
sequences = tokenizer.texts_to_sequences(x)

# Length (in tokens) of the longest review; default=0 guards an empty dataset.
# NOTE(review): the training cell assigns maxlen = 64 afterwards, so this value
# is informational only.
maxlen = max((len(seq) for seq in sequences), default=0)

In [None]:
import pickle

# Write the fitted tokenizer to tokenizer.pkl in the working directory.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
from sklearn.model_selection import KFold
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Parameters
num_folds = 5
vocab_size = 20000      # same value as num_words given to the Tokenizer
embedding_dim = 128
batch_size = 16
epochs = 3
maxlen = 64             # every review is padded/truncated to this many tokens
num_classes = len(np.unique(y))


def _build_lstm_model(vocab_size, embedding_dim, maxlen, num_classes):
    """Return a new, compiled LSTM classifier with freshly initialised weights.

    Architecture: Embedding -> LSTM(128) -> Dropout(0.2) -> Dense(softmax).
    Compiled with the Adam optimizer, categorical cross-entropy loss and the
    accuracy metric.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        maxlen: length of the padded input sequences.
        num_classes: number of output classes (width of the softmax layer).
    """
    net = Sequential()
    net.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))
    net.add(LSTM(128, return_sequences=False))
    net.add(Dropout(0.2))
    net.add(Dense(num_classes, activation='softmax'))
    net.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return net


# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(x)  # x should be a list/Series of texts
x_pad = pad_sequences(sequences, maxlen=maxlen, padding='post')

# Convert labels to categorical (one-hot rows, one column per class)
y_cat = to_categorical(y, num_classes=num_classes)

# Define KFold
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
fold_no = 1
acc_per_fold = []
loss_per_fold = []

for train_index, val_index in kf.split(x_pad):
    print(f"Training fold {fold_no}...")

    # Drop the global Keras state left behind by the previous fold's model.
    # Without this, every fold adds another model's worth of state and memory
    # use grows across the loop. It runs *before* the new model is built, so
    # the model from the final fold stays usable after the loop.
    K.clear_session()

    # Split the data
    x_train, x_val = x_pad[train_index], x_pad[val_index]
    y_train, y_val = y_cat[train_index], y_cat[val_index]

    # Fresh model per fold so no weights carry over between folds
    model = _build_lstm_model(vocab_size, embedding_dim, maxlen, num_classes)

    # Train
    history = model.fit(
        x_train, y_train,
        validation_data=(x_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        verbose=1
    )

    # Evaluate on the held-out fold; scores is [loss, accuracy]
    scores = model.evaluate(x_val, y_val, verbose=0)
    print(f"Score for fold {fold_no}: loss={scores[0]:.4f}, accuracy={scores[1]:.4f}")

    acc_per_fold.append(scores[1])
    loss_per_fold.append(scores[0])
    fold_no += 1

# After the loop, `model`, `x_val` and `y_val` refer to the last fold only.


In [None]:
import pickle 
# Save the entire model 
# `model` is whatever the training loop assigned last, i.e. the network trained
# in the final cross-validation fold. `pickle` is imported but not used here.
# NOTE(review): the target path has no file extension; newer Keras releases
# appear to require '.keras' or '.h5' — confirm against the installed version.
model.save("my_LSTM_model") 

In [None]:
# Summary of results:
# mean and standard deviation of the per-fold validation accuracy, plus the
# mean per-fold validation loss, as collected by the training loop.
print("Average scores for all folds:")
print(f"Accuracy: {np.mean(acc_per_fold):.4f} (+/- {np.std(acc_per_fold):.4f})")
print(f"Loss: {np.mean(loss_per_fold):.4f}")

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

text = "The movie was amazing!"

# Tokenize using your LSTM tokenizer
seq = tokenizer.texts_to_sequences([text])

# Pad the sequence to maxlen (same as training)
padded_seq = pad_sequences(seq, maxlen=maxlen, padding='post')

# Get predictions from LSTM model
pred_probs = model.predict(padded_seq)  # shape: (1, num_classes)

# Index of the most probable class for the single input text.
# This assignment was missing, so the print below raised NameError.
pred_class = int(np.argmax(pred_probs, axis=1)[0])

print("Predicted class:", pred_class)

In [None]:
# Show which integer code the encoder assigned to each sentiment label.
encoded_classes = le.transform(le.classes_)
label_mapping = {label: code for label, code in zip(le.classes_, encoded_classes)}
print(label_mapping)

In [None]:
# Classification report
from sklearn.metrics import classification_report
# 1) Pad validation set
#    (x_val is sliced from x_pad and is already maxlen wide, so this leaves the
#    values unchanged.)
x_val_padded = pad_sequences(x_val, maxlen=maxlen, padding='post')

# 2) Predict probabilities
y_pred_probs = model.predict(x_val_padded)

# 3) Reduce to integer class ids.
#    y_pred was never assigned (NameError), and y_val holds one-hot rows, which
#    classification_report cannot compare against integer predictions.
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_val, axis=1)

print(classification_report(y_true, y_pred, target_names=['negative', 'neutral', 'positive']))

In [None]:
from sklearn.metrics import confusion_matrix

# Integer class ids for the validation fold, derived in this cell so it runs
# on its own: y_val holds one-hot rows and confusion_matrix needs 1-D labels.
y_true = np.argmax(y_val, axis=1)
y_pred = np.argmax(model.predict(x_val), axis=1)

class_names = ['negative', 'neutral', 'positive']

# Confusion matrix (prettier version)
# `labels` is pinned so the matrix is always len(class_names) square, even if a
# class is absent from this fold; otherwise the DataFrame below would not fit.
cm = confusion_matrix(y_true, y_pred, labels=list(range(len(class_names))))
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

plt.figure(figsize=(6,4))
sns.heatmap(cm_df, annot=True, fmt="d", cmap="Greens")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.title("Confusion Matrix - LSTM")
plt.show()


In [None]:
from sklearn.metrics import f1_score

# Integer class ids for the validation fold, derived in this cell so it runs
# on its own: y_val holds one-hot rows and f1_score is called here with 1-D
# class labels.
y_true = np.argmax(y_val, axis=1)
y_pred = np.argmax(model.predict(x_val), axis=1)

f1_macro = f1_score(y_true, y_pred, average="macro")  # unweighted mean of the per-class F1 scores
f1_micro = f1_score(y_true, y_pred, average="micro")  # computed from the global TP/FP/FN counts
f1_weighted = f1_score(y_true, y_pred, average="weighted")  # averages per class, weighted by support

In [None]:
# Print the three F1 aggregates computed in the previous cell, to four decimals.
print("LSTM Model F1 Scores:")
print(f"Micro F1: {f1_micro:.4f}")
print(f"Macro F1: {f1_macro:.4f}")
print(f"Weighted F1: {f1_weighted:.4f}")