In [None]:
import sys
# Environment setup: every pip command below is run through sys.executable, i.e. the
# interpreter backing this kernel, so packages are changed in the kernel's own environment.
# Uninstall scikit-learn, plus anything installed under the `sklearn` name, without prompting (-y).
!{sys.executable} -m pip uninstall -y scikit-learn sklearn
# Install tensorflow and scikit-learn into that same interpreter.
# NOTE(review): no versions are pinned, so what gets installed depends on when this cell runs.
!{sys.executable} -m pip install tensorflow scikit-learn

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.layers import Embedding, Input, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

print("Libraries imported successfully!")

In [None]:
# Load Data
# Read the PRDECT-ID CSV from its (machine-specific, absolute) location into `df`.
file_path = r"D:\Projects\tentier-streamlit\data\PRDECT-ID Dataset.csv"
df = pd.read_csv(file_path)

# Quick sanity check: the first rows, then how many reviews fall under each Sentiment value.
preview = df.head()
print(preview)
sentiment_counts = df['Sentiment'].value_counts()
print(sentiment_counts)

In [None]:
# Preprocessing
# Keep only the review text and its sentiment label; rows missing either one are dropped.
df = df[['Customer Review', 'Sentiment']].dropna()
texts = df['Customer Review'].astype(str)

# Encode Labels
# LabelEncoder maps each distinct Sentiment value to an integer id; `le.classes_`
# holds the original labels in id order and is needed to decode predictions later.
le = LabelEncoder()
y = le.fit_transform(df['Sentiment'])
print("Classes:", le.classes_)

# Train/Test Split
# Split the raw text BEFORE fitting the tokenizer so the vocabulary is learned from
# training reviews only. Fitting on the full corpus would let test-set words influence
# the word index (data leakage) and make the reported test metrics optimistic.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42
)

# Tokenization
max_words = 5000  # vocabulary cap passed to the Tokenizer (also the Embedding input size)
max_len = 100     # every review is padded/truncated to this many tokens
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(texts_train)


def _encode_texts(batch):
    """Convert raw review strings to integer sequences padded/truncated to `max_len`."""
    return pad_sequences(tokenizer.texts_to_sequences(batch), maxlen=max_len)


X_train = _encode_texts(texts_train)
X_test = _encode_texts(texts_test)

# Full encoded matrix, kept so any cell that still expects `X` keeps working.
# It is not used for training or evaluation in this notebook.
X = _encode_texts(texts)

In [None]:
# Build LSTM Model
embedding_dim = 128
num_classes = len(np.unique(y))

# The input shape is declared with an explicit Input layer instead of the
# `input_length=` argument on Embedding: that argument is deprecated in Keras 3,
# and without a declared shape the model is still unbuilt when summary() runs.
model = Sequential()
model.add(Input(shape=(max_len,)))
model.add(Embedding(max_words, embedding_dim))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))

# Output head and loss depend on the number of distinct labels:
# one sigmoid unit for two classes, otherwise a softmax over integer class ids.
if num_classes == 2:
    model.add(Dense(1, activation='sigmoid'))
    loss = 'binary_crossentropy'
else:
    model.add(Dense(num_classes, activation='softmax'))
    loss = 'sparse_categorical_crossentropy'

model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Train Model
# Five epochs over the training split in batches of 64. The held-out split is passed
# as validation data, so per-epoch val_loss / val_accuracy are recorded in `history`.
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=5,
    verbose=1,
    validation_data=(X_test, y_test),
)

In [None]:
# Evaluate Model
# The loss value gets its own name so the global `loss` (the loss-function name
# handed to model.compile) is not silently replaced by a float.
test_loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {accuracy}')

y_pred = model.predict(X_test)
if num_classes == 2:
    # Single sigmoid output: threshold at 0.5 and flatten the column of
    # probabilities into a 1-D vector of 0/1 class ids.
    y_pred_classes = (y_pred > 0.5).astype(int).ravel()
else:
    # Softmax output: pick the most probable class id per row.
    y_pred_classes = np.argmax(y_pred, axis=1)

# Passing `labels` keeps the report aligned with `target_names` even if some class
# is absent from the test split or the predictions; without it, a mismatch between
# the observed classes and `target_names` raises a ValueError.
print(classification_report(
    y_test,
    y_pred_classes,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))

In [None]:
# Plot Training History
# One row, two panels: accuracy on the left, loss on the right. Each panel overlays
# the training curve and the validation curve recorded by model.fit.
panels = (
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Train Loss', 'Validation Loss', 'Loss'),
)

plt.figure(figsize=(12, 4))
for slot, (train_key, val_key, train_label, val_label, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.legend()
    plt.title(heading)
plt.show()

In [None]:
# Export Model and Tokenizer
import pickle
import os

models_dir = r'D:\Projects\tentier-streamlit\models'
# exist_ok=True replaces the separate exists() check: no gap between checking and
# creating, and the cell can be re-run safely when the directory is already there.
os.makedirs(models_dir, exist_ok=True)

# Build each output path once so the save call and its log message cannot drift apart.
model_path = os.path.join(models_dir, 'sentiment_model.h5')
tokenizer_path = os.path.join(models_dir, 'tokenizer.pickle')

# Save Model
model.save(model_path)
print(f"Model saved to {model_path}")

# Save Tokenizer
# The fitted tokenizer must ship with the model: it holds the word index needed to
# turn new text into the integer sequences the model was trained on.
# NOTE(review): the LabelEncoder (`le`) is not exported here; whatever loads these
# files needs another way to map predicted ids back to sentiment labels.
with open(tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
print(f"Tokenizer saved to {tokenizer_path}")