In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
import joblib

In [2]:
data = pd.read_csv("Data/Cleaned.csv")
data

Unnamed: 0,cleaned_reviews,sentiment
0,one reviewer mentioned watching oz episode you...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically there family little boy jake think t...,0
4,petter matteis love time money visually stun...,1
...,...,...
49995,thought movie right good job wasnt creative or...,1
49996,bad plot bad dialogue bad acting idiotic direc...,0
49997,catholic taught parochial elementary school nu...,0
49998,im going disagree previous comment side maltin...,0


In [None]:
#embedding matrix
# vec = Word2Vec(st,vector_size=100,min_count=1)
# embed =[[vec.wv[word] for word in sent] for sent in st]
# vec_idx = [[vec.wv.key_to_index[word] for word in sent] for sent in st]
# pad = pad_sequences(vec_idx,padding="post")

In [3]:
tokens = Tokenizer()
tokens.fit_on_texts(data["cleaned_reviews"])
ts = tokens.texts_to_sequences(data["cleaned_reviews"])
st = tokens.sequences_to_texts(ts)
pad = pad_sequences(ts,maxlen=500,padding="post") #from KDE plot we set maxlen=500
y = np.array(data["sentiment"])

In [4]:
x_train,x_test,y_train,y_test = train_test_split(pad,y,train_size=0.9,test_size=0.1,random_state=12)
x_test_a, x_val,y_test_a, y_val = train_test_split(x_test,y_test,test_size=0.05,random_state=12)

In [7]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(output_dim=100,input_dim=206577,input_length=500),
    tf.keras.layers.LSTM(256,return_sequences=True),
    tf.keras.layers.LSTM(128,return_sequences=True),
    tf.keras.layers.LSTM(64,return_sequences=True),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(32,activation="relu"),
    tf.keras.layers.Dense(16,activation="relu"),
    tf.keras.layers.Dense(1,activation='sigmoid'),

])
compile = model.compile(optimizer=tf.keras.optimizers.Adam(),loss="binary_crossentropy",metrics=["accuracy"])
final = model.fit(pad,y,batch_size=512,epochs=5,validation_data=(x_val,y_val))
history = final.history


Epoch 1/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m782s[0m 8s/step - accuracy: 0.5423 - loss: 0.6977 - val_accuracy: 0.8480 - val_loss: 0.3152
Epoch 2/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m783s[0m 8s/step - accuracy: 0.8825 - loss: 0.2909 - val_accuracy: 0.9560 - val_loss: 0.1480
Epoch 3/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m786s[0m 8s/step - accuracy: 0.9539 - loss: 0.1291 - val_accuracy: 0.9720 - val_loss: 0.0920
Epoch 4/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m794s[0m 8s/step - accuracy: 0.9772 - loss: 0.0690 - val_accuracy: 0.9840 - val_loss: 0.0479
Epoch 5/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m814s[0m 8s/step - accuracy: 0.9908 - loss: 0.0321 - val_accuracy: 0.9960 - val_loss: 0.0120


In [13]:
model.summary()

In [14]:
history

{'accuracy': [0.641040027141571,
  0.8942599892616272,
  0.9518600106239319,
  0.9774799942970276,
  0.9885200262069702],
 'loss': [0.5954894423484802,
  0.26126575469970703,
  0.13336990773677826,
  0.0693124383687973,
  0.03679322451353073],
 'val_accuracy': [0.8479999899864197,
  0.9559999704360962,
  0.972000002861023,
  0.984000027179718,
  0.9959999918937683],
 'val_loss': [0.3151867091655731,
  0.1479690670967102,
  0.09200851619243622,
  0.04785173013806343,
  0.011971969157457352]}

In [16]:
pred= model.predict(x_test)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 300ms/step


In [27]:
model.evaluate(x_test,y_test)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 285ms/step - accuracy: 0.9980 - loss: 0.0088


[0.011192488484084606, 0.9973999857902527]

## New Predictions

In [5]:
st = "Eternal Skies was honestly a beautiful surprise! The story was emotional and inspiring, and the performances felt so genuine. The cinematography was breathtaking — every frame looked like a painting. The background score perfectly matched the tone, giving me chills during some scenes. I left the theater feeling hopeful and completely satisfied. Highly recommend watching it!"
f = st.split(" ")
new_st = [" ".join(f)]
new_ts = tokens.texts_to_sequences(new_st)
new_ts = pad_sequences(new_ts,maxlen=500,padding="post")
predict = model.predict(new_ts) #lets use threshold 0.3 or 0.4

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step


In [113]:
joblib.dump(tokens,"Models/Tokens.pkl")

['Models/Tokens.pkl']

In [114]:
joblib.dump(model,"Models/Model.pkl")

['Models/Model.pkl']