In [42]:
import os
import json
from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [43]:
# Location of the IMDB reviews CSV. Setting the IMDB_DATASET_PATH environment
# variable overrides the default, so the notebook can run on machines other
# than the author's; when it is unset the original hard-coded path is used.
# (pandas and os are already imported in the first cell.)
file_path = os.environ.get(
    "IMDB_DATASET_PATH",
    r"C:\Users\HP-PC\Desktop\Python data analytics\IMDM reviews data set\IMDB Dataset.csv",
)

# Read the CSV as UTF-8; 'ISO-8859-1' or 'latin1' are alternatives to try if decoding fails.
data = pd.read_csv(file_path, encoding='utf-8')
print(data.head())


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [44]:
# Show the distinct raw labels so the integer mapping can be checked against them.
raw_sentiment_values = data['sentiment'].unique()
print("Unique values in 'sentiment' before mapping:", raw_sentiment_values)



Unique values in 'sentiment' before mapping: ['positive' 'negative']


In [45]:
# Drop every row that has a missing value in any column; `data` is rebound to the result.
data = data.dropna()

In [46]:
# Encode the sentiment labels as integers. The identity entries (1 -> 1, 0 -> 0)
# make this cell safe to re-run: without them a second execution would map the
# already-encoded column to all-NaN. Any other value still becomes NaN, which
# the later cleaning step relies on.
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0, 1: 1, 0: 0})


In [47]:
texts = data['review']
# 'sentiment' has already been integer-encoded by the mapping cell, so its
# values are used directly. Mapping it a second time (and with the misspelled
# key '[positive') turned every label into NaN.
labels = data['sentiment'].values

In [48]:
# Rows whose sentiment is null are removed; other columns are not considered here.
has_invalid_sentiment = data['sentiment'].isna().any()
if has_invalid_sentiment:
    print("There are invalid values in the 'sentiment' column. Cleaning them.")
    data = data.dropna(subset=['sentiment'])

In [49]:
# Per-column count of null entries, printed under a heading.
null_counts = data.isnull().sum()
print("Missing values after cleaning:")
print(null_counts)

Missing values after cleaning:
review       0
sentiment    0
dtype: int64


In [50]:
# Model inputs (the 'review' column) and targets (the 'sentiment' column as an array).
texts, labels = data['review'], data['sentiment'].values

In [51]:
# Fail fast: the original only printed a message announcing "Exiting." and then
# carried on, so NaN labels would have reached the split and the training loop.
if pd.isnull(labels).any():
    raise ValueError("there are still NaN values in the labels. Exiting.")
    

In [52]:
# Hold out 20% of the rows for testing, stratified on the labels, with a fixed seed.
train_test_parts = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
x_train, x_test, y_train, y_test = train_test_parts

In [53]:
# Tokenizer capped via num_words=10000, with "<OOV>" as the out-of-vocabulary token.
tokenizer = Tokenizer(
    num_words=10000,
    oov_token="<OOV>",
)

In [54]:
# Fit on the training split only; x_test is not passed to the tokenizer here.
tokenizer.fit_on_texts(x_train)

In [55]:
# Convert both splits with the same fitted tokenizer.
x_train_seq, x_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (x_train, x_test)
)

In [56]:
max_seq_length = 200

# Both splits are padded/truncated with identical settings.
_pad_options = dict(maxlen=max_seq_length, padding='post', truncating='post')
x_train_padded = pad_sequences(x_train_seq, **_pad_options)
x_test_padded = pad_sequences(x_test_seq, **_pad_options)


In [57]:
# Embedding -> LSTM -> single sigmoid unit for binary sentiment classification.
model = Sequential([
    # input_dim follows the tokenizer's num_words so the two cannot drift apart.
    # The deprecated `input_length` argument is dropped; the input shape is
    # supplied later by model.build(input_shape=(None, max_seq_length)).
    Embedding(input_dim=tokenizer.num_words, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])



In [58]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [59]:
print("Model summary:")
# Build with an unspecified batch dimension and max_seq_length tokens per sample,
# then print the layer summary.
batch_input_shape = (None, max_seq_length)
model.build(input_shape=batch_input_shape)
model.summary()

Model summary:


In [60]:
# Same optimizer, loss and metric as the compile call made before the summary cell.
compile_options = {
    "optimizer": "adam",
    "loss": "binary_crossentropy",
    "metrics": ["accuracy"],
}
model.compile(**compile_options)

In [61]:
# Train for 5 epochs in batches of 64, with validation_split=0.2 of the training data.
# Left as a bare expression so the returned History object is displayed.
model.fit(
    x_train_padded,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)


Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 164ms/step - accuracy: 0.5188 - loss: 0.6900 - val_accuracy: 0.5865 - val_loss: 0.6922
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 159ms/step - accuracy: 0.6059 - loss: 0.6645 - val_accuracy: 0.7446 - val_loss: 0.5427
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 161ms/step - accuracy: 0.6830 - loss: 0.6063 - val_accuracy: 0.7674 - val_loss: 0.5122
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 168ms/step - accuracy: 0.7968 - loss: 0.4480 - val_accuracy: 0.8630 - val_loss: 0.3333
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 166ms/step - accuracy: 0.8944 - loss: 0.2871 - val_accuracy: 0.8725 - val_loss: 0.3197


<keras.src.callbacks.history.History at 0x1d4af289f50>

In [64]:
# Score the trained model on the held-out test split and report both values.
evaluation = model.evaluate(x_test_padded, y_test)
loss, accuracy = evaluation
print(f"Test Loss: {loss}")
print(f"Test Accuracy:{accuracy}")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 32ms/step - accuracy: 0.8673 - loss: 0.3388
Test Loss: 0.3279377222061157
Test Accuracy:0.8689000010490417


In [71]:
def predict_sentiment(review):
    """Classify a single review string as "positive" or "negative".

    The review is tokenized with the notebook-level `tokenizer`, padded the
    same way as the training data, and scored by the notebook-level `model`.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        "positive" if the model's output is above 0.5, otherwise "negative".
    """
    sequence = tokenizer.texts_to_sequences([review])
    # Use the training-time settings (max_seq_length, post padding, post truncation).
    # The previous call relied on pad_sequences' defaults, so inference inputs
    # were laid out differently from the sequences the model was trained on.
    padded_sequence = pad_sequences(
        sequence,
        maxlen=max_seq_length,
        padding='post',
        truncating='post',
    )
    prediction = model.predict(padded_sequence)
    # prediction holds one row (one review) with one sigmoid output.
    return "positive" if prediction[0][0] > 0.5 else "negative"

In [72]:
# Try the classifier on a short example review and print the predicted label.
new_review = "this movie was fantastic. I loved it."
sentiment = predict_sentiment(new_review)
print(f"the sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
the sentiment of the review is: positive


In [73]:
# Second example: a review phrased with a negation ("not that good").
new_review = "The movie was not that good"
sentiment = predict_sentiment(new_review)
print(f"the sentiment of the review is:{sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
the sentiment of the review is:negative
