# Task-Movie Reviews Classification

# import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# load the dataset

In [2]:
data=pd.read_csv('IMDB Dataset.csv')
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

# sentiment count

In [4]:
data.groupby('sentiment').count()

Unnamed: 0_level_0,review
sentiment,Unnamed: 1_level_1
negative,25000
positive,25000


# Text Data Preprocessing

In [5]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters, numeric characters, and punctuation
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    
    # Remove extra white space
    text = ' '.join(text.split())
    
    # Tokenization
    words = word_tokenize(text)
    
    # Removing stopwords and applying stemming
    words = [ps.stem(word) for word in words if word not in stop_words]
    
    return ' '.join(words)


In [6]:
# Apply text preprocessing to a 'review' column 
data['review'] = data['review'].apply(preprocess_text)

In [7]:
data['review_preprocessed']=data['review'].apply(preprocess_text)
data

Unnamed: 0,review,sentiment,review_preprocessed
0,one review mention watch oz episod hook right ...,positive,one review mention watch oz episod hook right ...
1,wonder littl product br br film techniqu unass...,positive,wonder littl product br br film techniqu unass...
2,thought wonder way spend time hot summer weeke...,positive,thought wonder way spend time hot summer weeke...
3,basic famili littl boy jake think zombi closet...,negative,basic famili littl boy jake think zombi closet...
4,petter mattei love time money visual stun film...,positive,petter mattei love time money visual stun film...
...,...,...,...
49995,thought movi right good job creativ origin fir...,positive,thought movi right good job creativ origin fir...
49996,bad plot bad dialogu bad act idiot direct anno...,negative,bad plot bad dialogu bad act idiot direct anno...
49997,cathol taught parochi elementari school nun ta...,negative,cathol taught parochi elementari school nun ta...
49998,go disagre previou comment side maltin one sec...,negative,go disagr previou comment side maltin one seco...


# features and labels

In [8]:
x = data.iloc[:, 0]
y = data.iloc[:, 1]

# dataset splitting

In [9]:
train_x, test_x, train_y, test_y = train_test_split(x, y, random_state = 0)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [11]:
text_clf=Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])

In [12]:
text_clf.fit(train_x,train_y)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [13]:
predictions=text_clf.predict(test_x)

In [14]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [15]:
print(confusion_matrix(test_y,predictions))

[[5569  722]
 [ 635 5574]]


In [16]:
print(classification_report(test_y,predictions))

              precision    recall  f1-score   support

    negative       0.90      0.89      0.89      6291
    positive       0.89      0.90      0.89      6209

    accuracy                           0.89     12500
   macro avg       0.89      0.89      0.89     12500
weighted avg       0.89      0.89      0.89     12500



In [17]:
print(accuracy_score(test_y,predictions))

0.89144


In [18]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Encode labels: 'positive' as 1, 'negative' as 0

In [19]:
train_y = train_y.apply(lambda x: 1 if x == 'positive' else 0)
test_y = test_y.apply(lambda x: 1 if x == 'positive' else 0)

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_x)

In [20]:
# Convert text to sequences of integers
train_sequences = tokenizer.texts_to_sequences(train_x)
test_sequences = tokenizer.texts_to_sequences(test_x)

# Pad sequences to have the same length
max_sequence_length = 100  # Choose an appropriate sequence length
train_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length)
test_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Build the LSTM model

In [21]:
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_sequence_length))

model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))  # Use 'sigmoid' for binary classification

# Compile the model

In [22]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [23]:
# Train the model
model.fit(train_sequences, train_y, epochs=10, batch_size=64, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2115847ce50>

In [24]:
# Evaluate the model on the test data
loss, accuracy = model.evaluate(test_sequences, test_y)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy*100:.2f}%")

Test Loss: 0.9492, Test Accuracy: 84.97%


In [28]:
from sklearn.metrics import confusion_matrix, classification_report


# Make predictions on the test data
predictions = model.predict(test_sequences)

# Convert predicted probabilities to binary labels
predicted_labels = (predictions > 0.5).astype(int)

# Evaluate the model's performance
confusion_mat = confusion_matrix(test_y, predicted_labels)
class_report = classification_report(test_y, predicted_labels)

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_mat)

Confusion Matrix:
[[5134 1157]
 [ 722 5487]]


In [29]:
# Print the classification report
print("Classification Report:")
print(class_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.82      0.85      6291
           1       0.83      0.88      0.85      6209

    accuracy                           0.85     12500
   macro avg       0.85      0.85      0.85     12500
weighted avg       0.85      0.85      0.85     12500



In [30]:
# save the model
model.save("movie-review.h5")

# model testing 

In [31]:
reviews=["I can't believe I spent money to watch this film."
         "It was a never-ending series of clichés, and the ending was so predictable." 
         "I expected more from the talented cast, but they couldn't save this disaster of a movie."]

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()


# Fit the CountVectorizer on your training data and transform the data
review= cv.fit_transform(reviews)

In [33]:
print("1 for positive review and 0 for negative review")

1 for positive review and 0 for negative review


In [35]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming you have already loaded and preprocessed your training data and model
# tokenizer should be the same one used during training
# max_sequence_length should match the expected input length of your model

# Define the review text
reviews = [
    "I can't believe I spent money to watch this  film.It was a never-ending series of clichés, and the ending was so predictable.I expected more from the talented cast, but they couldn't save this disaster of a movie."
]

# Tokenize and pad the review text
review_sequences = tokenizer.texts_to_sequences(reviews)
review_sequences_padded = pad_sequences(review_sequences, maxlen=max_sequence_length)

# Convert the padded sequences to a dense NumPy array
review_data = np.array(review_sequences_padded)

# Make predictions on the review data
predictions = model.predict(review_data)

# Interpret the predictions
for i, prediction in enumerate(predictions):
    sentiment = "positive" if prediction > 0.5 else "negative"
    print(f"Review {i+1}: {reviews[i]}\nPredicted Sentiment: {sentiment}\n")


Review 1: I can't believe I spent money to watch this  film.It was a never-ending series of clichés, and the ending was so predictable.I expected more from the talented cast, but they couldn't save this disaster of a movie.
Predicted Sentiment: negative

