# Imports

In [1]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

import tensorflow as tf

import re

import nltk

from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer

from nltk.tokenize import word_tokenize,sent_tokenize

from tensorflow.keras.layers import Dense, Flatten, Conv2D, Dropout, BatchNormalization, MaxPooling2D, Embedding, LSTM, Bidirectional

from tensorflow.keras.models import Sequential

from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Data

In [2]:
!kaggle datasets download -d yasserh/imdb-movie-ratings-sentiment-analysis

  pid, fd = os.forkpty()


Dataset URL: https://www.kaggle.com/datasets/yasserh/imdb-movie-ratings-sentiment-analysis
License(s): CC0-1.0
Downloading imdb-movie-ratings-sentiment-analysis.zip to /kaggle/working
 24%|█████████▏                            | 5.00M/20.6M [00:00<00:00, 52.3MB/s]
100%|███████████████████████████████████████| 20.6M/20.6M [00:00<00:00, 117MB/s]


In [3]:
!unzip imdb-movie-ratings-sentiment-analysis.zip

Archive:  imdb-movie-ratings-sentiment-analysis.zip
  inflating: movie.csv               


In [4]:
df=pd.read_csv('movie.csv')

df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


### Here that is a data classification so i dont need the sequence

### so what i will do after cleaning:

### 1- Tokenize and remove punctuation

### 2- Remove stop words

### 3- lemmatization

### 4- do TF-IDF

### 5- Modeling

### cleaning data

In [5]:
df.isna().sum()

text     0
label    0
dtype: int64

In [6]:
df.duplicated().sum()

277

In [7]:
df.drop_duplicates(inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39723 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    39723 non-null  object
 1   label   39723 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 931.0+ KB


In [9]:
df.columns

Index(['text', 'label'], dtype='object')

In [10]:
df['label'].value_counts() ### to check data imbalance

label
1    19908
0    19815
Name: count, dtype: int64

In [11]:
df['label'].unique()

array([0, 1])

In [12]:
nltk.download('punkt')

nltk.download('stopwords')

nltk.download('wordnet')
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


  pid, fd = os.forkpty()


Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr

In [13]:
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()



def clean_text(text):

    if isinstance(text, list):

        text = " ".join(text)

    text = text.lower()

    text = re.sub(r"[^a-z!?',.\s ]", "", text)

    text = re.sub(r'\b(`|``)\b', '"', text)

    text = re.sub(r"\s+", " ", text)

    text = re.sub(r"\b(im)\b","i'm",text)

    text = re.sub(r'\b(br|b)\b', '', text)

    text = re.sub(r"\b(\s's)/b","'s",text)

    tokens = word_tokenize(text)

    tokens = [token for token in tokens if token not in stop_words]

    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    cleaned_text = " ".join(tokens)

    return cleaned_text


In [14]:
for index, row in df.iterrows():

    df.loc[index, 'text'] = clean_text(row['text'])

In [15]:
df['text'][39995]

"western union something forgotten classic western ! perhaps reason lie fact unavailability dvd united state . however , lost appeared region england . blessing way incongruous totally ironic one considers movie depicting founding establishment uniquely american organization western union telegraph company without region release . beggar belief ! simply n't make sense ! produced fox western union directed fritz lang . second occasion great german director undertook direct western ! done excellent job year fox 's return frank james would one western outing splendid rancho notorious . lang ford hawk western union turned fine solid western hold well . beautifully photographed early three strip technicolor edward cronjager boasted good cast headed robert young , randolph scott dean jagger . female lead taken virginia gilmore really little picture . actress never made anything career . presence merely cosmetic . curious robert young top billing scott ! clearly scott 's picture beginning fir

# train-test-split

In [16]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(df['text'],df['label'],test_size=0.2,random_state=42)

# Prepare data for Embedding layer

### 1-Tokenizer

### 2-sequences

### 3-maxsequence

### 4-padding

### 5-model

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.preprocessing.sequence import pad_sequences

In [18]:
tokenzier=Tokenizer()

tokenzier.fit_on_texts(df['text'])

length=len(tokenzier.word_index) + 1

seq_train=tokenzier.texts_to_sequences(df['text'])

max_len=max([len(i) for i in seq_train])

padded_sequences = pad_sequences(seq_train, maxlen=max_len, padding='post')

df['padded_text'] = list(padded_sequences)


In [19]:
length,max_len

(119511, 1451)

In [20]:
x = np.array(padded_sequences)

y = np.array(df['label'])

from sklearn.model_selection import train_test_split



X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

In [21]:
from tensorflow.keras.regularizers import l2 
model = Sequential()

model.add(Embedding(input_dim=length, output_dim=100, input_length=max_len,mask_zero=True))

model.add(LSTM(32,kernel_regularizer=l2(0.01)))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])



In [22]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

Epoch 1/10
[1m994/994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 24ms/step - accuracy: 0.7800 - loss: 0.6714 - val_accuracy: 0.8802 - val_loss: 0.3112
Epoch 2/10
[1m994/994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 24ms/step - accuracy: 0.9367 - loss: 0.1916 - val_accuracy: 0.8777 - val_loss: 0.3236
Epoch 3/10
[1m994/994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 24ms/step - accuracy: 0.9660 - loss: 0.1112 - val_accuracy: 0.8675 - val_loss: 0.3645
Epoch 4/10
[1m994/994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 24ms/step - accuracy: 0.9807 - loss: 0.0726 - val_accuracy: 0.8696 - val_loss: 0.4032
Epoch 5/10
[1m994/994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 24ms/step - accuracy: 0.9856 - loss: 0.0555 - val_accuracy: 0.8607 - val_loss: 0.3921
Epoch 6/10
[1m994/994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 24ms/step - accuracy: 0.9907 - loss: 0.0404 - val_accuracy: 0.8643 - val_loss: 0.5390
Epoch 7/10
[1m9

<keras.src.callbacks.history.History at 0x7ef5427945b0>

In [23]:
loss, accuracy = model.evaluate(X_val, y_val)

print(f'Validation Accuracy: {accuracy:.2f}')

[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8574 - loss: 0.6256
Validation Accuracy: 0.86


In [24]:
def predict(new_text):
    cleaned_new_text = clean_text(new_text)
    new_sequence = tokenzier.texts_to_sequences([cleaned_new_text])
    new_padded_sequence = pad_sequences(new_sequence, maxlen=max_len,padding='post')
    prediction = model.predict(new_padded_sequence)
    if prediction[0][0]>0.5:
        return 'Good Review'
    else:
        return 'Bad Review'


In [25]:
predict('the film is too good')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step


'Good Review'

In [26]:
predict('the film is too bad')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


'Bad Review'

In [27]:
predict('Directing is very poor')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step


'Bad Review'

# Saving The Model

In [28]:
import pickle

# Save the tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenzier, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [29]:
from keras.models import load_model

# Save the model
model.save('movie_review_model.h5')
