### **PIP**

In [1]:
!pip install -q kaggle

In [2]:
!pip install tensorflow --upgrade
!pip install keras --upgrade

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
[0mCollecting tensorboard<2.19,>=2.18 (from tensorflow)
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (615.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.3/615.3 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorboard-2.18.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m101.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboard, tensorflow
  Attempting uninstall: tensorboard
    Found existing installation: tensorboard 2.17.1
    Uninstalling tensorboard-2.17.1:
      Successfully uninstalled tensorboard-2.17.1
  Attempting uninstall: tensorflow
    Found existing installation: tensorflow 2.17.1
    Uninstalling tens

### ***Import***

In [3]:
import os
import json
from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

### **Data collection from kagge**

In [6]:
kaggle_dictionary = json.load(open("/content/kaggle.json"))

In [7]:
#setup kaggle credential as environment variables
os.environ["KAGGLE_USERNAME"] = kaggle_dictionary["username"]
os.environ["KAGGLE_KEY"] = kaggle_dictionary["key"]

In [8]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 62% 16.0M/25.7M [00:00<00:00, 165MB/s]
100% 25.7M/25.7M [00:00<00:00, 198MB/s]


In [9]:
!ls

imdb-dataset-of-50k-movie-reviews.zip  kaggle.json  sample_data


In [10]:
with ZipFile("imdb-dataset-of-50k-movie-reviews.zip", "r") as zip_ref:
    zip_ref.extractall()

In [11]:
!ls

'IMDB Dataset.csv'   imdb-dataset-of-50k-movie-reviews.zip   kaggle.json   sample_data


In [12]:
data = pd.read_csv("/content/IMDB Dataset.csv")

In [13]:
data.sentiment.value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [14]:
data.replace({"sentiment":{"positive":1, "negative":0}}, inplace = True)

  data.replace({"sentiment":{"positive":1, "negative":0}}, inplace = True)


In [15]:
#split the training data
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 42)

In [16]:
print(train_data.shape)
print(test_data.shape)

(40000, 2)
(10000, 2)


### ***Data Preprocessing***

In [None]:
# tokenization text dada
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])
X_train_seq = tokenizer.texts_to_sequences(train_data["review"])
X_test_seq = tokenizer.texts_to_sequences(test_data["review"])
# Pad the sequences to ensure equal length
X_train = pad_sequences(X_train_seq, maxlen=200)
X_test = pad_sequences(X_test_seq, maxlen=200)

In [None]:
y_train = train_data["sentiment"]
y_test = test_data["sentiment"]

### **LSTM_long short -term memory**

In [36]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

112281


In [46]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))  # input_shape is implied here
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))
model.build(input_shape=(None, 200))

In [47]:
model.summary()

In [48]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [49]:
# # Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 348ms/step - accuracy: 0.7314 - loss: 0.5319 - val_accuracy: 0.8226 - val_loss: 0.4115
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 353ms/step - accuracy: 0.8437 - loss: 0.3705 - val_accuracy: 0.8576 - val_loss: 0.3431
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 362ms/step - accuracy: 0.8712 - loss: 0.3162 - val_accuracy: 0.8331 - val_loss: 0.3996
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 326ms/step - accuracy: 0.8782 - loss: 0.3026 - val_accuracy: 0.8503 - val_loss: 0.3699
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 324ms/step - accuracy: 0.8829 - loss: 0.2840 - val_accuracy: 0.8564 - val_loss: 0.3501


In [50]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"test loss: {loss}")
print(f"test accuracy: {accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 101ms/step - accuracy: 0.8657 - loss: 0.3377
test loss: 0.3381136357784271
test accuracy: 0.8654000163078308


In [53]:
def predict_sentiment(review):
    sequence = tokenizer.texts_to_sequences([review])
    pad_seq = pad_sequences(sequence, maxlen=200)
    prediction = model.predict(pad_seq)
    sentiment = "Positive" if prediction[0][0] >0.5 else "Negative"
    return sentiment

In [63]:
new_review = "fantastic"
sentiment = predict_sentiment(new_review)
print(f"the sentiment of movie is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step
the sentiment of movie is: Positive


In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer
sentences = ["This is a cat", "This is a dog", "My name is maran winchester"]
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences)
print(tokenizer.word_index)
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)
padded = pad_sequences(sequences, maxlen=5)
print(padded)

{'is': 1, 'this': 2, 'a': 3, 'cat': 4, 'dog': 5, 'my': 6, 'name': 7, 'maran': 8, 'winchester': 9}
[[2, 1, 3, 4], [2, 1, 3, 5], [6, 7, 1, 8, 9]]
[[0 2 1 3 4]
 [0 2 1 3 5]
 [6 7 1 8 9]]
