# Fine Tuning with IMDB and Amazon Dataset LAB-7
* Namansh Singh Maurya
* 22MIA1034
* Lab_7


# Import libraries

In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Parameters

In [2]:
# Vocabulary size and padded sequence length
max_features, max_len = 2000, 200

# Embedding width and number of LSTM units
embedding_dim, lstm_unis = 128, 128

# Training schedule
batch_size, epochs = 64, 5

# Training and testing data with labels

In [3]:
# Load IMDB reviews, keeping only word ids below `max_features`
train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split

# Bring every review to exactly `max_len` tokens
x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)

# Model Building

In [4]:
# Sentiment classifier: token ids -> embeddings -> LSTM -> sigmoid probability.
model = Sequential([
    # An explicit input spec replaces the deprecated `input_length` argument of
    # Embedding and builds the model immediately, so its input/output tensors
    # exist before training.
    tf.keras.Input(shape=(max_len,)),
    Embedding(max_features, embedding_dim),
    # NOTE: a non-zero recurrent_dropout makes Keras fall back from the cuDNN
    # kernel (see the LSTM layer docs), so training is slower on GPU.
    LSTM(lstm_unis, dropout=0.2, recurrent_dropout=0.2),
    # Single sigmoid unit: probability of the positive class.
    Dense(1, activation='sigmoid')
])



# Setting Objective function and Optimization (building blocks of DL)

In [5]:
# Binary classification: cross-entropy loss, Adam optimizer, accuracy as the reported metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Training the Model

In [6]:
# Train on the IMDB training split; the IMDB test split is passed as validation data
model.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(x_test, y_test),
)

Epoch 1/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 803ms/step - accuracy: 0.6941 - loss: 0.5646 - val_accuracy: 0.8226 - val_loss: 0.3999
Epoch 2/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m345s[0m 876ms/step - accuracy: 0.8198 - loss: 0.4077 - val_accuracy: 0.8482 - val_loss: 0.3600
Epoch 3/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m356s[0m 810ms/step - accuracy: 0.8259 - loss: 0.4020 - val_accuracy: 0.8456 - val_loss: 0.3693
Epoch 4/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m336s[0m 846ms/step - accuracy: 0.8466 - loss: 0.3618 - val_accuracy: 0.8595 - val_loss: 0.3372
Epoch 5/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m352s[0m 901ms/step - accuracy: 0.8656 - loss: 0.3257 - val_accuracy: 0.8508 - val_loss: 0.3501


<keras.src.callbacks.history.History at 0x7d368b864490>

# Saving the model as an HDF5 (.h5) file

In [7]:
# Persist the trained IMDB model so the fine-tuning cells can reload it
model.save(filepath='/content/lstm_imdb.h5')



In [8]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

# Loading the model and adding layers to it

In [9]:
# Reload the model trained on IMDB
imdb_model = tf.keras.models.load_model('/content/lstm_imdb.h5')
num_classes = 1

# Reuse the pretrained graph: the input of the 0th layer and the output of the
# second-to-last layer (i.e. everything except the original final layer)
input_layer = imdb_model.layers[0].input
x = imdb_model.layers[-2].output

# Attach a fresh sigmoid head on top of the reused layers
new_head = tf.keras.layers.Dense(num_classes, activation='sigmoid')
output_layer = new_head(x)



# Making a fine-tuned model

In [10]:
# Functional model from the reused input tensor to the new output head
fine_tuned_model = tf.keras.Model(
    inputs=input_layer,
    outputs=output_layer,
)

# Compiling the new model

In [11]:
# Same objective, optimizer and metric as the original IMDB model
fine_tuned_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Text Processing

In [12]:
# Load the review data and encode it with the SAME vocabulary the IMDB model was
# trained on, so each token id selects the embedding row learned for that word.
d = pd.read_csv('/content/test.csv')

# Lower-case and strip everything except letters, digits and spaces.
# Missing reviews become empty strings so the tokenizer never receives NaN.
reviews = (
    d['reviewText']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.replace('[^a-zA-Z0-9 ]', '', regex=True)
)
categories = d['category']  # assumed to hold 0/1 sentiment labels — TODO confirm

# imdb.load_data() shifts every rank from get_word_index() by 3 (0 = padding,
# 1 = start marker, 2 = out-of-vocabulary) and maps ids >= num_words to 2.
# Mirror that encoding here instead of fitting a new vocabulary on this corpus:
# a freshly fitted Tokenizer would assign ids unrelated to the pretrained
# Embedding rows, making the transferred weights meaningless.
tokenizer = Tokenizer(num_words=max_features, oov_token='<oov>')
tokenizer.word_index = {word: rank + 3 for word, rank in imdb.get_word_index().items()}
tokenizer.word_index[tokenizer.oov_token] = 2

sequences = tokenizer.texts_to_sequences(reviews)
# Pad to the length the pretrained model was trained with
x_train = pad_sequences(sequences, maxlen=max_len)
y_train = np.array(categories)

FileNotFoundError: [Errno 2] No such file or directory: '/content/test.csv'

# Training the model

In [None]:
# Fine-tune on the new reviews, with 20% of the rows held out for validation
fine_tuned_model.fit(
    x_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

# Saving the new model

In [None]:
# Persist the fine-tuned model next to the original one
fine_tuned_model.save(filepath='/content/new_model.h5')

In [None]:
# Load models
old_model = tf.keras.models.load_model('/content/lstm_imdb.h5')
new_model = tf.keras.models.load_model('/content/new_model.h5')

# Encode the reviews the same way they were encoded for fine-tuning
sequences_test = tokenizer.texts_to_sequences(reviews)
x_all = pad_sequences(sequences_test, maxlen=200)
y_all = np.array(categories)

# Evaluate only on rows the fine-tuned model was never trained on. Keras'
# validation_split holds out the LAST fraction of the data (taken before
# shuffling), so with validation_split = 0.2 the held-out rows start at
# floor(0.8 * n). Scoring on the full set would include training rows and
# inflate the new model's accuracy.
held_out_fraction = 0.2  # must match validation_split used in fine_tuned_model.fit
split_at = int(len(x_all) * (1.0 - held_out_fraction))
x_test = x_all[split_at:]
y_test = y_all[split_at:]

# Evaluate both models on the same held-out rows
old_loss, old_acc = old_model.evaluate(x_test, y_test)
new_loss, new_acc = new_model.evaluate(x_test, y_test)

# Display results
print(f"Old Model - Loss: {old_loss:.4f}, Accuracy: {old_acc:.4f}")
print(f"New Model - Loss: {new_loss:.4f}, Accuracy: {new_acc:.4f}")

In [None]:
# Hand-written reviews used as a quick sanity check of both models
sample_reviews = [
    "This product is amazing! I love it.",
    "Super quality, completely enjoyed it.",
    "It's okay, not the best but does the job.",
    "Absolutely fantastic! Highly recommend.",
    "Worst purchase ever, waste of money.",
]

# Turn the sample reviews into padded id sequences with the existing tokenizer
sample_sequences = tokenizer.texts_to_sequences(sample_reviews)
sample_padded = pad_sequences(sample_sequences, maxlen=200)

# Score the same padded batch with the original and the fine-tuned model
old_predictions = old_model.predict(sample_padded)
new_predictions = new_model.predict(sample_padded)

# Function to convert prediction values to sentiment labels
def interpret_prediction(pred):
  """Return "Positive" for a score of 0.5 or higher, otherwise "Negative"."""
  if pred >= 0.5:
    return "Positive"
  return "Negative"
# Print each review with the score and label from both models
for review, old_row, new_row in zip(sample_reviews, old_predictions, new_predictions):
  old_score, new_score = old_row[0], new_row[0]
  old_sentiment = interpret_prediction(old_score)
  new_sentiment = interpret_prediction(new_score)
  print(f"Review: {review}")
  print(f"Old Model Prediction: {old_score:.4f} ({old_sentiment})")
  print(f"New Model Prediction: {new_score:.4f} ({new_sentiment})")
  print("-" * 50)