## Build an NLP Model for Sentiment Analysis Using TensorFlow

In [4]:
#1. load dataset
import numpy as np
import pandas as pd

# Load the dataset
df = pd.read_csv('IMDB\IMDB Dataset.csv')
df.sample(5)

  df = pd.read_csv('IMDB\IMDB Dataset.csv')


Unnamed: 0,review,sentiment
12517,This film was the worst film I have ever viewe...,negative
4236,"This Metro film is episodic, but nearly a cons...",positive
17034,"""Going Berserk"" is actually one of the funnies...",positive
29286,I always wondered what happened with that magi...,positive
46344,"Joe Don Baker is...Thomas Jefferson Geronimo, ...",negative


In [6]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp312-cp312-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow)
  Downloading tensorflow_intel-2.18.0-cp312-cp312-win_amd64.whl.metadata (4.9 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorf

In [7]:
#2. Data Cleaning, Tokenization and Preprocessing
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to clean text
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabet characters
    return text.lower().strip()

# Clean the reviews
df['review'] = df['review'].apply(clean_text)

# Tokenization and padding
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['review'])
sequences = tokenizer.texts_to_sequences(df['review'])
padded_sequences = pad_sequences(sequences, maxlen=200)

Explanation:

We cleaned the text by removing HTML tags and special characters.
We tokenized the text, converting it into numerical sequences.
We padded the sequences to ensure they all have the same length (200 words).

In [8]:
3. # Convert sentiment labels to binary
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Splitting the data into features (X) and labels (y)
X = padded_sequences
y = df['sentiment'].values

In [9]:
# 4. Splitting the Data into Training and Testing Sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#5. Building the Neural Network with TensorFlow
#create a simple neural network with an Embedding layer, followed by two LSTM layers, and a Dense output layer.
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10000, 16, input_length=200),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

history = model.fit(X_train, y_train, epochs=10, validation_split=0.2)



Epoch 1/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m302s[0m 289ms/step - accuracy: 0.6992 - loss: 0.5602 - val_accuracy: 0.8165 - val_loss: 0.4178
Epoch 2/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 234ms/step - accuracy: 0.8687 - loss: 0.3236 - val_accuracy: 0.8544 - val_loss: 0.3414
Epoch 3/10
[1m 123/1000[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m3:15[0m 223ms/step - accuracy: 0.9106 - loss: 0.2338

Model Explanation
Embedding Layer: Converts word indices into dense vectors of fixed size (16 dimensions).
LSTM Layers: These layers capture patterns in the text over sequences of words.
Dense Layer: Reduces the dimensionality of the features.
Output Layer: Uses the sigmoid activation function to predict the probability of being positive.

In [None]:
#6. Visualizing Model Performance
import matplotlib.pyplot as plt

plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# 7. Evaluating the Model

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy:.2f}')

In [None]:
#8. Making Predictions

sample_reviews = [
    "I absolutely loved this movie! The plot was thrilling and the characters were so well developed.",
    "The film was a disaster. Poor acting and a predictable storyline."
]

sample_sequences = tokenizer.texts_to_sequences(sample_reviews)
sample_padded = pad_sequences(sample_sequences, maxlen=200)

predictions = model.predict(sample_padded)
print(["Positive" if prob > 0.5 else "Negative" for prob in predictions])