In [12]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

## Loading the dataset

In [13]:
df = pd.read_csv('IMDB_Movie_Review.csv')

In [14]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [15]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [16]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [17]:
# Create a mapping dictionary
label_mapping = {'positive': 1, 'negative': 0}

# Apply the mapping to the 'sentiment' column
df['sentiment'] = df['sentiment'].map(label_mapping)

## Performing NLP Tasks

In [11]:
import nltk
nltk.download('punkt_tab')

# Tokenization

def tokenize(text):
    tokens = word_tokenize(text)
    return tokens

# Stopwords

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = [i for i in text if i not in stop_words]
    return words

# Stemming

def stem_words(text):
    ps = PorterStemmer()
    words = [ps.stem(i) for i in text]
    return words

# Handling special tokens

def remove_special_tokens(text):
    words = [i for i in text if i.isalnum()]
    return words

# # Vectorization

# def vectorize(text):
#     vectorizer = CountVectorizer()
#     X = vectorizer.fit_transform
#     return X

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [12]:
df['review'] = df['review'].apply(tokenize)
df['review'] = df['review'].apply(remove_stopwords)
df['review'] = df['review'].apply(stem_words)
df['review'] = df['review'].apply(remove_special_tokens)

In [13]:
# Dealing with imbalanced data

df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [20]:
# Using Word2Vec to convert words to vectors

from gensim.models import Word2Vec

# Create a Word2Vec model
model = Word2Vec(df['review'], min_count=1)

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [19]:
import numpy

## Using BERT to vectorise the text

In [7]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("Device:", torch.cuda.get_device_name(0))

CUDA available: True
Device: NVIDIA GeForce RTX 3060 Laptop GPU


In [8]:
from transformers import BertTokenizer, BertModel
import torch

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').cuda()

  from .autonotebook import tqdm as notebook_tqdm


In [35]:
texts = df['review'].tolist()
labels = df['sentiment'].tolist() 

In [30]:

# Clear unused cache
torch.cuda.empty_cache()

In [34]:
from tqdm import tqdm 

def get_bert_embeddings_batch(texts, batch_size=16):
    all_embeddings = []
    num_batches = len(texts) // batch_size + 1  # Calculate number of batches
    
    # Split the texts into batches
    for i in range(0, len(texts), batch_size):
        batch_texts = [str(text) for text in texts[i:i+batch_size]]  # Ensure each text is a string        
        
        # Print progress for each batch
        print(f"Processing batch {i // batch_size + 1}/{num_batches}")
        
        # Tokenize and encode the batch for BERT
        inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors='pt')

        # Move inputs to GPU
        inputs = {key: value.to('cuda') for key, value in inputs.items()}
        
        # Get embeddings from BERT
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Use the [CLS] token embedding for the entire batch
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.extend(cls_embeddings)
    
    return all_embeddings

# Get BERT embeddings for all texts in batches
bert_embeddings = get_bert_embeddings_batch(texts)


Processing batch 1/3126
Processing batch 2/3126
Processing batch 3/3126
Processing batch 4/3126
Processing batch 5/3126
Processing batch 6/3126
Processing batch 7/3126
Processing batch 8/3126
Processing batch 9/3126
Processing batch 10/3126
Processing batch 11/3126
Processing batch 12/3126
Processing batch 13/3126
Processing batch 14/3126
Processing batch 15/3126
Processing batch 16/3126
Processing batch 17/3126
Processing batch 18/3126
Processing batch 19/3126
Processing batch 20/3126
Processing batch 21/3126
Processing batch 22/3126
Processing batch 23/3126
Processing batch 24/3126
Processing batch 25/3126
Processing batch 26/3126
Processing batch 27/3126
Processing batch 28/3126
Processing batch 29/3126
Processing batch 30/3126
Processing batch 31/3126
Processing batch 32/3126
Processing batch 33/3126
Processing batch 34/3126
Processing batch 35/3126
Processing batch 36/3126
Processing batch 37/3126
Processing batch 38/3126
Processing batch 39/3126
Processing batch 40/3126
Processin

In [6]:
import pickle

# Save embeddings to a pickle file
# with open("bert_embeddings.pkl", "wb") as f:
#     pickle.dump(bert_embeddings, f)


In [23]:
with open("bert_embeddings.pkl", "rb") as f:
    bert_embeddings = pickle.load(f)

## TFIDF Vectorizer

In [18]:
texts = df['review'].tolist()
labels = df['sentiment'].tolist() 

# Example: TF-IDF with n-grams
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
X = vectorizer.fit_transform(texts)
y = labels 

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split data
# X_train, X_test, y_train, y_test = train_test_split(bert_embeddings, labels, test_size=0.2, random_state=42) #BERT Embeddings
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample DataFrame (replace with your actual df)
# df = pd.read_csv("your_data.csv") # Assuming you have a DataFrame

# Example: Review text and sentiment columns
texts = df['review'].tolist()  # Text column for the reviews
labels = df['sentiment'].tolist()  # Sentiment labels (positive, negative)

# TF-IDF Vectorization with n-grams (1-gram and 2-gram)
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
X = vectorizer.fit_transform(texts)  # Convert the texts to a matrix of TF-IDF features
y = labels  # The target labels (sentiment)

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Calculate additional metrics for evaluation

# Precision, Recall, F1-Score
precision = precision_score(y_test, y_pred, pos_label='positive')  # Change 'positive' to your label if necessary
recall = recall_score(y_test, y_pred, pos_label='positive')
f1 = f1_score(y_test, y_pred, pos_label='positive')

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Classification Report (includes precision, recall, F1-score, and support)
class_report = classification_report(y_test, y_pred)

# ROC-AUC Score (for binary classification)
roc_auc = roc_auc_score(y_test, y_pred)

# Display the metrics
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Confusion Matrix: \n{conf_matrix}")
print(f"Classification Report: \n{class_report}")
print(f"ROC-AUC Score: {roc_auc}")


Accuracy: 89.18%
Precision: 0.8786602870813397
Recall: 0.9110934709267712
F1-Score: 0.8945830085736555
Confusion Matrix: 
[[4327  634]
 [ 448 4591]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.91      0.87      0.89      4961
           1       0.88      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

ROC-AUC Score: 0.8916483278842684


In [21]:
import joblib

# Save the trained model and vectorizer to disk
joblib.dump(clf, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [None]:
# Streamlit UI
st.title("Sentiment Analysis with BERT")
st.write("Enter a movie review or a piece of text, and the model will predict whether it's positive or negative.")

# Text input from the user
user_input = st.text_area("Enter Text", "Type your review here...")

# When the user clicks the 'Analyze' button
if st.button("Analyze"):
    if user_input:
        sentiment, confidence = predict_sentiment(user_input)
        st.write(f"Sentiment: {sentiment}")
        st.write(f"Confidence: {confidence:.2f}")
    else:
        st.write("Please enter some text for analysis.")