In [21]:
# Import Libraries
import pandas as pd
import seaborn as sns
import io
import string
import re
import nltk

from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from transformers import BertModel
from torch import nn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support

In [3]:
# Location of the IMDB reviews CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
IMDB_CSV_PATH = "/home/manu/Documents/Data Projects/Text Classification for Sentiment Analysis of Movie Reviews/datasets/IMDB Dataset.csv"

# Read the CSV into a DataFrame used by every later cell.
imdb_data = pd.read_csv(IMDB_CSV_PATH)

In [4]:
# Preview the first 10 rows of the DataFrame
imdb_data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [5]:
# Report the DataFrame dimensions as (rows, columns)
dataset_shape = imdb_data.shape
print(dataset_shape)

(50000, 2)


In [6]:
# Summary statistics for every column of the dataset
dataset_summary = imdb_data.describe()
dataset_summary

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [7]:
# Check whether the data is balanced by counting the reviews per sentiment label
sentiment_counts = imdb_data["sentiment"].value_counts()
sentiment_counts

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [8]:
# Download the NLTK resources needed for preprocessing (a no-op when they are
# already up to date). 'punkt_tab' is included because recent NLTK releases
# make word_tokenize load it instead of the pickled 'punkt' models; 'punkt' is
# kept so older NLTK installations keep working.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# English stop words as a set for O(1) membership tests during filtering
stop_words = set(stopwords.words('english'))

# Porter stemmer instance used to reduce each kept word to its stem
stemmer = PorterStemmer()

# Matches HTML tags such as "<br />" so they can be removed before tokenizing;
# otherwise the tag name (e.g. "br") survives the isalpha() filter as a token.
_HTML_TAG_RE = re.compile(r"</?[A-Za-z][^<>]*>")


def preprocess_review(review):
    """Turn a raw review string into a list of cleaned, stemmed tokens.

    Steps, in order:
      1. replace HTML tags (e.g. ``<br />``) with a space,
      2. tokenize with ``word_tokenize``,
      3. lowercase every token,
      4. keep only purely alphabetic tokens (drops punctuation and numbers),
      5. drop tokens found in ``stop_words``,
      6. stem the remaining tokens with ``stemmer``.

    Parameters
    ----------
    review : str
        The raw review text.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    # Replace tags with a space so the words on either side stay separated.
    text = _HTML_TAG_RE.sub(" ", review)

    words = []
    for token in word_tokenize(text):
        token = token.lower()
        # Only alphabetic, non-stop-word tokens are kept and stemmed.
        if token.isalpha() and token not in stop_words:
            words.append(stemmer.stem(token))

    return words

[nltk_data] Downloading package stopwords to /home/manu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/manu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# define the feature and target columns
X = imdb_data["review"]
y = imdb_data["sentiment"]

# Split the dataset into training (70%), validation (15%) and testing (15%):
# first hold out 30% of the rows, then split that hold-out set in half.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [22]:
# Bag-of-words features: the vocabulary is learned from the training split
# only, then re-used unchanged for the validation and test splits.
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(X_train)
X_val_counts = count_vectorizer.transform(X_val)
X_test_counts = count_vectorizer.transform(X_test)

# The liblinear and saga solvers shuffle the data internally, so they are
# seeded to make the reported numbers reproducible across runs.
MODEL_RANDOM_STATE = 42

# Candidate logistic-regression models that differ only in their penalty.
# NOTE(review): saga on unscaled count features may not converge within the
# default max_iter - confirm, and raise max_iter if a ConvergenceWarning shows.
models = [
    LogisticRegression(penalty='l2', C=1.0, solver='liblinear',
                       random_state=MODEL_RANDOM_STATE),
    LogisticRegression(penalty='l1', C=1.0, solver='liblinear',
                       random_state=MODEL_RANDOM_STATE),
    LogisticRegression(penalty='elasticnet', C=1.0, l1_ratio=0.5, solver='saga',
                       random_state=MODEL_RANDOM_STATE),
]

# Train every candidate and record its accuracy on the validation split.
val_accuracies = []
for model in models:
    model.fit(X_train_counts, y_train)
    y_val_pred = model.predict(X_val_counts)
    val_accuracies.append(accuracy_score(y_val, y_val_pred))

# Select the model with the highest validation accuracy (first one on ties).
best_index = val_accuracies.index(max(val_accuracies))
best_model = models[best_index]

# Evaluate the selected model once on the held-out test split.
y_test_pred = best_model.predict(X_test_counts)
test_accuracy = accuracy_score(y_test, y_test_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test, y_test_pred, average='weighted'
)

print("Test accuracy of the best model: {:.2f}%".format(test_accuracy*100))
print("Precision: {:.2f}%".format(precision*100))
print("Recall: {:.2f}%".format(recall*100))
print("F1-score: {:.2f}%".format(f1_score*100))




Test accuracy of the best model: 89.28%
Precision: 89.29%
Recall: 89.28%
F1-score: 89.28%
