In [1]:
import numpy as np
import pandas as pd

In [2]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [4]:
import gensim

#### Dataset Link: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [6]:
# Importing dataset
messages = pd.read_csv("IMDB Dataset.csv")

In [7]:
messages.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
# Function for text preprocessing
def preprocess_text(msg):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    msg = msg.lower()
    msg = re.sub('[^a-z\s]', '', msg)
    # Tokenization
    tokens = word_tokenize(msg)
    # Removing stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]

    return lemmatized

In [9]:
# Creating the corpus
corpus = messages['review'].apply(preprocess_text).to_list()

In [10]:
# Training a Word2Vec model
model = gensim.models.Word2Vec(
    corpus,
    vector_size=100,
    window=5,
    min_count=2,
    sg=1
)

In [11]:
# Function for AvgWord2Vec
def avg_word2vec(doc):
    words = [model.wv[word] for word in doc if word in model.wv.index_to_key]
    # words is a list of row vectors
    if len(words) == 0:
        return np.zeros(model.vector_size)
    return np.mean(words, axis=0) # axis=0 gives us column wise mean

In [12]:
X = np.array([avg_word2vec(doc) for doc in corpus])

In [30]:
y = np.array(pd.get_dummies(messages['sentiment'], drop_first=True, dtype=int))

In [31]:
# Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegressionCV

In [15]:
# Model
lr = LogisticRegressionCV(
    cv=5,
    scoring='accuracy',
    random_state=22,
    max_iter=1000
)

In [16]:
# Fitting
lr.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [17]:
# Prediction
y_pred = lr.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score, classification_report

In [19]:
# Accuracy Score
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8791


In [20]:
# Classification Report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.87      0.88      4987
           1       0.88      0.88      0.88      5013

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



### SVM

In [67]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [71]:
svc=SVC()

param_grid = [{'kernel': ['linear', 'rbf']},
              {'kernel': ['poly'], 'degree': [2, 3, 4]}]

grid_search=GridSearchCV(estimator=svc,param_grid=param_grid,cv=5,scoring='accuracy')

In [72]:
grid_search.fit(X_train, y_train.ravel())

In [73]:
best_model = grid_search.best_estimator_

In [77]:
grid_search.best_params_

{'degree': 4, 'kernel': 'poly'}

In [74]:
#Prediction
y_pred = best_model.predict(X_test)

In [75]:
# Accuracy Score
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8833


In [76]:
# Classification Report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      4987
           1       0.88      0.89      0.88      5013

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



### Random Forest

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [48]:
rf = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    max_features='sqrt',
    random_state=22,
    n_jobs=-1
)

In [49]:
rf.fit(X_train, y_train.ravel())

In [51]:
# Prediction
y_pred = rf.predict(X_test)

In [52]:
# Accuracy Score
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.836


In [68]:
# Classification Report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.86      0.87      4987
           1       0.87      0.87      0.87      5013

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



### XGBoost

In [57]:
from xgboost import XGBClassifier

In [61]:
xgb = XGBClassifier(
    n_estimators=150,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=22,
    n_jobs=-1
)

In [62]:
xgb.fit(X_train, y_train.ravel())

In [63]:
# Prediction
y_pred = xgb.predict(X_test)

In [64]:
# Accuracy Score
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8691


In [65]:
# Classification Report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.86      0.87      4987
           1       0.87      0.87      0.87      5013

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



### ANN

In [21]:
import tensorflow as tf
print(tf.__version__)

2.18.0


In [22]:
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Sequential

In [23]:
classifier = Sequential()

classifier.add(Input(shape=(100,)))
classifier.add(Dense(units=128, activation='relu'))
classifier.add(Dropout(0.3))
classifier.add(Dense(units=64, activation='relu'))
classifier.add(Dropout(0.3))
classifier.add(Dense(units=1, activation='sigmoid'))

classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [24]:
model_history = classifier.fit(
    X_train, y_train,
    epochs=20,
    batch_size=16,
    validation_data=(X_test, y_test)
)

Epoch 1/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.7811 - loss: 0.4525 - val_accuracy: 0.8708 - val_loss: 0.3106
Epoch 2/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8631 - loss: 0.3299 - val_accuracy: 0.8755 - val_loss: 0.2989
Epoch 3/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.8621 - loss: 0.3254 - val_accuracy: 0.8761 - val_loss: 0.3028
Epoch 4/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.8646 - loss: 0.3181 - val_accuracy: 0.8767 - val_loss: 0.2969
Epoch 5/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.8679 - loss: 0.3161 - val_accuracy: 0.8745 - val_loss: 0.3006
Epoch 6/20
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.8683 - loss: 0.3148 - val_accuracy: 0.8702 - val_loss: 0.3068
Epoch 7/20


In [25]:
loss, accuracy = classifier.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8799 - loss: 0.2930
Test Accuracy: 0.8776999711990356
