In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import matplotlib.pyplot as plt

In [4]:
import seaborn as sns

DATASET IMPORT

In [5]:
df = pd.read_excel(r"C:\Users\701540\CONDA\Sentiment_Analysis\sentiment.xlsx")

In [6]:
df.sample(5)

Unnamed: 0,REVIEW,sentiment,sentiment_label
67,Easy to find near to the main road. Good colle...,0.4666667,POSITIVE
662,IT WAS A NICE EXPERIENCE GOOD SERVICE AND EXCE...,0.64,POSITIVE
78,"Had to exchange an item, had to wait an hour f...",-1.850372e-17,NEGATIVE
32,The shop is just another version of its Chenna...,-0.09880952,NEGATIVE
673,GOOD. I LOVE CHENNAI SILKS.,0.6,POSITIVE


## DATA PRE PROCESSING

In [7]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [8]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

## STEP 1 : CLEAN THE TEXT(remove punctuations, numbers, convert to lowercase)

In [9]:
def clean_text(review):
    review = re.sub(r"\d+", "", review)   #remove digits
    review = re.sub(r"[^\w\s]", "", review)    #remove punctuation
    review = review.lower()
    return review

In [10]:
df["cleaned"] = df["REVIEW"].apply(lambda x: clean_text(str(x)))

In [11]:
df.sample(2)

Unnamed: 0,REVIEW,sentiment,sentiment_label,cleaned
986,HOTEL FEZILITY MUST NEED.,0.0,NEUTRAL,hotel fezility must need
670,OPEN IN MALAYSIA,0.0,NEUTRAL,open in malaysia


## STEP 2 : REMOVE STOPWORDS & TOKENIZE

In [12]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\701540\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\701540\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [14]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\701540\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
stop_words = set(stopwords.words('english'))

In [16]:
def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return " ".join(filtered_tokens)

In [17]:
df["cleaned"] = df["cleaned"].apply(remove_stopwords)

In [18]:
df.sample(2)

Unnamed: 0,REVIEW,sentiment,sentiment_label,cleaned
704,VERY GOOD SERVICE TO ALL FOR US KEEP IT UP RAT...,0.535,POSITIVE,good service us keep rate high add food court
614,34728 sales Trainee Service is like and Satisf...,0.55,POSITIVE,sales trainee service like satisfied covid che...


## FEATURE EXTRACTION

## Word embedding using BERT

In [19]:
from transformers import BertTokenizer, BertModel
import torch

In [20]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

In [21]:
def encode_rev(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

In [22]:
df["bert_embed"] = df["cleaned"].apply(lambda x: encode_rev(x))

In [23]:
df.sample(1)

Unnamed: 0,REVIEW,sentiment,sentiment_label,cleaned,bert_embed
1153,I Come Here To Purchase a Gift For Milk Warmin...,0.120238,POSITIVE,come purchase gift milk warming function colle...,"[[tensor(0.0256), tensor(0.0455), tensor(0.436..."


# MODEL BUILDING

## LOGISTIC REGRESSION

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Flatten each (1, 768) embedding to (768,) before creating the DataFrame
bert_embeddings = pd.DataFrame(df["bert_embed"].apply(lambda x: x.flatten()).tolist(), index=df.index)

# Assign bert_embeddings as X if it's the only feature
X = bert_embeddings


In [103]:
X.sample(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
996,tensor(0.0479),tensor(0.0156),tensor(-0.0145),tensor(0.0734),tensor(-0.2186),tensor(0.0141),tensor(0.1671),tensor(-0.1993),tensor(0.1762),tensor(0.0640),...,tensor(0.3455),tensor(-0.0879),tensor(-0.0628),tensor(-0.3422),tensor(0.0442),tensor(-0.3490),tensor(-0.2997),tensor(-0.1086),tensor(-0.2515),tensor(-0.1298)
696,tensor(0.2322),tensor(-0.2011),tensor(0.2266),tensor(-0.0981),tensor(0.1656),tensor(0.1647),tensor(0.2896),tensor(0.1052),tensor(-0.0664),tensor(0.0485),...,tensor(-0.0981),tensor(-0.0680),tensor(-0.1050),tensor(-0.4215),tensor(-0.0816),tensor(-0.1292),tensor(-0.0594),tensor(-0.0944),tensor(-0.5351),tensor(-0.3074)


In [27]:
y = df['sentiment_label']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
model = LogisticRegression()

In [30]:
model.fit(X_train, y_train)

In [31]:
y_pred = model.predict(X_test)

In [32]:
print("Accuracy", accuracy_score(y_test, y_pred))

Accuracy 0.7984189723320159


In [33]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    NEGATIVE       0.67      0.57      0.62        35
     NEUTRAL       0.58      0.50      0.54        38
    POSITIVE       0.86      0.91      0.88       180

    accuracy                           0.80       253
   macro avg       0.70      0.66      0.68       253
weighted avg       0.79      0.80      0.79       253



In [34]:
import warnings
warnings.filterwarnings("ignore")

## RANDOM FOREST CLASSIFIER

In [35]:
from sklearn.ensemble import RandomForestClassifier


In [36]:
RF_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [37]:
RF_model.fit(X_train,y_train)

In [38]:
y_pred_RF = RF_model.predict(X_test)

In [39]:
print("Accuracy", accuracy_score(y_test, y_pred_RF))

Accuracy 0.7351778656126482


In [40]:
print(classification_report(y_test, y_pred_RF))

              precision    recall  f1-score   support

    NEGATIVE       0.50      0.03      0.05        35
     NEUTRAL       0.70      0.18      0.29        38
    POSITIVE       0.74      0.99      0.85       180

    accuracy                           0.74       253
   macro avg       0.65      0.40      0.40       253
weighted avg       0.70      0.74      0.65       253



## SUPPORT VECTOR MACHINE

In [41]:
from sklearn.svm import SVC

In [42]:
SVM = SVC()

In [43]:
SVM.fit(X_train, y_train)

In [44]:
y_pred_S = SVM.predict(X_test)

In [45]:
print("Accuracy", accuracy_score(y_test, y_pred_S))

Accuracy 0.7944664031620553


In [46]:
print(classification_report(y_test, y_pred_S))

              precision    recall  f1-score   support

    NEGATIVE       1.00      0.23      0.37        35
     NEUTRAL       0.88      0.37      0.52        38
    POSITIVE       0.78      0.99      0.88       180

    accuracy                           0.79       253
   macro avg       0.89      0.53      0.59       253
weighted avg       0.83      0.79      0.75       253



## APPLYING SMOTE

In [47]:
from imblearn.over_sampling import SMOTE

In [48]:
smote = SMOTE(random_state=42)

In [49]:
X_train_r, y_train_r = smote.fit_resample(X_train, y_train)

In [50]:
RF_model.fit(X_train_r, y_train_r)

In [51]:
y_pred_smote = RF_model.predict(X_test)

In [52]:
print("Accuracy", accuracy_score(y_test, y_pred_smote))

Accuracy 0.782608695652174


In [53]:
print(classification_report(y_test, y_pred_smote))

              precision    recall  f1-score   support

    NEGATIVE       0.67      0.46      0.54        35
     NEUTRAL       0.55      0.47      0.51        38
    POSITIVE       0.84      0.91      0.87       180

    accuracy                           0.78       253
   macro avg       0.68      0.61      0.64       253
weighted avg       0.77      0.78      0.77       253



In [54]:
model.fit(X_train_r, y_train_r)

In [55]:
y_pred_slr = model.predict(X_test)

In [56]:
print("Accuracy", accuracy_score(y_test, y_pred_slr))

Accuracy 0.782608695652174


In [57]:
print(classification_report(y_test, y_pred_slr))

              precision    recall  f1-score   support

    NEGATIVE       0.57      0.60      0.58        35
     NEUTRAL       0.54      0.55      0.55        38
    POSITIVE       0.88      0.87      0.87       180

    accuracy                           0.78       253
   macro avg       0.66      0.67      0.67       253
weighted avg       0.79      0.78      0.78       253



In [58]:
SVM.fit(X_train_r, y_train_r)

In [59]:
y_pred_S_smote = SVM.predict(X_test)

In [60]:
print("Accuracy", accuracy_score(y_test, y_pred_S_smote))

Accuracy 0.7786561264822134


In [61]:
print(classification_report(y_test, y_pred_S_smote))

              precision    recall  f1-score   support

    NEGATIVE       0.57      0.60      0.58        35
     NEUTRAL       0.52      0.63      0.57        38
    POSITIVE       0.89      0.84      0.87       180

    accuracy                           0.78       253
   macro avg       0.66      0.69      0.67       253
weighted avg       0.79      0.78      0.78       253



In [62]:
from imblearn.under_sampling import RandomUnderSampler

In [63]:
undersampler = RandomUnderSampler()

In [64]:
X_us, y_us = undersampler.fit_resample(X,y)

In [65]:
X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(X_us, y_us, test_size=0.20, random_state=42)

In [66]:
RF_model.fit(X_train_n, y_train_n)

In [67]:
y_pred_u_rf = RF_model.predict(X_test_n)

In [68]:
print("Accuracy", accuracy_score(y_test_n, y_pred_u_rf))

Accuracy 0.6633663366336634


In [69]:
print(classification_report(y_test_n, y_pred_u_rf))

              precision    recall  f1-score   support

    NEGATIVE       0.61      0.74      0.67        34
     NEUTRAL       0.58      0.73      0.65        30
    POSITIVE       0.91      0.54      0.68        37

    accuracy                           0.66       101
   macro avg       0.70      0.67      0.66       101
weighted avg       0.71      0.66      0.66       101



## LOGISTIC REGRESSION AFTER UNDER SAMPLING

In [70]:
model.fit(X_train_n, y_train_n)

In [71]:
y_pred_lr_ur = model.predict(X_test_n)

In [72]:
print("Accuracy", accuracy_score(y_test_n, y_pred_lr_ur))

Accuracy 0.7524752475247525


In [73]:
print(classification_report(y_test_n, y_pred_lr_ur))

              precision    recall  f1-score   support

    NEGATIVE       0.72      0.76      0.74        34
     NEUTRAL       0.68      0.83      0.75        30
    POSITIVE       0.89      0.68      0.77        37

    accuracy                           0.75       101
   macro avg       0.76      0.76      0.75       101
weighted avg       0.77      0.75      0.75       101



In [74]:
SVM.fit(X_train_n, y_train_n)

In [75]:
y_pred_s_ur = SVM.predict(X_test_n)

In [76]:
print("Accuracy", accuracy_score(y_test_n, y_pred_s_ur))

Accuracy 0.7128712871287128


In [77]:
print(classification_report(y_test_n, y_pred_s_ur))

              precision    recall  f1-score   support

    NEGATIVE       0.69      0.65      0.67        34
     NEUTRAL       0.59      0.87      0.70        30
    POSITIVE       0.96      0.65      0.77        37

    accuracy                           0.71       101
   macro avg       0.75      0.72      0.71       101
weighted avg       0.76      0.71      0.72       101



## HYPER PARAMETER TUNING FOR SVM

In [78]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [79]:
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC())
])

In [80]:
param_grid = {
    "svm__C": [0.1,1,10,100],
    "svm__gamma": ["scale","auto"] + [0.01, 0.1,1],
    "svm__kernel": ["linear", "rbf", "poly"]
}

In [81]:
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring="f1")

In [82]:
grid_search.fit(X_train_n, y_train_n)

In [83]:
y_pred_gs = grid_search.predict(X_test_n)

In [84]:
print("Accuracy", accuracy_score(y_test_n, y_pred_gs))

Accuracy 0.7227722772277227


In [85]:
print(classification_report(y_test_n, y_pred_gs))

              precision    recall  f1-score   support

    NEGATIVE       0.72      0.76      0.74        34
     NEUTRAL       0.69      0.73      0.71        30
    POSITIVE       0.76      0.68      0.71        37

    accuracy                           0.72       101
   macro avg       0.72      0.72      0.72       101
weighted avg       0.72      0.72      0.72       101



In [86]:
grid_search.fit(X_train, y_train)

In [87]:
y_pred_gn = grid_search.predict(X_test)

In [88]:
print("Accuracy", accuracy_score(y_test, y_pred_gn))

Accuracy 0.7549407114624506


In [89]:
print(classification_report(y_test, y_pred_gn))

              precision    recall  f1-score   support

    NEGATIVE       0.55      0.63      0.59        35
     NEUTRAL       0.50      0.47      0.49        38
    POSITIVE       0.85      0.84      0.85       180

    accuracy                           0.75       253
   macro avg       0.63      0.65      0.64       253
weighted avg       0.76      0.75      0.76       253



## RANDOM UNDERSAMPLING, SMOTE, GRID searchCV ALONG WITH SVM MODEL

In [90]:
X_train_ns, y_train_ns = smote.fit_resample(X_train_n, y_train_n)

In [91]:
grid_search.fit(X_train_ns, y_train_ns)

In [92]:
y_pred_ns = grid_search.predict(X_test)

In [93]:
print("Accuracy", accuracy_score(y_test, y_pred_ns))

Accuracy 0.7984189723320159


In [94]:
print(classification_report(y_test, y_pred_ns))

              precision    recall  f1-score   support

    NEGATIVE       0.60      1.00      0.75        35
     NEUTRAL       0.57      0.95      0.71        38
    POSITIVE       0.99      0.73      0.84       180

    accuracy                           0.80       253
   macro avg       0.72      0.89      0.77       253
weighted avg       0.88      0.80      0.81       253



##  RANDOM UNDERSAMPLING, SMOTE ALONG WITH LOGISTIC REGRESSION MODEL

In [95]:
model.fit(X_train_ns, y_train_ns)

In [96]:
y_pred_Lns = model.predict(X_test)

In [97]:
print("Accuracy", accuracy_score(y_test, y_pred_Lns))

Accuracy 0.8102766798418972


In [98]:
print(classification_report(y_test, y_pred_Lns))

              precision    recall  f1-score   support

    NEGATIVE       0.66      1.00      0.80        35
     NEUTRAL       0.56      1.00      0.72        38
    POSITIVE       1.00      0.73      0.85       180

    accuracy                           0.81       253
   macro avg       0.74      0.91      0.79       253
weighted avg       0.89      0.81      0.82       253



##  RANDOM UNDERSAMPLING, SMOTE ALONG WITH RANDOM FOREST MODEL

In [99]:
RF_model.fit(X_train_ns, y_train_ns)

In [100]:
y_pred_rns = RF_model.predict(X_test)

In [101]:
print("Accuracy", accuracy_score(y_test, y_pred_rns))

Accuracy 0.758893280632411


In [102]:
print(classification_report(y_test, y_pred_rns))

              precision    recall  f1-score   support

    NEGATIVE       0.60      1.00      0.75        35
     NEUTRAL       0.49      0.95      0.65        38
    POSITIVE       0.99      0.67      0.80       180

    accuracy                           0.76       253
   macro avg       0.70      0.87      0.73       253
weighted avg       0.86      0.76      0.77       253

