In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import matplotlib.pyplot as plt

In [4]:
import seaborn as sns

DATASET IMPORT

In [5]:
df = pd.read_excel(r"C:\Users\701540\CONDA\Sentiment_Analysis\customer_review.xlsx")

In [6]:
df.sample(5)

Unnamed: 0,Reviews,sentiment
213,Big shop......\nMost of the products are low p...,NEGATIVE
463,3 Pcs set Sarees Single Bill Arrange.,NEGATIVE
728,sarees collection very nice,POSITIVE
84,Poor quality clothing and high price. Staff sh...,NEGATIVE
1249,Excellent service nice collections,POSITIVE


## DATA PRE PROCESSING

In [7]:
pip install scikit-learn





[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\701540\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [8]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

## STEP 1 : CLEAN THE TEXT(remove punctuations, numbers, convert to lowercase)

In [9]:
def clean_text(review):
    review = re.sub(r"\d+", "", review)   #remove digits
    review = re.sub(r"[^\w\s]", "", review)    #remove punctuation
    review = review.lower()
    return review

In [11]:
df["cleaned"] = df["Reviews"].apply(lambda x: clean_text(str(x)))

In [13]:
df.sample(5)

Unnamed: 0,Reviews,sentiment,cleaned
844,Nice. 39935 Nice Service.Keep it up,POSITIVE,nice nice servicekeep it up
344,Lot of collections...... enjoy your shopping @...,POSITIVE,lot of collections enjoy your shopping one stop
1385,There Is Only KKV Dhoties Sets Available. Here...,NEGATIVE,there is only kkv dhoties sets available here ...
1210,We got above 3500/- rupees.But they didn't pro...,NEGATIVE,we got above rupeesbut they didnt provide ext...
1182,I am very disappointing with chennai silks rew...,NEGATIVE,i am very disappointing with chennai silks rew...


## STEP 2 : REMOVE STOPWORDS & TOKENIZE

In [14]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\701540\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\701540\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [16]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\701540\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
stop_words = set(stopwords.words('english')) - {"not","never","no"}

In [18]:
def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return " ".join(filtered_tokens)

In [19]:
df["cleaned"] = df["cleaned"].apply(remove_stopwords)

In [20]:
df.sample(2)

Unnamed: 0,Reviews,sentiment,cleaned
672,Shirt New Collection are not available.,NEGATIVE,shirt new collection not available
218,"They don’t have much corrections for wedding, ...",NEGATIVE,dont much corrections wedding sarees focus muc...


## FEATURE EXTRACTION

## Word embedding using BERT

In [21]:
from transformers import BertTokenizer, BertModel
import torch

In [22]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

In [23]:
def encode_rev(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

In [24]:
df["bert_embed"] = df["cleaned"].apply(lambda x: encode_rev(x))

In [25]:
df.sample(1)

Unnamed: 0,Reviews,sentiment,cleaned,bert_embed
1058,NOTHING ELSE . GOOD SERVICE MAHESHWARI,POSITIVE,nothing else good service maheshwari,"[[tensor(0.2156), tensor(-0.0568), tensor(-0.0..."


# MODEL BUILDING

## LOGISTIC REGRESSION

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Flatten each (1, 768) embedding to (768,) before creating the DataFrame
bert_embeddings = pd.DataFrame(df["bert_embed"].apply(lambda x: x.flatten()).tolist(), index=df.index)

# Assign bert_embeddings as X if it's the only feature
X = bert_embeddings


In [29]:
X.sample(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
1052,tensor(-0.0064),tensor(-0.0160),tensor(0.1824),tensor(0.2531),tensor(0.0300),tensor(-0.0443),tensor(0.0737),tensor(0.4724),tensor(0.0788),tensor(-0.4352),...,tensor(0.0193),tensor(-0.5650),tensor(-0.2869),tensor(-0.1497),tensor(-0.1012),tensor(0.0140),tensor(-0.2363),tensor(0.0124),tensor(-0.1397),tensor(-0.2501)
1403,tensor(0.1055),tensor(-0.2436),tensor(-0.0967),tensor(0.3602),tensor(0.3086),tensor(-0.4889),tensor(0.3541),tensor(0.3587),tensor(-0.3540),tensor(-0.4377),...,tensor(0.0423),tensor(0.1146),tensor(-0.2668),tensor(0.1562),tensor(0.3495),tensor(-0.4675),tensor(-0.4290),tensor(0.1657),tensor(-0.0946),tensor(-0.0372)


In [31]:
y = df['sentiment']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
model = LogisticRegression()

In [34]:
model.fit(X_train, y_train)

In [35]:
y_pred = model.predict(X_test)

In [36]:
print("Accuracy", accuracy_score(y_test, y_pred))

Accuracy 0.8707482993197279


In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    NEGATIVE       0.86      0.80      0.83       114
    POSITIVE       0.88      0.92      0.90       180

    accuracy                           0.87       294
   macro avg       0.87      0.86      0.86       294
weighted avg       0.87      0.87      0.87       294



In [38]:
import warnings
warnings.filterwarnings("ignore")

## RANDOM FOREST CLASSIFIER

In [39]:
from sklearn.ensemble import RandomForestClassifier


In [40]:
RF_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [41]:
RF_model.fit(X_train,y_train)

In [42]:
y_pred_RF = RF_model.predict(X_test)

In [43]:
print("Accuracy", accuracy_score(y_test, y_pred_RF))

Accuracy 0.8435374149659864


In [44]:
print(classification_report(y_test, y_pred_RF))

              precision    recall  f1-score   support

    NEGATIVE       0.85      0.73      0.78       114
    POSITIVE       0.84      0.92      0.88       180

    accuracy                           0.84       294
   macro avg       0.84      0.82      0.83       294
weighted avg       0.84      0.84      0.84       294



## SUPPORT VECTOR MACHINE

In [45]:
from sklearn.svm import SVC

In [46]:
SVM = SVC()

In [47]:
SVM.fit(X_train, y_train)

In [48]:
y_pred_S = SVM.predict(X_test)

In [49]:
print("Accuracy", accuracy_score(y_test, y_pred_S))

Accuracy 0.8979591836734694


In [50]:
print(classification_report(y_test, y_pred_S))

              precision    recall  f1-score   support

    NEGATIVE       0.89      0.84      0.86       114
    POSITIVE       0.90      0.93      0.92       180

    accuracy                           0.90       294
   macro avg       0.90      0.89      0.89       294
weighted avg       0.90      0.90      0.90       294



## APPLYING SMOTE

In [51]:
from imblearn.over_sampling import SMOTE

In [52]:
smote = SMOTE(random_state=42)

In [53]:
X_train_r, y_train_r = smote.fit_resample(X_train, y_train)

In [54]:
RF_model.fit(X_train_r, y_train_r)

In [55]:
y_pred_smote = RF_model.predict(X_test)

In [56]:
print("Accuracy", accuracy_score(y_test, y_pred_smote))

Accuracy 0.8129251700680272


In [57]:
print(classification_report(y_test, y_pred_smote))

              precision    recall  f1-score   support

    NEGATIVE       0.75      0.77      0.76       114
    POSITIVE       0.85      0.84      0.85       180

    accuracy                           0.81       294
   macro avg       0.80      0.81      0.80       294
weighted avg       0.81      0.81      0.81       294



In [58]:
model.fit(X_train_r, y_train_r)

In [59]:
y_pred_slr = model.predict(X_test)

In [60]:
print("Accuracy", accuracy_score(y_test, y_pred_slr))

Accuracy 0.8707482993197279


In [61]:
print(classification_report(y_test, y_pred_slr))

              precision    recall  f1-score   support

    NEGATIVE       0.83      0.84      0.83       114
    POSITIVE       0.90      0.89      0.89       180

    accuracy                           0.87       294
   macro avg       0.86      0.87      0.86       294
weighted avg       0.87      0.87      0.87       294



In [62]:
SVM.fit(X_train_r, y_train_r)

In [63]:
y_pred_S_smote = SVM.predict(X_test)

In [64]:
print("Accuracy", accuracy_score(y_test, y_pred_S_smote))

Accuracy 0.8775510204081632


In [65]:
print(classification_report(y_test, y_pred_S_smote))

              precision    recall  f1-score   support

    NEGATIVE       0.82      0.87      0.85       114
    POSITIVE       0.91      0.88      0.90       180

    accuracy                           0.88       294
   macro avg       0.87      0.88      0.87       294
weighted avg       0.88      0.88      0.88       294



## UNDER SAMPLING

In [66]:
from imblearn.under_sampling import RandomUnderSampler

In [67]:
undersampler = RandomUnderSampler()

In [68]:
X_us, y_us = undersampler.fit_resample(X,y)

In [69]:
X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(X_us, y_us, test_size=0.20, random_state=42)

In [70]:
RF_model.fit(X_train_n, y_train_n)

In [71]:
y_pred_u_rf = RF_model.predict(X_test_n)

In [72]:
print("Accuracy", accuracy_score(y_test_n, y_pred_u_rf))

Accuracy 0.8632075471698113


In [73]:
print(classification_report(y_test_n, y_pred_u_rf))

              precision    recall  f1-score   support

    NEGATIVE       0.82      0.95      0.88       114
    POSITIVE       0.93      0.77      0.84        98

    accuracy                           0.86       212
   macro avg       0.88      0.86      0.86       212
weighted avg       0.87      0.86      0.86       212



## LOGISTIC REGRESSION AFTER UNDER SAMPLING

In [74]:
model.fit(X_train_n, y_train_n)

In [75]:
y_pred_lr_ur = model.predict(X_test_n)

In [76]:
print("Accuracy", accuracy_score(y_test_n, y_pred_lr_ur))

Accuracy 0.8537735849056604


In [77]:
print(classification_report(y_test_n, y_pred_lr_ur))

              precision    recall  f1-score   support

    NEGATIVE       0.84      0.90      0.87       114
    POSITIVE       0.88      0.80      0.83        98

    accuracy                           0.85       212
   macro avg       0.86      0.85      0.85       212
weighted avg       0.86      0.85      0.85       212



In [78]:
SVM.fit(X_train_n, y_train_n)

In [79]:
y_pred_s_ur = SVM.predict(X_test_n)

In [80]:
print("Accuracy", accuracy_score(y_test_n, y_pred_s_ur))

Accuracy 0.8679245283018868


In [81]:
print(classification_report(y_test_n, y_pred_s_ur))

              precision    recall  f1-score   support

    NEGATIVE       0.83      0.95      0.89       114
    POSITIVE       0.93      0.78      0.84        98

    accuracy                           0.87       212
   macro avg       0.88      0.86      0.86       212
weighted avg       0.88      0.87      0.87       212



## HYPER PARAMETER TUNING FOR SVM

In [82]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [83]:
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC())
])

In [84]:
param_grid = {
    "svm__C": [0.1,1,10,100],
    "svm__gamma": ["scale","auto"] + [0.01, 0.1,1],
    "svm__kernel": ["linear", "rbf", "poly"]
}

In [85]:
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring="f1")

In [86]:
grid_search.fit(X_train_n, y_train_n)

In [87]:
y_pred_gs = grid_search.predict(X_test_n)

In [88]:
print("Accuracy", accuracy_score(y_test_n, y_pred_gs))

Accuracy 0.8443396226415094


In [89]:
print(classification_report(y_test_n, y_pred_gs))

              precision    recall  f1-score   support

    NEGATIVE       0.85      0.86      0.86       114
    POSITIVE       0.84      0.83      0.83        98

    accuracy                           0.84       212
   macro avg       0.84      0.84      0.84       212
weighted avg       0.84      0.84      0.84       212



In [90]:
grid_search.fit(X_train, y_train)

In [91]:
y_pred_gn = grid_search.predict(X_test)

In [92]:
print("Accuracy", accuracy_score(y_test, y_pred_gn))

Accuracy 0.8299319727891157


In [93]:
print(classification_report(y_test, y_pred_gn))

              precision    recall  f1-score   support

    NEGATIVE       0.79      0.77      0.78       114
    POSITIVE       0.86      0.87      0.86       180

    accuracy                           0.83       294
   macro avg       0.82      0.82      0.82       294
weighted avg       0.83      0.83      0.83       294



## RANDOM UNDERSAMPLING, SMOTE, GRID searchCV ALONG WITH SVM MODEL

In [94]:
X_train_ns, y_train_ns = smote.fit_resample(X_train_n, y_train_n)

In [95]:
grid_search.fit(X_train_ns, y_train_ns)

In [96]:
y_pred_ns = grid_search.predict(X_test)

In [97]:
print("Accuracy", accuracy_score(y_test, y_pred_ns))

Accuracy 0.9319727891156463


In [98]:
print(classification_report(y_test, y_pred_ns))

              precision    recall  f1-score   support

    NEGATIVE       0.87      0.97      0.92       114
    POSITIVE       0.98      0.91      0.94       180

    accuracy                           0.93       294
   macro avg       0.92      0.94      0.93       294
weighted avg       0.94      0.93      0.93       294



##  RANDOM UNDERSAMPLING, SMOTE ALONG WITH LOGISTIC REGRESSION MODEL

In [99]:
model.fit(X_train_ns, y_train_ns)

In [100]:
y_pred_Lns = model.predict(X_test)

In [101]:
print("Accuracy", accuracy_score(y_test, y_pred_Lns))

Accuracy 0.9421768707482994


In [102]:
print(classification_report(y_test, y_pred_Lns))

              precision    recall  f1-score   support

    NEGATIVE       0.89      0.97      0.93       114
    POSITIVE       0.98      0.92      0.95       180

    accuracy                           0.94       294
   macro avg       0.94      0.95      0.94       294
weighted avg       0.95      0.94      0.94       294



##  RANDOM UNDERSAMPLING, SMOTE ALONG WITH RANDOM FOREST MODEL

In [103]:
RF_model.fit(X_train_ns, y_train_ns)

In [104]:
y_pred_rns = RF_model.predict(X_test)

In [105]:
print("Accuracy", accuracy_score(y_test, y_pred_rns))

Accuracy 0.9183673469387755


In [106]:
print(classification_report(y_test, y_pred_rns))

              precision    recall  f1-score   support

    NEGATIVE       0.84      0.98      0.90       114
    POSITIVE       0.99      0.88      0.93       180

    accuracy                           0.92       294
   macro avg       0.91      0.93      0.92       294
weighted avg       0.93      0.92      0.92       294

