In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
%matplotlib inline

In [53]:
#Read the dataset 
df = pd.read_csv('D:\\Git\\LLM-Sentiment-Analysis\\data\\reviews.csv')
df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,sortOrder,appId
0,gp:AOqpTOEhZuqSqqWnaKRgv-9ABYdajFUB0WugPGh-SG-...,Eric Tie,https://play-lh.googleusercontent.com/a-/AOh14...,I cannot open the app anymore,1,0,5.4.0.6,2020-10-27 21:24:41,,,newest,com.anydo
1,gp:AOqpTOH0WP4IQKBZ2LrdNmFy_YmpPCVrV3diEU9KGm3...,john alpha,https://play-lh.googleusercontent.com/a-/AOh14...,I have been begging for a refund from this app...,1,0,,2020-10-27 14:03:28,"Please note that from checking our records, yo...",2020-10-27 15:05:52,newest,com.anydo
2,gp:AOqpTOEMCkJB8Iq1p-r9dPwnSYadA5BkPWTf32Z1azu...,Sudhakar .S,https://play-lh.googleusercontent.com/a-/AOh14...,Very costly for the premium version (approx In...,1,0,,2020-10-27 08:18:40,,,newest,com.anydo
3,gp:AOqpTOGFrUWuKGycpje8kszj3uwHN6tU_fd4gLVFy9z...,SKGflorida@bellsouth.net DAVID S,https://play-lh.googleusercontent.com/-75aK0WF...,"Used to keep me organized, but all the 2020 UP...",1,0,,2020-10-26 13:28:07,What do you find troublesome about the update?...,2020-10-26 14:58:29,newest,com.anydo
4,gp:AOqpTOHls7DW8wmDFzTkHwxuqFkdNQtKHmO6Pt9jhZE...,Louann Stoker,https://play-lh.googleusercontent.com/-pBcY_Z-...,Dan Birthday Oct 28,1,0,5.6.0.7,2020-10-26 06:10:50,,,newest,com.anydo


In [54]:
# Step 1: Drop irrelevant columns
df_clean = df[['content', 'score']].dropna()

# Step 2: Create sentiment labels
def label_sentiment(score):
    if score <= 2:
        return 'negative'
    elif score == 3:
        return 'neutral'
    else:
        return 'positive'  

df_clean['sentiment'] = df_clean['score'].apply(label_sentiment)
df = pd.DataFrame(data=df_clean,columns=['content','sentiment'])
df

Unnamed: 0,content,sentiment
0,I cannot open the app anymore,negative
1,I have been begging for a refund from this app...,negative
2,Very costly for the premium version (approx In...,negative
3,"Used to keep me organized, but all the 2020 UP...",negative
4,Dan Birthday Oct 28,negative
...,...,...
12490,"I really like the planner, it helps me achieve...",positive
12491,😁****😁,positive
12492,Very useful apps. You must try it,positive
12493,Would pay for this if there were even more add...,positive


In [None]:
# Step 3: Remove punctualtion
string.punctuation
df['nopunc'] = df['content'].apply(lambda x: [a for a in x if a not in string.punctuation])
df['nopunc'] = df['nopunc'].apply(''.join)
df

Unnamed: 0,content,sentiment,nopunc
0,I cannot open the app anymore,negative,I cannot open the app anymore
1,I have been begging for a refund from this app...,negative,I have been begging for a refund from this app...
2,Very costly for the premium version (approx In...,negative,Very costly for the premium version approx Ind...
3,"Used to keep me organized, but all the 2020 UP...",negative,Used to keep me organized but all the 2020 UPD...
4,Dan Birthday Oct 28,negative,Dan Birthday Oct 28
...,...,...,...
12490,"I really like the planner, it helps me achieve...",positive,I really like the planner it helps me achieve ...
12491,😁****😁,positive,😁😁
12492,Very useful apps. You must try it,positive,Very useful apps You must try it
12493,Would pay for this if there were even more add...,positive,Would pay for this if there were even more add...


In [None]:
# Step 4: Vectorize text
X=df['nopunc']
y=df['sentiment']
cv = CountVectorizer()
X= cv.fit_transform(X)

In [57]:
#Step 5: Train test split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=101)

### Naive Bayes

In [58]:
# Step 6: Training models 
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Linear SVM': LinearSVC()
}

# Step 7: Evaluation
for name, model in models.items():
    print(f"\n====== {name} ======")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))


Accuracy: 0.7171
              precision    recall  f1-score   support

    negative       0.69      0.84      0.76       957
     neutral       0.27      0.06      0.10       409
    positive       0.78      0.85      0.81      1133

    accuracy                           0.72      2499
   macro avg       0.58      0.58      0.56      2499
weighted avg       0.66      0.72      0.67      2499


Accuracy: 0.7071
              precision    recall  f1-score   support

    negative       0.72      0.76      0.74       957
     neutral       0.32      0.19      0.23       409
    positive       0.77      0.85      0.81      1133

    accuracy                           0.71      2499
   macro avg       0.60      0.60      0.59      2499
weighted avg       0.68      0.71      0.69      2499


Accuracy: 0.6855
              precision    recall  f1-score   support

    negative       0.71      0.72      0.71       957
     neutral       0.28      0.18      0.22       409
    positive       0.

