In [1]:
#Import Libraries
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load Dataset
# The rest of this script reads two columns from the CSV:
# - review (free text)
# - rating (numeric; turned into a sentiment label further down)
REVIEWS_CSV = "customer_reviews.csv"

data = pd.read_csv(REVIEWS_CSV)
df = pd.DataFrame(data)

# Quick visual check of the first rows (only rendered in a notebook).
df.head()

# Text Preprocessing
# Built once at import time so they are not rebuilt for every review.
_DIGITS_RE = re.compile(r'\d+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise a single review for vectorisation.

    The text is lower-cased, digit runs and ASCII punctuation are removed,
    and surrounding whitespace is stripped.

    Args:
        text: The raw review. Non-string values (for example ``None`` or the
            ``NaN`` pandas uses for empty cells) are treated as an empty
            review instead of raising ``AttributeError``.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        # Numeric input would reduce to "" anyway once digits and punctuation
        # are stripped; returning early also keeps NaN from becoming "nan".
        return ""

    text = text.lower()
    text = _DIGITS_RE.sub('', text)
    text = text.translate(_PUNCT_TABLE)
    return text.strip()

# Store the normalised text in its own column, next to the raw review.
raw_reviews = df['review']
df['clean_review'] = raw_reviews.apply(clean_text)

df.head()

# Labelling Sentiment
def label_sentiment(rating):
    """Map a numeric rating to a sentiment label.

    Args:
        rating: The rating value to classify.

    Returns:
        ``1`` (positive) when the rating is 4 or more, ``0`` (negative) when
        it is 2 or less, and ``None`` (neutral) for anything in between.
    """
    if rating >= 4:
        return 1  # Positive

    if rating <= 2:
        return 0  # Negative

    return None  # Neutral

# Use the three-way rule from label_sentiment so that neutral ratings come
# back as missing values and are removed by dropna. A bare ">= 4" test would
# file neutral reviews under the negative class and leave dropna with
# nothing to drop.
df['sentiment'] = df['rating'].apply(label_sentiment)

# With the unlabelled rows gone, store the labels as plain ints (0/1).
df = df.dropna(subset=['sentiment']).astype({'sentiment': int})

# Train Test Split
# Features are the cleaned review text; the target is the sentiment label.
X, y = df['clean_review'], df['sentiment']

# 60/40 split with a fixed seed so the partition is reproducible.
# NOTE(review): the split is not stratified, so class proportions in the
# train and test parts may differ on a small dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# TF-IDF Vectorizer
# English stop words are removed and the vocabulary is capped at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit on the training reviews only; the test reviews are transformed with
# that fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Logistic Regression Model
# Default hyper-parameters; fit() returns the fitted estimator itself.
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Model Evaluation
y_pred = model.predict(X_test_tfidf)

# Pin the label set so the report and the matrix always list both classes
# (fixed 2x2 layout), even when a small test split holds only one of them.
class_labels = [0, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report:\n",
    # zero_division=0 reports 0 instead of warning for a class with no samples.
    classification_report(
        y_test, y_pred, labels=class_labels, zero_division=0
    ),
)
print(
    "\nConfusion Matrix:\n",
    confusion_matrix(y_test, y_pred, labels=class_labels),
)


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         3

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4


Confusion Matrix:
 [[1 0]
 [0 3]]


## Model Evaluation Analysis

The model achieved 100% accuracy on the test set. However, this result is influenced by the small size and class imbalance of the dataset after sentiment labeling. The test set contained only four samples (one negative and three positive), so a perfect score on it is not statistically meaningful. While the model performs correctly on the available data, larger and more balanced datasets are required for robust performance evaluation.