In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [None]:
# Load the appliance reviews. The file is read as JSON Lines
# (lines=True: one JSON record per line), then the first rows are
# printed as a quick sanity check.
reviews_df = pd.read_json(path_or_buf="mini-appliances.jsonl", lines=True)
print(reviews_df.head(n=5))

In [None]:
# Derive a three-way sentiment label from the star rating:
#   rating <= 2 -> 'negative', rating == 3 -> 'neutral', rating >= 4 -> 'positive'.
# `sentiment` stays a module-level list because its order is reused further
# down when the confusion matrix is built and labelled.
sentiment = ['negative', 'neutral', 'positive']

# One boolean mask per entry of `sentiment`, in the same order.
conditions = [
    reviews_df['rating'].le(2),
    reviews_df['rating'].eq(3),
    reviews_df['rating'].ge(4),
]

# Rows matching none of the masks (e.g. a missing rating) fall back to 'unknown'.
reviews_df['sentiment_label'] = np.select(
    condlist=conditions,
    choicelist=sentiment,
    default='unknown',
)

In [None]:
# Bar chart of how many reviews fall into each sentiment class.
# The Axes returned by pandas is configured in one call instead of going
# through the pyplot state machine.
(
    reviews_df['sentiment_label']
    .value_counts()
    .plot.bar()
    .set(
        title="Sentiment Distribution",
        xlabel="Sentiment",
        ylabel="Number of Reviews",
    )
)
plt.show()

In [None]:
# Preprocess text
#
# Stage order matters:
#   1. lowercase            - NLTK's English stopword list is lowercase, so
#                             "The"/"This" only match after lowercasing
#   2. drop stopwords       - while punctuation is still present, so
#                             contractions such as "don't" can match
#   3. strip punctuation
#   4. drop stopwords again - catches words that were glued to punctuation
#                             ("the," -> "the")
#   5. lemmatize            - on bare words, so "cats." -> "cats" -> "cat"
# Missing titles/texts are treated as empty strings instead of raising.

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Everything that is not an ASCII letter, a digit or whitespace is removed.
punctuation_pattern = r'[^a-zA-Z0-9\s]'


def _drop_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in `stop_words`."""
    return ' '.join(token for token in text.split() if token not in stop_words)


def _lemmatize_tokens(text):
    """Return `text` with every whitespace-separated token passed through the lemmatizer."""
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())


def _clean_text_column(column):
    """Clean one text column of the reviews frame.

    Parameters
    ----------
    column : pandas.Series
        Raw review strings; missing values are allowed.

    Returns
    -------
    pandas.Series
        Lowercased, stopword-free, punctuation-free, lemmatized strings with
        tokens separated by single spaces. Missing inputs become ''.
    """
    lowered = column.fillna('').astype(str).str.lower()
    without_stopwords = lowered.apply(_drop_stopwords)
    without_punctuation = without_stopwords.str.replace(punctuation_pattern, '', regex=True)
    # Second stopword pass: stripping punctuation can expose new stopwords.
    return without_punctuation.apply(_drop_stopwords).apply(_lemmatize_tokens)


# The raw columns are overwritten with their cleaned versions, as before.
reviews_df['title'] = _clean_text_column(reviews_df['title'])
reviews_df['text'] = _clean_text_column(reviews_df['text'])

# Combine title and text
reviews_df['cleaned_text'] = reviews_df['title'] + ' ' + reviews_df['text']
print(reviews_df[['cleaned_text', 'sentiment_label']].head())

In [None]:
# Target labels for the classifier.
y = reviews_df['sentiment_label']

# TF-IDF features over unigrams and bigrams; min_df=0.01 discards terms that
# appear in fewer than 1% of the documents.
# NOTE(review): the vectorizer is fitted on every row, before any train/test
# split takes place, so the vocabulary and IDF weights also reflect rows that
# later land in the test set - confirm this is acceptable for the reported scores.
vectorizer = TfidfVectorizer(min_df=0.01, ngram_range=(1, 2))
X = vectorizer.fit_transform(reviews_df['cleaned_text'])

# Peek at the first 20 learned feature names.
print(vectorizer.get_feature_names_out()[:20])

In [None]:
# Class balance before resampling.
print(f"Original distribution: \n{y.value_counts()}")

# Randomly undersample (seeded, so the selection is reproducible) and show
# the class balance again.
# NOTE(review): resampling is applied to the full feature matrix, i.e. before
# the train/test split, so the test set drawn afterwards is resampled too -
# confirm that is intended.
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X, y)

print(f"\nDistribution after undersampling: \n{y_res.value_counts()}")

In [None]:
# Hold out 20% of the resampled data for testing; stratifying on the labels
# keeps the class proportions the same in both parts.
X_trn, X_tst, y_trn, y_tst = train_test_split(
    X_res,
    y_res,
    test_size=0.2,
    stratify=y_res,
    random_state=42,
)

# 5-fold cross-validated grid search over the Naive Bayes smoothing
# parameter alpha, selecting by accuracy and using all available cores.
nb = MultinomialNB()
param_grid = {'alpha': [0.1, 1, 10]}
grid_search = GridSearchCV(
    estimator=nb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_trn, y_trn)

# Score the best estimator found by the search on the held-out split.
best_nb = grid_search.best_estimator_
y_pred = best_nb.predict(X_tst)
accuracy = np.mean(y_pred == y_tst)

# Headline metric, then the per-class breakdown.
print(f"Accuracy: {accuracy}")
print(f"Classification Report: \n{classification_report(y_tst, y_pred)}")

# Confusion matrix with rows and columns ordered like the `sentiment` list.
cm = confusion_matrix(y_tst, y_pred, labels=sentiment)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=sentiment)
disp.plot()
plt.show()