In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight


In [2]:
# Loading the dataset
df = pd.read_csv('Musical_instruments_reviews.csv')

# Data preprocessing
df = df.dropna(subset=['reviewText'])

df.rename(columns={'reviewText': 'review', 'overall': 'rating'}, inplace=True)

In [3]:
# Converting ratings to sentiments
def assign_sentiment(rating):
    if rating >= 4:
        return 'Positive'
    elif rating == 3:
        return 'Neutral'
    else:
        return 'Negative'

df['sentiment'] = df['rating'].apply(assign_sentiment)


In [4]:
# Text preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['review'] = df['review'].apply(clean_text)


In [5]:
# Vectorization with TF-IDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)
X = vectorizer.fit_transform(df['review'])
y = df['sentiment']


In [6]:
# now we split data into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

In [7]:
# Train a Logistic Regression model with class weights
model = LogisticRegression(max_iter=500, C=2.0, class_weight=class_weight_dict)
model.fit(X_train, y_train)

In [8]:
#lets make predictions using test dataset
y_pred = model.predict(X_test)


In [9]:
# Display results
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)

Accuracy: 82.40%
Classification Report:
               precision    recall  f1-score   support

    Negative       0.47      0.39      0.43       107
     Neutral       0.18      0.34      0.24       133
    Positive       0.93      0.89      0.91      1811

    accuracy                           0.82      2051
   macro avg       0.53      0.54      0.52      2051
weighted avg       0.86      0.82      0.84      2051



In [11]:
# Function for user input prediction
def predict_sentiment():
    review = input("Enter a review: ")
    review = clean_text(review)
    review_vec = vectorizer.transform([review])
    prediction = model.predict(review_vec)
    print(f"Sentiment: {prediction[0]}")

predict_sentiment()


Enter a review:  this product is the best


Sentiment: Positive
