In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [11]:
# Read the dataset and discard every row that is missing either the comment
# text or its sentiment label.
df = pd.read_csv("dataset.csv").dropna(subset=['Comment', 'Sentiment'])

# Features are the raw comment strings; targets are the sentiment codes.
X, y = df['Comment'], df['Sentiment']  # 0 = Negative, 1 = Neutral, 2 = Positive

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vectorizer (English stop words removed, at most 5000 features)
# on the training comments only, then reuse the fitted vectorizer to transform
# the held-out comments.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [12]:
# Part 2: Train Logistic Regression and Evaluate

# Sentiment codes and their display names, in matching order
# (0 = Negative, 1 = Neutral, 2 = Positive, as encoded in the 'Sentiment' column).
class_labels = [0, 1, 2]
class_names = ["Negative", "Neutral", "Positive"]

# Initialize Logistic Regression model.
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5 (it raises a FutureWarning), and with the lbfgs solver a
# multinomial model is already what gets fitted for multiclass targets
# (default behaviour since scikit-learn 0.22).
lr_model = LogisticRegression(max_iter=1000, solver='lbfgs')

# Train the model on the TF-IDF features of the training comments
lr_model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = lr_model.predict(X_test_tfidf)

# Evaluate performance.
# `labels` is passed explicitly so each display name is tied to its sentiment
# code; without it, names are matched positionally to whichever classes happen
# to appear in y_test / y_pred, which mislabels rows or fails if one is absent.
print("Classification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=class_names,
    )
)

# Print overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")




Classification Report:
              precision    recall  f1-score   support

    Negative       0.65      0.55      0.59       381
     Neutral       0.62      0.64      0.63       398
    Positive       0.74      0.80      0.77       614

    accuracy                           0.68      1393
   macro avg       0.67      0.66      0.66      1393
weighted avg       0.68      0.68      0.68      1393

Accuracy: 0.6827
