In [1]:
# sentiment_analysis.py

import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# ----------------------
# 1. Load Dataset
# ----------------------

df = pd.read_csv('customer_reviews.csv')  # Ensure this file is in the same folder
print("Sample Data:")
print(df.head())

# ----------------------
# 2. Preprocessing
# ----------------------

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = nltk.word_tokenize(text)
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words and w.isalpha()]
    return ' '.join(words)

df['Clean_Review'] = df['Review'].apply(clean_text)

# ----------------------
# 3. Feature Extraction
# ----------------------

tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['Clean_Review']).toarray()
y = df['Sentiment']

# ----------------------
# 4. Train-Test Split
# ----------------------

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ----------------------
# 5. Train Models
# ----------------------

log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
log_preds = log_model.predict(X_test)

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)

# ----------------------
# 6. Evaluation
# ----------------------

def evaluate_model(name, y_true, y_pred):
    print(f"\n--- {name} ---")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred, average='weighted'))
    print("Classification Report:\n", classification_report(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(4, 4))
    plt.title(f'{name} Confusion Matrix')
    plt.imshow(cm, cmap='Blues', interpolation='nearest')
    plt.colorbar()
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks(ticks=[0,1,2], labels=['Negative', 'Neutral', 'Positive'])
    plt.yticks(ticks=[0,1,2], labels=['Negative', 'Neutral', 'Positive'])
    plt.show()

evaluate_model("Logistic Regression", y_test, log_preds)
evaluate_model("Naive Bayes", y_test, nb_preds)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91968\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91968\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91968\AppData\Roaming\nltk_data...


Sample Data:
                                 Review Sentiment
0                   I love this product  positive
1                 Worst experience ever  negative
2            It was fine, nothing great   neutral
3  Absolutely fantastic! I’ll buy again  positive
4                   Not worth the money  negative


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\91968/nltk_data'
    - 'c:\\Users\\91968\\anaconda3\\nltk_data'
    - 'c:\\Users\\91968\\anaconda3\\share\\nltk_data'
    - 'c:\\Users\\91968\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\91968\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
