In [1]:
# ------------------------------
# 1️⃣ Import required libraries
# ------------------------------
import pandas as pd
import numpy as np
import re
import nltk

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# ------------------------------
# 2️⃣ Download required NLTK datasets
# ------------------------------
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# ------------------------------
# 3️⃣ Load dataset
# ------------------------------
# Replace the file path with your actual CSV path
df = pd.read_csv(r"C:\Users\Ramya\Downloads\Reviews.csv.zip")  # Example: reviews of products

# Keep only important columns
df = df[['Text', 'Score']]

# ------------------------------
# 4️⃣ Convert score into sentiment labels
# ------------------------------
def score_to_sentiment(score):
    if score <= 2:
        return 'negative'
    elif score == 3:
        return 'neutral'
    else:
        return 'positive'

df['Sentiment'] = df['Score'].apply(score_to_sentiment)

# ------------------------------
# 5️⃣ Clean text data
# ------------------------------
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()                              # lowercase
    text = re.sub(r'[^a-z\s]', '', text)             # remove special chars & numbers
    words = nltk.word_tokenize(text)                 # tokenize
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]  # remove stopwords & lemmatize
    return ' '.join(words)

df['Cleaned_Text'] = df['Text'].apply(clean_text)

# ------------------------------
# 6️⃣ Encode Sentiment Labels to Numbers
# ------------------------------
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Sentiment'])

# ------------------------------
# 7️⃣ Display sample data
# ------------------------------
print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ramya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ramya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ramya\AppData\Roaming\nltk_data...


FileNotFoundError: [Errno 2] No such file or directory: 'Reviews.csv'

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob, Word
from sklearn import model_selection, preprocessing, linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings

# Ignore warnings
warnings.filterwarnings("ignore")

# Download NLTK resources
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# Load the CSV from your path
df = pd.read_csv(r"C:\Users\Ramya\Downloads\Reviews.csv.zip")

# Keep only the columns we need
df = df[['Text', 'Score']]

# Remove rows with missing values
df.dropna(inplace=True)

# Convert Score to Sentiment
def score_to_sentiment(score):
    if score in [4, 5]:
        return 'positive'
    elif score in [1, 2]:
        return 'negative'
    else:
        return None  # Ignore 3-star reviews

df['Sentiment'] = df['Score'].apply(score_to_sentiment)

# Drop rows with None sentiment (Score = 3)
df = df[df['Sentiment'].notnull()]

# Reset index
df.reset_index(drop=True, inplace=True)

# Check first few rows
print(df.head())

# Check class balance
print("\nSentiment Counts:")
print(df['Sentiment'].value_counts())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ramya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ramya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ramya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                Text  Score Sentiment
0  I have bought several of the Vitality canned d...      5  positive
1  Product arrived labeled as Jumbo Salted Peanut...      1  negative
2  This is a confection that has been around a fe...      4  positive
3  If you are looking for the secret ingredient i...      2  negative
4  Great taffy at a great price.  There was a wid...      5  positive

Sentiment Counts:
Sentiment
positive    443777
negative     82037
Name: count, dtype: int64


In [3]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# 1️⃣ Keep only the 'Text' and 'Score' columns
df = df[['Text', 'Score']]

# 2️⃣ Remove missing values
df.dropna(inplace=True)

# 3️⃣ Convert to lowercase
df['Text'] = df['Text'].str.lower()

# 4️⃣ Remove punctuation and numbers
df['Text'] = df['Text'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))

# 5️⃣ Remove stopwords
stop_words = set(stopwords.words('english'))
df['Text'] = df['Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# 6️⃣ Lemmatization
lemmatizer = WordNetLemmatizer()
df['Text'] = df['Text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# 7️⃣ Check the cleaned text
print(df.head())


                                                Text  Score
0  bought several vitality canned dog food produc...      5
1  product arrived labeled jumbo salted peanutsth...      1
2  confection around century light pillowy citrus...      4
3  looking secret ingredient robitussin believe f...      2
4  great taffy great price wide assortment yummy ...      5


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 1️⃣ Create TF-IDF vectorizer (limit to top 5000 words for efficiency)
vectorizer = TfidfVectorizer(max_features=5000)

# 2️⃣ Fit on the cleaned text and transform into a feature matrix
X = vectorizer.fit_transform(df['Text'])

# 3️⃣ Labels (target variable)
y = df['Score']

# 4️⃣ Check the shape of the feature matrix
print("Feature matrix shape:", X.shape)  # (rows = reviews, columns = words)

# 5️⃣ See first 5 words in vocabulary
print("Sample vocabulary words:", vectorizer.get_feature_names_out()[:5])

# 6️⃣ Show the first review as a TF-IDF vector
print("First review vector:\n", X[0].toarray())


Feature matrix shape: (525814, 5000)
Sample vocabulary words: ['ability' 'able' 'abr' 'absolute' 'absolutely']
First review vector:
 [[0. 0. 0. ... 0. 0. 0.]]


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1️⃣ Split into train & test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 2️⃣ Train Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# 3️⃣ Predict on test set
y_pred = nb_model.predict(X_test)

# 4️⃣ Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 5️⃣ Try on a new review
new_review = ["This coffee tastes amazing and fresh!"]
new_review_tfidf = vectorizer.transform(new_review)
print("\nPredicted Score for new review:", nb_model.predict(new_review_tfidf)[0])


Accuracy: 0.7244087749493643

Classification Report:
               precision    recall  f1-score   support

           1       0.77      0.32      0.45     10447
           2       0.59      0.01      0.02      5932
           4       0.63      0.03      0.06     16127
           5       0.72      0.99      0.84     72657

    accuracy                           0.72    105163
   macro avg       0.68      0.34      0.34    105163
weighted avg       0.71      0.72      0.63    105163


Confusion Matrix:
 [[ 3361    38    34  7014]
 [  582    58   113  5179]
 [  171     1   517 15438]
 [  249     1   162 72245]]

Predicted Score for new review: 5
