In [1]:
# Importing necessary libraries for data manipulation and analysis
import pandas as pd # For dataframes and data manipulation
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  # For data visualization
import seaborn as sns # For enhanced data visualization

# Importing Natural Language Toolkit (NLTK) components for text processing
from nltk.corpus import stopwords # For common stopwords
from nltk.tokenize import word_tokenize # For splitting text into words (tokens)
from nltk.stem import WordNetLemmatizer # For reducing words to their base/dictionary form
from nltk.stem import PorterStemmer # For reducing words to their root form (more aggressive than lemmatization)
import string # For string operations and punctuation characters

# Importing scikit-learn components for machine learning
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text to TF-IDF features
from sklearn.model_selection import train_test_split # For splitting data into train/test sets
from sklearn.linear_model import LogisticRegression  # For logistic regression classifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score # For model evaluation

# Importing NLTK and downloading necessary datasets/models
import nltk
nltk.download('punkt') # Downloading the Punkt tokenizer models
nltk.download('stopwords') # Downloading common stopwords list
nltk.download('wordnet') #download the WordNet lexical database.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mamat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mamat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mamat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
df = pd.read_csv('C:/Users/mamat/Downloads/Reviews.csv.zip')

In [4]:
from tqdm import tqdm # Import tqdm library for progress bars
tqdm.pandas() # Enable pandas integration with tqdm progress bars

In [None]:
# Define the text preprocessing function (same as before)  
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [word for word in tokens if word.isalnum() and word not in stopwords.words('english')]  # Remove stopwords and non-alphanumeric
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]  # Apply stemming
    return ' '.join(tokens)

# Apply preprocessing to the entire 'Text' column with progress bar
df['cleaned_text'] = df['Text'].progress_apply(preprocess_text)

# Preview result
print(df[['Text', 'cleaned_text']].head())

 15%|█▍        | 85204/568454 [1:23:36<32:15:27,  4.16it/s]

In [None]:
# Define a function to convert numerical scores to sentiment categories
def label_sentiment(score):
    """
    Converts 1-5 star ratings into sentiment categories:
    - 4-5 stars → Positive
    - 3 stars → Neutral 
    - 1-2 stars → Negative
    """
    if score >= 4:
        return 'Positive'  # 4 or 5 star reviews are positive
    elif score == 3:
        return 'Neutral' # 3 star reviews are neutral
    else:
        return 'Negative' # 1 or 2 star reviews are negative

df['Sentiment'] = df['Score'].apply(label_sentiment) # Apply the labeling function to create a new 'Sentiment' column

# Check label distribution
print(df['Sentiment'].value_counts())
# This shows how many reviews fall into each category
# Helps identify class imbalance issues before modeling

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer # Import TF-IDF Vectorizer from scikit-learn

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
# Limit to top 5000 most important words/features
# Other default but important parameters being used:
# - lowercase=True (already handled in our preprocessing)
# - tokenizer=word_tokenize (but we pre-tokenized)
# - stop_words='english' (already removed in preprocessing)
# - ngram_range=(1,1) (only single words by default)

X = vectorizer.fit_transform(df['cleaned_text']) # Transform cleaned text into TF-IDF feature matrix
# What this does:
# 1. fit_transform() learns the vocabulary (all unique words)
# 2. Computes inverse document frequency (IDF) weights
# 3. Transforms text to numerical feature vectors
# Result is a sparse matrix where:
# - Rows = reviews
# - Columns = word features (5000 dimensions)
# - Values = TF-IDF scores (importance of word in document)

# Target labels
y = df['Sentiment'] # Our Positive/Neutral/Negative labels

In [None]:


# Split into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 20% of data for testing (80% for training)
# 42=Seed for reproducible splits

# Resulting variables:
# X_train: Feature matrix for training (4000 samples if n=5000)
# X_test: Feature matrix for testing (1000 samples)
# y_train: Labels for training
# y_test: Labels for testing (ground truth for evaluation)


In [None]:
# Train the model
# Initialize the Logistic Regression model with key parameters:
model = LogisticRegression(max_iter=1000) # Maximum number of iterations for solver to converge
model.fit(X_train, y_train) # Train the model on our training data

# After training, the model can:
# Make predictions on new text data
# Show which words are most important for each sentiment
# Estimate prediction probabilities

In [None]:
# Generate predictions on the test set
y_pred = model.predict(X_test)
# This applies the trained model to the unseen test data
# Returns predicted sentiment labels (Positive/Neutral/Negative)

# Evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred)) #Accuracy Score (Overall correctness)
# What this shows:
# - Percentage of correctly classified reviews
# - Simple overall measure but can be misleading with class imbalance

print("\nClassification Report:\n", classification_report(y_test, y_pred)) #Detailed Classification Report
# Provides key metrics for EACH CLASS:
# - Precision: % of correct predictions for each sentiment 
#   (e.g., when predicting Positive, how often correct)
# - Recall: % of actual cases captured for each sentiment
#   (e.g., what % of true Positive reviews were identified)
# - F1-score: Harmonic mean of precision and recall
# - Support: Number of actual occurrences in test set

In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=['Positive', 'Neutral', 'Negative'])

# Create visualization
plt.figure(figsize=(6,4)) # Set figure size for readability
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Positive', 'Neutral', 'Negative'],  # x-axis labels
            yticklabels=['Positive', 'Neutral', 'Negative']) # y-axis labels

plt.title('Confusion Matrix') # Add titles
plt.xlabel('Predicted Label') # Add labels
plt.ylabel('True Label') # Add labels
plt.show() #to view visuals

In [None]:
# Sentiment Distribution
# Plot a bar chart showing the count of each sentiment category
df['Sentiment'].value_counts().plot(kind='bar', color=['green', 'gray', 'red'])
# Colors are intuitively chosen: green for positive, red for negative, gray for neutral
plt.title('Sentiment Distribution') # Add chart title and axis labels for better readability
plt.xlabel('Sentiment') # Label for x-axis (sentiment categories)
plt.ylabel('Count') # Label for y-axis (number of occurrences)
plt.show() # Display the plot

In [None]:
# Save cleaned data with sentiment labels
df.to_csv('processed_reviews.csv', index=False)

# Save predictions
results = pd.DataFrame({'True Sentiment': y_test, 'Predicted Sentiment': y_pred})
results.to_csv('predictions.csv', index=False)