In [1]:
# Importing necessary libraries for data manipulation and analysis
import pandas as pd # For dataframes and data manipulation
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  # For data visualization
import seaborn as sns # For enhanced data visualization

# Importing Natural Language Toolkit (NLTK) components for text processing
from nltk.corpus import stopwords # For common stopwords
from nltk.tokenize import word_tokenize # For splitting text into words (tokens)
from nltk.stem import WordNetLemmatizer # For reducing words to their base/dictionary form
from nltk.stem import PorterStemmer # For reducing words to their root form (more aggressive than lemmatization)
import string # For string operations and punctuation characters

# Importing scikit-learn components for machine learning
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text to TF-IDF features
from sklearn.model_selection import train_test_split # For splitting data into train/test sets
from sklearn.linear_model import LogisticRegression  # For logistic regression classifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score # For model evaluation

# Importing NLTK and downloading necessary datasets/models
import nltk
# NOTE(review): newer NLTK releases may load tokenizer data from 'punkt_tab' rather than 'punkt' — confirm against the installed version
nltk.download('punkt') # Download the Punkt tokenizer models
nltk.download('stopwords') # Download the common stopwords lists
nltk.download('wordnet') # Download the WordNet lexical database; as the cell's last expression, its return value is what the notebook displays


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mamat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mamat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mamat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the product reviews (a zipped CSV downloaded from Kaggle) into a DataFrame.
# The location is named once here so it is easy to spot and change.
REVIEWS_CSV_PATH = 'C:/Users/mamat/Downloads/Reviews.csv.zip'
df = pd.read_csv(REVIEWS_CSV_PATH)

# Report the (rows, columns) dimensions of what was loaded
print(f"Dataset shape: {df.shape}")

# Preview the first 5 rows
df.head()

Dataset shape: (568454, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
# Summarize the columns: names, non-null counts, dtypes and memory usage.
# DataFrame.info() writes the report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568428 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB
None


In [4]:
# Tally the missing (null) entries in each column
null_counts = df.isna().sum()
print(null_counts)

Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64


In [5]:
# Narrow the data to the two columns this analysis uses:
# - 'Text'  : the review body, used as the feature (X)
# - 'Score' : the numeric rating, used as the target (y)
# Then discard every row with a missing value in either column, since a
# review without text or without a rating cannot be used for training.
df = df.loc[:, ['Text', 'Score']].dropna()

In [6]:
df # Display the reduced DataFrame (Text and Score only); pandas abbreviates the middle rows in the rendered output

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5
...,...,...
568449,Great for sesame chicken..this is a good if no...,5
568450,I'm disappointed with the flavor. The chocolat...,2
568451,"These stars are small, so you can give 10-15 o...",5
568452,These are the BEST treats for training and rew...,5


In [7]:
# A row counts as a duplicate when every column matches an earlier row
n_duplicate_rows = df.duplicated().sum()
print("Total duplicate rows:", n_duplicate_rows)

Total duplicate rows: 174779


In [8]:
# Count rows whose 'Text' repeats an earlier row's text, whatever the other columns hold
n_duplicate_texts = df.duplicated(subset='Text').sum()
print("Duplicate reviews:", n_duplicate_texts)

Duplicate reviews: 174875


In [9]:
# Drop rows that are exact copies of an earlier row (same value in every column).
# keep='first' is the pandas default, spelled out here for clarity: the first
# occurrence of each distinct row is retained and later copies are removed.
df = df.drop_duplicates(keep='first')

In [10]:
# Deduplicate on the review text alone: for each distinct 'Text' only the
# first row is kept, even when later copies carry a different 'Score'.
# Rationale:
# 1. Identical text is treated as a single data point
# 2. Repeated text would otherwise be over-weighted when fitting a model
# 3. It limits bias from the same review being submitted more than once
df = df.drop_duplicates(subset='Text', keep='first')

In [11]:
# Sanity check: recount the repeated review texts after deduplication
remaining_duplicate_texts = df['Text'].duplicated().sum()
print("Duplicate reviews:", remaining_duplicate_texts)

Duplicate reviews: 0


In [12]:
# Cast every entry in the 'Text' column to str so downstream text processing
# (tokenization, vectorization) always receives string input.
# The frame is rebound through assign() instead of writing the column in
# place: df here is the result of row filtering (drop_duplicates), and an
# in-place column assignment on such a frame can raise pandas'
# SettingWithCopyWarning. Rebinding also matches the `df = df....` pattern
# used by the other cleaning cells and gives the same result on re-run.
df = df.assign(Text=df['Text'].astype(str))