In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [None]:
# Load the dataset
df = pd.read_csv("reviews_cleaned.csv")
df = df.sample(n=5000, random_state=42)
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Sentiment
1501,191556,B000WFN0VO,A38VKLWPITZ8XS,"Dallas R. Smith ""Mittens""",3,3,5,1328659200,Kitty Loves This,My 14 year old cat is in chronic renal failure...,positive
2586,334730,B000P5OXYY,A2G3VUSUAOZ5ZB,CCampbell,0,0,5,1341878400,Siracha is amazing!,"If you like spice, you will like Siracha! This...",positive
2653,213834,B001EQ4S9I,A31FPOY737N7A1,Mario A. Magana,3,4,1,1291248000,Half fat = half edible,I really don't know how this can be considered...,negative
1055,180674,B000CQBZQK,A1VBPM0TO2NWCM,May,0,0,1,1348531200,Tastes like medicine!!,Yuck! This has to be one of the worst tasting ...,negative
705,308536,B000LDOUHU,A3U44NNFUUIJ8T,"P. Egeton ""Mom""",1,1,5,1261872000,These are THE originals!,As a first generation American I have visited ...,positive


In [3]:
#Information about the dataset
df.info()
#Summary statistics of the dataset
df.describe()
#returns  number of columns and rows
print(df.shape)
#check for missing values
df.isnull().sum()
#fill the empty rows
df['Summary'] = df['Summary'].fillna("No Summary")


<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 1501 to 860
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Id                      5000 non-null   int64 
 1   ProductId               5000 non-null   object
 2   UserId                  5000 non-null   object
 3   ProfileName             5000 non-null   object
 4   HelpfulnessNumerator    5000 non-null   int64 
 5   HelpfulnessDenominator  5000 non-null   int64 
 6   Score                   5000 non-null   int64 
 7   Time                    5000 non-null   int64 
 8   Summary                 4999 non-null   object
 9   Text                    5000 non-null   object
 10  Sentiment               5000 non-null   object
dtypes: int64(5), object(6)
memory usage: 468.8+ KB
(5000, 11)


In [4]:
# Check for duplicates
print(df.duplicated().sum())
#Removes duplicated rows
data_cleaned = df.drop_duplicates()


0


In [None]:
#Handling the Outliers

#calucte the review length(Number of words in the review)
df['review_length'] = df['Text'].apply(lambda x: len(str(x).split()))

df.head()




Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Sentiment,review_length
1501,191556,B000WFN0VO,A38VKLWPITZ8XS,"Dallas R. Smith ""Mittens""",3,3,5,1328659200,Kitty Loves This,My 14 year old cat is in chronic renal failure...,positive,30
2586,334730,B000P5OXYY,A2G3VUSUAOZ5ZB,CCampbell,0,0,5,1341878400,Siracha is amazing!,"If you like spice, you will like Siracha! This...",positive,23
2653,213834,B001EQ4S9I,A31FPOY737N7A1,Mario A. Magana,3,4,1,1291248000,Half fat = half edible,I really don't know how this can be considered...,negative,92
1055,180674,B000CQBZQK,A1VBPM0TO2NWCM,May,0,0,1,1348531200,Tastes like medicine!!,Yuck! This has to be one of the worst tasting ...,negative,85
705,308536,B000LDOUHU,A3U44NNFUUIJ8T,"P. Egeton ""Mom""",1,1,5,1261872000,These are THE originals!,As a first generation American I have visited ...,positive,139


In [6]:
def score_to_sentiment(score):
    if score <= 2:
        return 'negative'
    elif score == 3:
        return 'neutral'
    else:
        return 'positive'

In [7]:
df['Sentiment'] = df['Score'].apply(score_to_sentiment)

In [8]:
print(df['Sentiment'].value_counts())
print(df.head())

Sentiment
positive    3919
negative     703
neutral      378
Name: count, dtype: int64
          Id   ProductId          UserId                ProfileName  \
1501  191556  B000WFN0VO  A38VKLWPITZ8XS  Dallas R. Smith "Mittens"   
2586  334730  B000P5OXYY  A2G3VUSUAOZ5ZB                  CCampbell   
2653  213834  B001EQ4S9I  A31FPOY737N7A1            Mario A. Magana   
1055  180674  B000CQBZQK  A1VBPM0TO2NWCM                        May   
705   308536  B000LDOUHU  A3U44NNFUUIJ8T            P. Egeton "Mom"   

      HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
1501                     3                       3      5  1328659200   
2586                     0                       0      5  1341878400   
2653                     3                       4      1  1291248000   
1055                     0                       0      1  1348531200   
705                      1                       1      5  1261872000   

                       Summary  \
1501         

In [9]:
# Map sentiment to numbers
sentiment_map = {'negative': -1, 'neutral': 0, 'positive': 1}
df['Sentiment_Label'] = df['Sentiment'].map(sentiment_map)

# Reorder columns to place Sentiment_Label next to Sentiment
sentiment_col = df.pop('Sentiment_Label')
df.insert(df.columns.get_loc('Sentiment') + 1, 'Sentiment_Label', sentiment_col)

# Preview
print(df[['Sentiment', 'Sentiment_Label']].head())


     Sentiment  Sentiment_Label
1501  positive                1
2586  positive                1
2653  negative               -1
1055  negative               -1
705   positive                1


In [10]:
df.to_csv('reviews_cleaned.csv', index=False)

In [11]:

for column in df.columns:
    print(f"Unique values in '{column}':")
    print(df[column].value_counts())
    print("-" * 50)

Unique values in 'Id':
Id
268188    1
191556    1
334730    1
213834    1
180674    1
         ..
64754     1
510584    1
307726    1
530912    1
4830      1
Name: count, Length: 5000, dtype: int64
--------------------------------------------------
Unique values in 'ProductId':
ProductId
B007JFMH8M    13
B000VK8AVK    10
B001RVFEP2    10
B000PDWBKO     9
B000NMJWZO     8
              ..
B000YVGMZW     1
B000GZ3B1O     1
B0038336IE     1
B003G52BN0     1
B0014UAHQM     1
Name: count, Length: 3805, dtype: int64
--------------------------------------------------
Unique values in 'UserId':
UserId
A1YUL9PCJR3JTY    8
AY12DBB0U420B     5
A281NPSIMI1C2R    4
A29JUMRL1US6YP    4
A2YNIKQDLZR2WW    4
                 ..
A35I3AP6DKMSOE    1
AGXGHEGTN9V1C     1
A379PRUM0LCDW8    1
A3B5GS7QHNV2NP    1
A1S1Y44MYI883H    1
Name: count, Length: 4810, dtype: int64
--------------------------------------------------
Unique values in 'ProfileName':
ProfileName
O. Brown "Ms. O. Khannah-Brown"           8


In [15]:
import string
import re  

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from num2words import num2words 



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DikelediMaholo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Initialize stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [None]:
def clean_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'\d+', lambda match: num2words(int(match.group())), text)  # Convert digits to words
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Apply the function to the 'Text' column (adjusting the column name)
df['clean_review'] = df['Text'].apply(clean_text)

# Verify the output
print(df[['Text', 'clean_review']].head())  # Check the first few rows

                                                   Text  \
1501  My 14 year old cat is in chronic renal failure...   
2586  If you like spice, you will like Siracha! This...   
2653  I really don't know how this can be considered...   
1055  Yuck! This has to be one of the worst tasting ...   
705   As a first generation American I have visited ...   

                                           clean_review  
1501  fourteen year old cat chronic renal failure on...  
2586  like spice like siracha bulk set awesome use l...  
2653  really dont know considered food love original...  
1055  yuck one worst tasting teas ever actually real...  
705   first generation american visited family irela...  


In [17]:
df.head()


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Sentiment,Sentiment_Label,review_length,clean_review
1501,191556,B000WFN0VO,A38VKLWPITZ8XS,"Dallas R. Smith ""Mittens""",3,3,5,1328659200,Kitty Loves This,My 14 year old cat is in chronic renal failure...,positive,1,30,fourteen year old cat chronic renal failure on...
2586,334730,B000P5OXYY,A2G3VUSUAOZ5ZB,CCampbell,0,0,5,1341878400,Siracha is amazing!,"If you like spice, you will like Siracha! This...",positive,1,23,like spice like siracha bulk set awesome use l...
2653,213834,B001EQ4S9I,A31FPOY737N7A1,Mario A. Magana,3,4,1,1291248000,Half fat = half edible,I really don't know how this can be considered...,negative,-1,92,really dont know considered food love original...
1055,180674,B000CQBZQK,A1VBPM0TO2NWCM,May,0,0,1,1348531200,Tastes like medicine!!,Yuck! This has to be one of the worst tasting ...,negative,-1,85,yuck one worst tasting teas ever actually real...
705,308536,B000LDOUHU,A3U44NNFUUIJ8T,"P. Egeton ""Mom""",1,1,5,1261872000,These are THE originals!,As a first generation American I have visited ...,positive,1,139,first generation american visited family irela...
