# Import Libraries and Data

In [34]:
# Import Libraries
import pandas as pd
import nltk
import re
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define Stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [35]:
# Load Dataset
df = pd.read_csv('Reviews.csv')
df.shape

(568454, 10)

In [36]:
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


# Data Preprocessing

In [37]:
# Check for null value
df.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [38]:
# Checking Duplicate values
df.duplicated().sum()

0

In [39]:
# Define function to preprocess text
def preprocess_text(text):
    
    # Convert text to lower case
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Initialize stemmer and lemmatizer
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    # Perform stemming and then lemmatization
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
    
    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)
    
    return processed_text

In [40]:
# Apply function to preprocessed text
df['Text'] = df['Text'].apply(preprocess_text)
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,bought sever vital can dog food product found ...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,product arriv label jumbo salt peanutsth peanu...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",confect around centuri light pillowi citru gel...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,look secret ingredi robitussin believ found go...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,great taffi great price wide assort yummi taff...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,great sesam chickenthi good better restur eate...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,im disappoint flavor chocol note especi weak m...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,star small give one train session tri train do...
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,best treat train reward dog good groom lower c...


In [41]:
# Feature extraction using TF-IDF
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(df['Text'])
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,bought sever vital can dog food product found ...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,product arriv label jumbo salt peanutsth peanu...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",confect around centuri light pillowi citru gel...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,look secret ingredi robitussin believ found go...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,great taffi great price wide assort yummi taff...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,great sesam chickenthi good better restur eate...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,im disappoint flavor chocol note especi weak m...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,star small give one train session tri train do...
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,best treat train reward dog good groom lower c...


# Sentiment Classification using Lexicon-based model

In [42]:
# Function to analyze sentiment using TextBlob
def analyze_sentiment_textblob(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

# Function to analyze sentiment using VADER
analyzer = SentimentIntensityAnalyzer()
def analyze_sentiment_vader(text):
    score = analyzer.polarity_scores(text)
    compound = score['compound']
    if compound >= 0.05:
        return 'positive'
    elif compound > -0.05 and compound < 0.05:
        return 'neutral'
    else:
        return 'negative'

In [43]:
# Apply the sentiment analysis functions
df['sentiment_textblob'] = df['Text'].apply(analyze_sentiment_textblob)
df['sentiment_vader'] = df['Text'].apply(analyze_sentiment_vader)
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,sentiment_textblob,sentiment_vader
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,bought sever vital can dog food product found ...,positive,positive
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,product arriv label jumbo salt peanutsth peanu...,positive,negative
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",confect around centuri light pillowi citru gel...,positive,positive
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,look secret ingredi robitussin believ found go...,positive,positive
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,great taffi great price wide assort yummi taff...,positive,positive
...,...,...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,great sesam chickenthi good better restur eate...,positive,positive
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,im disappoint flavor chocol note especi weak m...,negative,negative
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,star small give one train session tri train do...,positive,positive
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,best treat train reward dog good groom lower c...,positive,positive


# Model Evaluation

In [44]:
# Convert scores to sentiment categories
def score_to_sentiment(score):
    if score >= 4:
        return 'positive'
    elif score == 3:
        return 'neutral'
    else:
        return 'negative'

df['ActualSentiment'] = df['Score'].apply(score_to_sentiment)

In [45]:
# Evaluate the models
print("Evaluation for TextBlob:")
print(classification_report(df['ActualSentiment'], df['sentiment_textblob']))
print("Accuracy for TextBlob:", accuracy_score(df['ActualSentiment'], df['sentiment_textblob']))

print("Evaluation for VADER:")
print(classification_report(df['ActualSentiment'], df['sentiment_vader']))
print("Accuracy for VADER:", accuracy_score(df['ActualSentiment'], df['sentiment_vader']))

Evaluation for TextBlob:
              precision    recall  f1-score   support

    negative       0.38      0.26      0.31     82037
     neutral       0.06      0.05      0.05     42640
    positive       0.82      0.89      0.85    443777

    accuracy                           0.74    568454
   macro avg       0.42      0.40      0.41    568454
weighted avg       0.70      0.74      0.71    568454

Accuracy for TextBlob: 0.7350462834283865
Evaluation for VADER:
              precision    recall  f1-score   support

    negative       0.46      0.26      0.33     82037
     neutral       0.09      0.05      0.06     42640
    positive       0.82      0.92      0.87    443777

    accuracy                           0.76    568454
   macro avg       0.46      0.41      0.42    568454
weighted avg       0.71      0.76      0.73    568454

Accuracy for VADER: 0.7624504357432615


# Extract Data into CSV

In [46]:
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,sentiment_textblob,sentiment_vader,ActualSentiment
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,bought sever vital can dog food product found ...,positive,positive,positive
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,product arriv label jumbo salt peanutsth peanu...,positive,negative,negative
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",confect around centuri light pillowi citru gel...,positive,positive,positive
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,look secret ingredi robitussin believ found go...,positive,positive,negative
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,great taffi great price wide assort yummi taff...,positive,positive,positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,great sesam chickenthi good better restur eate...,positive,positive,positive
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,im disappoint flavor chocol note especi weak m...,negative,negative,negative
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,star small give one train session tri train do...,positive,positive,positive
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,best treat train reward dog good groom lower c...,positive,positive,positive


In [47]:
# Drop column that does not in use
df = df.drop(['Id', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary'], axis=1)
df

Unnamed: 0,ProductId,Score,Text,sentiment_textblob,sentiment_vader,ActualSentiment
0,B001E4KFG0,5,bought sever vital can dog food product found ...,positive,positive,positive
1,B00813GRG4,1,product arriv label jumbo salt peanutsth peanu...,positive,negative,negative
2,B000LQOCH0,4,confect around centuri light pillowi citru gel...,positive,positive,positive
3,B000UA0QIQ,2,look secret ingredi robitussin believ found go...,positive,positive,negative
4,B006K2ZZ7K,5,great taffi great price wide assort yummi taff...,positive,positive,positive
...,...,...,...,...,...,...
568449,B001EO7N10,5,great sesam chickenthi good better restur eate...,positive,positive,positive
568450,B003S1WTCU,2,im disappoint flavor chocol note especi weak m...,negative,negative,negative
568451,B004I613EE,5,star small give one train session tri train do...,positive,positive,positive
568452,B004I613EE,5,best treat train reward dog good groom lower c...,positive,positive,positive


In [48]:
# Reorder columns
df = df[['ProductId', 'Text', 'Score', 'ActualSentiment', 'sentiment_textblob', 'sentiment_vader']]
df

Unnamed: 0,ProductId,Text,Score,ActualSentiment,sentiment_textblob,sentiment_vader
0,B001E4KFG0,bought sever vital can dog food product found ...,5,positive,positive,positive
1,B00813GRG4,product arriv label jumbo salt peanutsth peanu...,1,negative,positive,negative
2,B000LQOCH0,confect around centuri light pillowi citru gel...,4,positive,positive,positive
3,B000UA0QIQ,look secret ingredi robitussin believ found go...,2,negative,positive,positive
4,B006K2ZZ7K,great taffi great price wide assort yummi taff...,5,positive,positive,positive
...,...,...,...,...,...,...
568449,B001EO7N10,great sesam chickenthi good better restur eate...,5,positive,positive,positive
568450,B003S1WTCU,im disappoint flavor chocol note especi weak m...,2,negative,negative,negative
568451,B004I613EE,star small give one train session tri train do...,5,positive,positive,positive
568452,B004I613EE,best treat train reward dog good groom lower c...,5,positive,positive,positive


In [49]:
# Export the dataset with new columns
df.to_csv('Processed_Reviews.csv', index=False)

# Strength and Weakness of Lexicon Based Models for Sentiment Classification 

## Strength : 

### - Lexicons-based methods are easy to implement and understand
### - Lexicons-based methods use dictionary so that it can be finely tuned to the nuances of a language or domain's sentiment expressions.
### - Lexicons-based methods are fast and efficient compared to some machine learning approaches

## Weakness : 

### - As lexicons-based methods use dictionary to predict the sentiment, it cannot predict word that are not in the dictionary
### - Lexicons-based methods struggle with texts containing sarcasm, irony, or subtle jokes
### - Lexicon-based models often ignore the context and syntax in which words are used.