In [88]:
import pandas as pd

# Load dataset
df = pd.read_csv("Reviews.csv")

#  Required columns
required_columns = ["Summary", "Text", "Sentiment", "ProductId", "UserId", "Time"]
missing_columns = [col for col in required_columns if col not in df.columns]

# Check for missing columns
if missing_columns:
    print(f"Warning: Missing columns detected: {missing_columns}")
    for col in missing_columns:
        if col == "ProductId":
            df["ProductId"] = "Unknown"
        elif col == "UserId":
            df["UserId"] = "Anonymous"
        elif col == "Time":
            df["Time"] = pd.Timestamp.now()  # Default to current timestamp
else:
    print(" All required columns are present.")

# Display updated DataFrame
df.head()



Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [84]:
import hashlib

#  Check if "UserId" exists and anonymize
if "UserId" in df.columns:
    try:
        df["User_Anon"] = df["UserId"].apply(lambda x: hashlib.sha256(str(x).encode()).hexdigest())
        df.drop(columns=["UserId"], inplace=True)  # Remove original user data
        print("User data anonymized successfully.")
    except Exception as e:
        print(f"Error during user anonymization: {e}")
else:
    print(" No 'UserId' column found. Skipping anonymization.")

# Display updated DataFrame
df.head()

 No 'UserId' column found. Skipping anonymization.


Unnamed: 0,Id,ProductId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,User_Anon
0,1,B001E4KFG0,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,c41af2a35a6fecc95a3f04e642d2cb695a7f2e74b34961...
1,2,B00813GRG4,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,08c528d7c4df2cf389546a0064b6958b87bdc0d9dd7982...
2,3,B000LQOCH0,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,eb57c172bde49a93ab76e46ff06486fc1247dedf280f03...
3,4,B000UA0QIQ,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,12cce69c69a5c69c25399b3bb1640f341d09392e1d842b...
4,5,B006K2ZZ7K,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,332bafccefbfdf3f258f73b172a6d169473cbaf593b772...


In [85]:
import numpy as np

# Select only 750 reviews randomly
df_sample = df.sample(n=750, random_state=42).reset_index(drop=True)

# Display sample dataset
df_sample.head()

Unnamed: 0,Id,ProductId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,User_Anon
0,165257,B000EVG8J2,"B. Miller ""pet person""",0,0,5,1268179200,Crunchy & Good Gluten-Free Sandwich Cookies!,Having tried a couple of other brands of glute...,39b3cc85c365c0f25c5c2ea896e32f722bcf1f061bb827...
1,231466,B0000BXJIS,Marty,0,0,5,1298937600,great kitty treats,My cat loves these treats. If ever I can't fin...,be145d5cd9482a5b9a1ed8210085a3b121035609d2e201...
2,427828,B008FHUFAU,Kenneth Shevlin,0,2,3,1224028800,COFFEE TASTE,A little less than I expected. It tends to ha...,8af2d023446033002adf5e957f2c107b3caed45e050994...
3,433955,B006BXV14E,rareoopdvds,0,1,2,1335312000,So the Mini-Wheats were too big?,"First there was Frosted Mini-Wheats, in origin...",619eeee1c8befcd761df16908ae7af038ac34a667c5292...
4,70261,B007I7Z3Z0,Og8ys1,0,2,5,1334707200,Great Taste . . .,and I want to congratulate the graphic artist ...,654597f911702ccfe2cf61941d996e48f6fccf32dc0a06...


In [86]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

#  Download NLTK Stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

#  Function to Clean Text
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = " ".join([stemmer.stem(word) for word in text.split() if word not in stop_words])
    return text

#  Apply Text Cleaning
df_sample["Cleaned_Text"] = df_sample["Text"].apply(clean_text)

#  Display Sample Data
df_sample[["Text", "Cleaned_Text"]].head()

[nltk_data] Downloading package stopwords to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Text,Cleaned_Text
0,Having tried a couple of other brands of glute...,tri coupl brand glutenfre sandwich cooki best ...
1,My cat loves these treats. If ever I can't fin...,cat love treat ever cant find hous pop top bol...
2,A little less than I expected. It tends to ha...,littl less expect tend muddi tast expect sinc ...
3,"First there was Frosted Mini-Wheats, in origin...",first frost miniwheat origin size frost miniwh...
4,and I want to congratulate the graphic artist ...,want congratul graphic artist put entir produc...


In [87]:
import boto3

#  Connect to AWS Comprehend (Replace 'eu-west-2' with your AWS region)
comprehend = boto3.client(service_name='comprehend', region_name='eu-west-2')

print(" AWS Comprehend Connected Successfully!")

 AWS Comprehend Connected Successfully!


In [89]:
def analyze_sentiment(text):
    if text:
        try:
            response = comprehend.detect_sentiment(Text=text, LanguageCode='en')
            return response['Sentiment'], response['SentimentScore']
        except Exception as e:
            print(f" Error during Comprehend analysis: {e}")
            return "ERROR", {"Positive": 0, "Negative": 0, "Neutral": 0, "Mixed": 0}
    return "NO_TEXT", {"Positive": 0, "Negative": 0, "Neutral": 0, "Mixed": 0}

#  Apply Sentiment Analysis to Cleaned Text
df_sample[['Comprehend_Sentiment', 'Comprehend_Score']] = df_sample['Cleaned_Text'].apply(analyze_sentiment).tolist()

# Display Sentiment Results
df_sample[['Cleaned_Text', 'Comprehend_Sentiment', 'Comprehend_Score']].head()

Unnamed: 0,Cleaned_Text,Comprehend_Sentiment,Comprehend_Score
0,tri coupl brand glutenfre sandwich cooki best ...,POSITIVE,"{'Positive': 0.47346004843711853, 'Negative': ..."
1,cat love treat ever cant find hous pop top bol...,POSITIVE,"{'Positive': 0.7705198526382446, 'Negative': 0..."
2,littl less expect tend muddi tast expect sinc ...,NEUTRAL,"{'Positive': 0.04822135344147682, 'Negative': ..."
3,first frost miniwheat origin size frost miniwh...,NEUTRAL,"{'Positive': 0.23086786270141602, 'Negative': ..."
4,want congratul graphic artist put entir produc...,MIXED,"{'Positive': 0.230007603764534, 'Negative': 0...."


In [90]:
def map_sentiment_score(sentiment, scores):
    if sentiment == "POSITIVE":
        return int(5 * scores["Positive"])  # Scale Positive Score to 5
    elif sentiment == "NEGATIVE":
        return int(5 * scores["Negative"])  # Scale Negative Score to 1
    elif sentiment == "NEUTRAL":
        return 3  # Keep neutral as 3
    elif sentiment == "MIXED":
        return 3  # Mixed sentiment is ambiguous
    elif sentiment == "NO_TEXT":
        return 0  # No text means no rating
    elif sentiment == "ERROR":
        return -1  # Error handling case
    else:
        return 0  # Default case

# Convert AWS Sentiment Scores
df_sample['Sentiment_1_to_5'] = df_sample.apply(lambda row: map_sentiment_score(row['Comprehend_Sentiment'], row['Comprehend_Score']), axis=1)

#  Display Results
df_sample[['Cleaned_Text', 'Comprehend_Sentiment', 'Sentiment_1_to_5']].head(20)

Unnamed: 0,Cleaned_Text,Comprehend_Sentiment,Sentiment_1_to_5
0,tri coupl brand glutenfre sandwich cooki best ...,POSITIVE,2
1,cat love treat ever cant find hous pop top bol...,POSITIVE,3
2,littl less expect tend muddi tast expect sinc ...,NEUTRAL,3
3,first frost miniwheat origin size frost miniwh...,NEUTRAL,3
4,want congratul graphic artist put entir produc...,MIXED,3
5,pleas add pineappl flavor packag lifesav fact ...,NEUTRAL,3
6,absolut love yorkshir tea glad avail amazon cu...,POSITIVE,4
7,hard time find loos tea local abl order favori...,MIXED,3
8,previous ive attempt recip white rice noodl ov...,MIXED,3
9,make pancak waffl everi saturday morn kid simp...,POSITIVE,2


In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer

#  Initialize TF-IDF Vectorizer (limit to 5000 most important words)
vectorizer = TfidfVectorizer(max_features=5000)

#  Transform text into TF-IDF vectors
X_tfidf = vectorizer.fit_transform(df_sample["Cleaned_Text"])

#  Display shape of the TF-IDF feature matrix
print(f"TF-IDF Feature Matrix Shape: {X_tfidf.shape}")

TF-IDF Feature Matrix Shape: (750, 4730)


In [92]:
import pickle

# Store ML Inputs (TF-IDF Features) and Labels (Sentiment Scores)
X = X_tfidf  # ML Model Input (TF-IDF)
y = df_sample["Sentiment_1_to_5"]  # Target Labels (1-5 ratings)

# Save Data for Training
pickle.dump((X, y), open("ml_dataset.pkl", "wb"))
print("ML Dataset Saved Successfully!")

ML Dataset Saved Successfully!


In [93]:
from sklearn.model_selection import train_test_split

#  Split the data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)  # 80% train, 20% temp
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 10% val, 10% test

#  Print Dataset Sizes using .shape[0]
print(f"Training Set: {X_train.shape[0]} reviews")
print(f"Validation Set: {X_val.shape[0]} reviews")
print(f"Test Set: {X_test.shape[0]} reviews")

Training Set: 600 reviews
Validation Set: 75 reviews
Test Set: 75 reviews


In [94]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Train Naïve Bayes Model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

# Train Support Vector Machine (SVM) Model
svm_model = SVC(kernel="linear")
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

#  Evaluate Models
nb_accuracy = accuracy_score(y_test, y_pred_nb)
svm_accuracy = accuracy_score(y_test, y_pred_svm)

print(f" Naïve Bayes Accuracy: {nb_accuracy:.4f}")
print(f" SVM Accuracy: {svm_accuracy:.4f}")

 Naïve Bayes Accuracy: 0.6400
 SVM Accuracy: 0.6667


In [95]:
# Choose Best Model
if nb_accuracy > svm_accuracy:
    best_model = nb_model
    model_name = "Naïve Bayes"
else:
    best_model = svm_model
    model_name = "SVM"

print(f" Best Model Selected: {model_name}")

 Best Model Selected: SVM


In [96]:
import pickle

#  Save the best model
with open("best_model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

print(f" Best Model ({model_name}) Saved Successfully!")

 Best Model (SVM) Saved Successfully!


In [97]:
# Load the saved model
with open("best_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

#  Test the loaded model on validation data
sample_prediction = loaded_model.predict(X_val[:5])
print(f"Sample Predictions: {sample_prediction}")

Sample Predictions: [3 3 3 3 3]


In [99]:
#  Check if the model exists
response = sagemaker.list_models()
models = [model["ModelName"] for model in response["Models"]]

if model_name in models:
    print(f" Model '{model_name}' is registered successfully!")
else:
    print(f" Model '{model_name}' is NOT found. Something went wrong.")

 Model 'sentiment-analysis-model-v5' is registered successfully!
