In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import math
from sklearn.model_selection import train_test_split


In [3]:
df = pd.read_csv("../datasets/starbacks_review.csv")

In [4]:
df=pd.DataFrame(df)
display(df.head(5))

Unnamed: 0,name,location,Date,Rating,Review,Image_Links
0,Helen,"Wichita Falls, TX","Reviewed Sept. 13, 2023",5.0,Amber and LaDonna at the Starbucks on Southwes...,['No Images']
1,Courtney,"Apopka, FL","Reviewed July 16, 2023",5.0,** at the Starbucks by the fire station on 436...,['No Images']
2,Daynelle,"Cranberry Twp, PA","Reviewed July 5, 2023",5.0,I just wanted to go out of my way to recognize...,['https://media.consumeraffairs.com/files/cach...
3,Taylor,"Seattle, WA","Reviewed May 26, 2023",5.0,Me and my friend were at Starbucks and my card...,['No Images']
4,Tenessa,"Gresham, OR","Reviewed Jan. 22, 2023",5.0,I’m on this kick of drinking 5 cups of warm wa...,['https://media.consumeraffairs.com/files/cach...


In [5]:
print(df.describe())
print(df.shape)

           Rating
count  705.000000
mean     1.870922
std      1.397672
min      1.000000
25%      1.000000
50%      1.000000
75%      2.000000
max      5.000000
(850, 6)


In [6]:
duplicate_rows = df.duplicated()
print(duplicate_rows.unique())

[False  True]


In [7]:
df=df.drop_duplicates(keep='first')

In [8]:
print(df.isna().sum())

name             0
location         0
Date             0
Rating         144
Review           0
Image_Links      0
dtype: int64


In [9]:
df = df[(df['Review'] != 'No Review Text')]
print(df.shape)
print(df['Rating'].value_counts())
print(df.isna().sum())

(813, 6)
Rating
1.0    450
2.0     98
5.0     83
4.0     39
3.0     33
Name: count, dtype: int64
name             0
location         0
Date             0
Rating         110
Review           0
Image_Links      0
dtype: int64


In [10]:
# Pre-processing the text.
# Text processing

from nltk.tokenize import word_tokenize
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# lowercase the letters on each review row
df['clear_text']=df['Review'].str.lower()

# remove all characters that are not word characters (letters, digits, and underscores) or whitespace characters.
df['clear_text']=df['clear_text'].apply(lambda x : re.sub(r'[^\w\s]', '', x))

# tokenizes the text in the 'Review' column using the word_tokenize function.
df['clear_text']=df['Review'].apply(lambda x : word_tokenize(x))

# define a set of stop words specific to the English language.
# further clean the text data by removing common, less informative words.
stop_words = set(stopwords.words('english'))
df['clear_text'] = df['clear_text'].apply(lambda tokens: [word for word in tokens if word.lower() not in stop_words])

# reducing words to their base or root form.Normalize words and reduce them to a common form to improve text analysis.
stemmer = PorterStemmer()
df['clear_text'] = df['clear_text'].apply(lambda tokens: [stemmer.stem(word) for word in tokens])

display(df.head(5))

Unnamed: 0,name,location,Date,Rating,Review,Image_Links,clear_text
0,Helen,"Wichita Falls, TX","Reviewed Sept. 13, 2023",5.0,Amber and LaDonna at the Starbucks on Southwes...,['No Images'],"[amber, ladonna, starbuck, southwest, parkway,..."
1,Courtney,"Apopka, FL","Reviewed July 16, 2023",5.0,** at the Starbucks by the fire station on 436...,['No Images'],"[*, *, starbuck, fire, station, 436, altamont,..."
2,Daynelle,"Cranberry Twp, PA","Reviewed July 5, 2023",5.0,I just wanted to go out of my way to recognize...,['https://media.consumeraffairs.com/files/cach...,"[want, go, way, recogn, starbuck, employe, bil..."
3,Taylor,"Seattle, WA","Reviewed May 26, 2023",5.0,Me and my friend were at Starbucks and my card...,['No Images'],"[friend, starbuck, card, ’, work, ., thank, wo..."
4,Tenessa,"Gresham, OR","Reviewed Jan. 22, 2023",5.0,I’m on this kick of drinking 5 cups of warm wa...,['https://media.consumeraffairs.com/files/cach...,"[’, kick, drink, 5, cup, warm, water, ., work,..."


In [11]:
filtered=df[~(df['Rating'].isna())]
rating_1=filtered[filtered['Rating']==1.0]
from sklearn.utils import resample
resampled_datasets = []
unique_ratings = [1.0, 2.0, 3.0, 4.0, 5.0]
for rating in unique_ratings:
    rating_data = filtered[filtered["Rating"] == rating]
    resampled_data = resample(rating_data, replace=True, n_samples=len(rating_1), random_state=0)
    resampled_datasets.append(resampled_data)
combined_df = pd.concat(resampled_datasets, ignore_index=True)
print(combined_df.shape)
print(combined_df['Rating'].value_counts())


(2250, 7)
Rating
1.0    450
2.0    450
3.0    450
4.0    450
5.0    450
Name: count, dtype: int64


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X =combined_df['clear_text'].apply(lambda tokens: ' '.join(tokens))
y = combined_df['Rating']

In [13]:
from sklearn.model_selection import train_test_split

# Split the data for train and test
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1687,)
(1687,)
(563,)
(563,)


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Create a Decision Tree model
tree_model = DecisionTreeClassifier()

# Train the Decision Tree model
tree_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = tree_model.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.9467140319715808
