In [None]:
# %pip install nltk

In [1]:
import pandas as pd                 # For uploading data

# from google.colab import files      # Uploading data file in google colab
# uploaded = files.upload()

In [2]:
df = pd.read_csv(r'amazon_alexa.csv')                # Loading dataset
df.head(5)

Unnamed: 0.1,Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,4,5,31-Jul-18,Charcoal Fabric,Music,1


In [3]:
df = df.rename(columns={'Unnamed: 0': 'index'})    # Rename the column
df = df.set_index('index')   # Set the renamed column as index
df.head(5)

Unnamed: 0_level_0,rating,date,variation,verified_reviews,feedback
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [4]:
df.isnull().sum()                   # handling null values

rating              0
date                0
variation           0
verified_reviews    1
feedback            0
dtype: int64

In [5]:
df.dropna(inplace=True)  # Drops rows with any null values
print(df.isnull().sum())

# df.fillna(value, inplace=True)  # Fills null values with the specified value
# df.interpolate(inplace=True)  # Performs linear interpolation to fill null values
# df['column_name'].fillna(df['column_name'].mean(), inplace=True)  # Fills null values with the mean of the column
# df.drop(columns=['column_name'], inplace=True)  # Drops the specified column

rating              0
date                0
variation           0
verified_reviews    0
feedback            0
dtype: int64


In [6]:
print(df.duplicated().sum())

715


In [7]:
df.drop_duplicates(keep='first', inplace=True)  # Keeps the first occurrence of each duplicated row
print(df.duplicated().sum())

# df.drop_duplicates(keep='last', inplace=True)  # Keeps the last occurrence of each duplicated row

0


In [8]:
df.shape                            # no. of rows and columns in dataset

(2434, 5)

In [9]:
df['feedback'].value_counts()      # total no. of positive and negative sentiments

feedback
1    2214
0     220
Name: count, dtype: int64

In [10]:
df.info()                           # info about dataset

<class 'pandas.core.frame.DataFrame'>
Index: 2434 entries, 0 to 2800
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            2434 non-null   int64 
 1   date              2434 non-null   object
 2   variation         2434 non-null   object
 3   verified_reviews  2434 non-null   object
 4   feedback          2434 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 114.1+ KB


In [12]:
# Convert object column to datetime
df['date'] = pd.to_datetime(df['date'])
print(df.dtypes)

rating                       int64
date                datetime64[ns]
variation                   object
verified_reviews            object
feedback                     int64
dtype: object


  df['date'] = pd.to_datetime(df['date'])


In [13]:
df.head(5)

Unnamed: 0_level_0,rating,date,variation,verified_reviews,feedback
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,5,2018-07-31,Charcoal Fabric,Love my Echo!,1
1,5,2018-07-31,Charcoal Fabric,Loved it!,1
2,4,2018-07-31,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,2018-07-31,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,2018-07-31,Charcoal Fabric,Music,1


In [16]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

sw = stopwords.words('english')
lm = WordNetLemmatizer()

def simple_tokenizer(text):
    return re.findall(r'\b\w+\b', text.lower())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saran\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saran\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\saran\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saran\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [26]:
msg = []
for text in df['verified_reviews']:
    if isinstance(text, str):
        t = text.lower()                       # lowercase
        t = re.sub('[^A-Za-z0-9]', ' ', t)     # remove punctuation
        tokens = simple_tokenizer(t)           # tokenize into words
        tokens = [lm.lemmatize(i) for i in tokens if i not in sw]  # remove stopwords + lemmatize
        t = " ".join(tokens)                   # join back to string
        msg.append(t)
    else:
        msg.append("")
print(msg[:5])  # Print first 5 preprocessed reviews

['love echo', 'loved', 'sometimes playing game answer question correctly alexa say got wrong answer like able turn light away home', 'lot fun thing 4 yr old learns dinosaur control light play game like category nice sound playing music well', 'music']


In [36]:
# Transform the words into vectors using Count Vectorizer

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(
    max_features=5000,       # limit vocabulary size
    stop_words='english',    # remove stopwords
    token_pattern=r'\b[a-zA-Z]{2,}\b'  # only words with 2+ letters
)
cv.fit(msg)

X_text = cv.transform(msg).toarray()
print(cv.get_feature_names_out()[:20])  # first 20 features

['abay' 'abc' 'abd' 'ability' 'able' 'abode' 'absolutely' 'absolutly' 'ac'
 'accent' 'acceptable' 'accepting' 'access' 'accessable' 'accessible'
 'accessing' 'accessory' 'accident' 'accidentally' 'accompanying']


In [37]:
print(X_text.shape)   # (num_reviews, vocab_size)
print(X_text[0])      # vector for first review

(2434, 3319)
[0 0 0 ... 0 0 0]


In [34]:
import pandas as pd

# --- Text-based features ---
# Length of each review (number of characters)
df['review_char_len'] = df['verified_reviews'].apply(lambda x: len(str(x)))

# Number of words in each review
df['review_word_len'] = df['verified_reviews'].apply(lambda x: len(str(x).split()))

# --- Date-based features ---
# Extract year, month, day from the datetime column
df['review_year'] = df['date'].dt.year
df['review_month'] = df['date'].dt.month
df['review_day'] = df['date'].dt.day

# --- Variation (categorical feature) ---
# Encode product variation (e.g., "Charcoal Fabric") using one-hot encoding
variation_encoded = pd.get_dummies(df['variation'], prefix='variation')

# --- Combine engineered features ---
numeric_features = df[['rating', 'review_char_len', 'review_word_len',
                       'review_year', 'review_month', 'review_day',]]

# Concatenate numeric + categorical
engineered_features = pd.concat([numeric_features, variation_encoded], axis=1)

In [48]:
# Convert engineered features to numpy
# X_numeric = engineered_features.values

# from scipy.sparse import hstack, csr_matrix

# # Convert numeric features to sparse
# X_numeric_sparse = csr_matrix(X_numeric)

# # Concatenate text + numeric features
# X = hstack([X_text, X_numeric_sparse])

y = df['feedback']


In [41]:
# Splitting data into training and test data

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X_text,y,test_size = 0.25,random_state = 0)

In [42]:
#Evaluating classification_report and model score

from sklearn.metrics import confusion_matrix, classification_report

def eval_model(ytest,ypred):                          # evaluate confusion matrix and classification_report
  cm = confusion_matrix(ytest,ypred)
  print('\nconfusion matrix:\n', cm)
  print(classification_report(ytest,ypred))
  #print('Accuracy Score:', accuracy_score(ytest,ypred)) # display accuracy score of the model

def mscore(model):                                    # training and testing score to know how good the model works
  print('Train Score',model.score(x_train,y_train))
  print('Test Score',model.score(x_test,y_test))

In [43]:
# Applying Multinomial Naïve Bayes Classification

from sklearn.naive_bayes import MultinomialNB

mnb1 = MultinomialNB()
mnb1.fit(x_train,y_train)
mscore(mnb1)

ypred_mnb = mnb1.predict(x_test)
print('\npredicted Feedback for test data:\n', ypred_mnb)

eval_model(y_test,ypred_mnb)


Train Score 0.9528767123287671
Test Score 0.9211822660098522

predicted Feedback for test data:
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 

In [44]:
# Applying Logistic Regression

from sklearn.linear_model import LogisticRegression

m1 = LogisticRegression()         
m1.fit(x_train,y_train)           
mscore(m1)                        

ypred_m1 = m1.predict(x_test)     
print('\npredicted feedback for test data:\n',ypred_m1)

eval_model(y_test,ypred_m1)

Train Score 0.9698630136986301
Test Score 0.9146141215106732

predicted feedback for test data:
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1
 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 

In [45]:
# Applying KNN Classification

from sklearn.neighbors import KNeighborsClassifier

m2 = KNeighborsClassifier(n_neighbors = 5)
m2.fit(x_train,y_train)
mscore(m2)

ypred_m2 = m2.predict(x_test)
print('\npredicted feedback for test data:\n',ypred_m2)

eval_model(y_test,ypred_m2)

Train Score 0.9205479452054794
Test Score 0.9080459770114943

predicted feedback for test data:
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 

In [46]:
# Applying Multinomial Naïve Bayes Classification

from sklearn.tree import DecisionTreeClassifier

mnb2 = DecisionTreeClassifier()
mnb2.fit(x_train,y_train)
mscore(mnb2)

ypred_mnb = mnb2.predict(x_test)
print('\npredicted Feedback for test data:\n', ypred_mnb)

eval_model(y_test,ypred_mnb)

Train Score 0.9906849315068493
Test Score 0.8998357963875205

predicted Feedback for test data:
 [1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0
 1 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 0 0 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 