### "Amazon-Alexa" text classification using "Bag of Words and "TF-IDF"

#### https://www.kaggle.com/datasets/sid321axn/amazon-alexa-reviews

### import the required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
#from sklearn.base import TransformerMixin
#from sklearn.pipeline import Pipeline

### Load the input data ( "amazon alexa reviews data")

In [2]:
# Loading TSV file
df_amazon = pd.read_csv ("amazon_alexa.tsv", sep="\t")

In [3]:
# Top 5 records
df_amazon.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


### Vectorization using sample Dataset

In [4]:
df_amazon_sample =  df_amazon.head()
df_amazon_sample.shape

(5, 5)

In [5]:
data_sample = df_amazon_sample.verified_reviews.tolist()
data_sample

['Love my Echo!',
 'Loved it!',
 'Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you.  I like being able to turn lights on and off while away from home.',
 'I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.',
 'Music']

### Steps to follow :
#### 1. Instantiate the vectorizer object
#### 2. convert the documents into a matrix 
#### 3. get the features / corpus

In [6]:
# instantiate the vectorizer object
countvectorizer = CountVectorizer(analyzer= 'word', stop_words='english')
tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')

In [7]:
# convert th documents into a matrix
count_wm = countvectorizer.fit_transform(data_sample)
tfidf_wm = tfidfvectorizer.fit_transform(data_sample)

In [8]:
count_wm.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1,
        1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [26]:
tfidf_wm.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.70710678,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.70710678,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.25860653, 0.25860653, 0.25860653, 0.25860653, 0.25860653,
        0.        , 0.    

In [None]:
#retrieve the terms found in the corpora
# if we take same parameters on both Classes(CountVectorizer and TfidfVectorizer) , it will give same output of get_feature_names() methods)
#count_tokens = tfidfvectorizer.get_feature_names() # no difference
count_tokens = countvectorizer.get_feature_names()
tfidf_tokens = tfidfvectorizer.get_feature_names()

In [None]:
count_tokens

In [None]:
bow_array = count_wm.toarray()
tfidf_array = tfidf_wm.toarray()
df = pd.DataFrame(bow_array, columns = count_tokens)
df

In [None]:
df2 = pd.DataFrame(tfidf_array, columns =  tfidf_tokens)
df2

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

countvectorizer = CountVectorizer()
count_matrix = countvectorizer.fit_transform(corpus)
count_tokens = countvectorizer.get_feature_names()
count_array = count_matrix.toarray()

tfidfvectorizer = TfidfVectorizer()
tfidf_matrix = tfidfvectorizer.fit_transform(corpus)
tfidf_tokens = tfidfvectorizer.get_feature_names()
tfidf_array = tfidf_matrix.toarray()

df_count = pd.DataFrame(count_array, columns=count_tokens)
df_tfidf = pd.DataFrame(tfidf_array, columns=tfidf_tokens)

print(df_count)
print(df_tfidf)


In [42]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

countvectorizer = CountVectorizer()
count_matrix = countvectorizer.fit_transform(corpus)
count_tokens = countvectorizer.get_feature_names()
count_array = count_matrix.toarray()

tfidfvectorizer = TfidfVectorizer()
tfidf_matrix = tfidfvectorizer.fit_transform(corpus)
tfidf_tokens = tfidfvectorizer.get_feature_names()
tfidf_array = tfidf_matrix.toarray()

df_count = pd.DataFrame(count_array, columns=count_tokens)
df_tfidf = pd.DataFrame(tfidf_array, columns=tfidf_tokens)

print(df_count)
print(df_tfidf)


AttributeError: ignored

### Data processing and modeling / classification on 'Amazon-Alexa' full dataset

In [38]:
# convert the documents into a matrix
count_wm = countvectorizer.fit_transform(df_amazon.verified_reviews)
#tfidf_wm = tfidfvectorizer.fit_transform(df_amazon.verified_reviews)

In [37]:
X = count_wm.toarray()
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1,
        1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [35]:
y = np.array(df_amazon.feedback)
y

array([1, 1, 1, ..., 1, 1, 1])

### Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print(f'X_train dimension: {X_train.shape}')
print(f'y_train dimension: {y_train.shape}')
print(f'X_test dimension: {X_test.shape}')
print(f'y_train dimension: {y_test.shape}')

### Modeling - Classification

In [None]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

classifier.fit(X_train,y_train)
predicted = classifier.predict(X_test)

In [None]:
from sklearn import metrics
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))