# Headphone dataset review analysis - Classification #

Analyze how customers expressed their feelings about headphones bought through Amazon.

Several text representations are built here and used as input features for the classification models.

Data from: https://www.kaggle.com/datasets/mdwaquarazam/headphone-dataset-review-analysis

---

### ***Imports*** ###

In [64]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns
import importlib

import nlp_data_transformations as ndt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from itertools import product
import copy

In [2]:
# Load the raw review data; the path is relative to the notebook's working directory.
df = pd.read_csv('inputs/headphone_datn.csv')

### ***Preprocessing*** ###

In [3]:
# Wrap the review text column in the project's Preprocessing helper.
# NOTE(review): the cleaning calls that follow appear to mutate `data` in place
# (each displayed output reflects the earlier steps) — confirm in nlp_data_transformations.
data = ndt.Preprocessing(df['COMMENTS'])

In [4]:
# Lower-case every review; the resulting Series is shown as the cell output.
data.lower()

0       okay.. i was skeptical at first to buy this as...
1       the earphone is worth what you pay for. the de...
2       particularly for people with sensitive ears, w...
3       built quality 6.5 10the lower wire is a durabl...
4       don't go with the over all 4 start rating. thi...
                              ...                        
1599    quite good sound qualityand had impressively g...
1600                                                  osm
1601    earphones fits well onto the ears, doesn't blo...
1602    sound quality very bad... over all very bad pr...
1603    this is only for calls...mic is good...but com...
Name: COMMENTS, Length: 1604, dtype: object

In [5]:
# Drop stop words from the lower-cased reviews.
# NOTE(review): this runs before digits and punctuation are stripped, so stop words
# glued to them survive (e.g. "for." and "10the" still leave "for" / "the" in the
# later outputs). Consider running this step after the punctuation clean-up.
data.remove_stop_words()

0       okay.. skeptical buy tight budget , honest lit...
1       earphone worth pay for. design earbud cause pa...
2       particularly people sensitive ears, gets ear p...
3       built quality 6.5 10the lower wire durable nyl...
4       don't 4 start rating. boat brand endorse lot m...
                              ...                        
1599         good sound qualityand impressively good bass
1600                                                  osm
1601    earphones fits ears, doesn't block ear outside...
1602        sound quality bad... bad product... happy buy
1603    calls...mic good...but comfort bad music worst...
Name: COMMENTS, Length: 1604, dtype: object

In [6]:
# Strip digits from the reviews.
# NOTE(review): `remove_digists` is the method name as exposed by nlp_data_transformations
# (likely a typo for "digits"); it has to be renamed in the module before it can change here.
# The displayed output shows leftover double spaces and stray "." where numbers used to be.
data.remove_digists()

0       okay.. skeptical buy tight budget , honest lit...
1       earphone worth pay for. design earbud cause pa...
2       particularly people sensitive ears, gets ear p...
3       built quality . the lower wire durable nylon k...
4       don't  start rating. boat brand endorse lot mo...
                              ...                        
1599         good sound qualityand impressively good bass
1600                                                  osm
1601    earphones fits ears, doesn't block ear outside...
1602        sound quality bad... bad product... happy buy
1603    calls...mic good...but comfort bad music worst...
Name: COMMENTS, Length: 1604, dtype: object

In [7]:
# Strip punctuation from the reviews.
# NOTE(review): punctuation appears to be deleted rather than replaced by a space, so
# words separated only by punctuation get merged (e.g. "calls...mic" -> "callsmic").
data.remove_punctuation()

0       okay skeptical buy tight budget  honest little...
1       earphone worth pay for design earbud cause pai...
2       particularly people sensitive ears gets ear pa...
3       built quality  the lower wire durable nylon kn...
4       dont  start rating boat brand endorse lot mone...
                              ...                        
1599         good sound qualityand impressively good bass
1600                                                  osm
1601    earphones fits ears doesnt block ear outside s...
1602              sound quality bad bad product happy buy
1603             callsmic goodbut comfort bad music worst
Name: COMMENTS, Length: 1604, dtype: object

In [8]:
# Tokenise an independent deep copy so that `data` keeps the cleaned reviews as plain
# strings (the vectorizer cell consumes `data`, the token-based cells use `data_tokens`).
data_tokens = copy.deepcopy(data)

In [9]:
# Split each cleaned review into a list of word tokens (shown as the cell output).
data_tokens.tokenize()

0       [okay, skeptical, buy, tight, budget, honest, ...
1       [earphone, worth, pay, for, design, earbud, ca...
2       [particularly, people, sensitive, ears, gets, ...
3       [built, quality, the, lower, wire, durable, ny...
4       [dont, start, rating, boat, brand, endorse, lo...
                              ...                        
1599    [good, sound, qualityand, impressively, good, ...
1600                                                [osm]
1601    [earphones, fits, ears, doesnt, block, ear, ou...
1602      [sound, quality, bad, bad, product, happy, buy]
1603      [callsmic, goodbut, comfort, bad, music, worst]
Name: COMMENTS, Length: 1604, dtype: object

### ***Text representations*** ###

In [10]:
# Accumulates the text representations built in this section as (name, array) tuples.
representations = []

#### ***Most common tokens*** ####

In [11]:
# Preview the representation for the 10 most common tokens: the displayed frame has
# one row per review and one column per token (0/1 values in the rows shown).
ndt.most_common_tokens(data_tokens[:], 10)

Unnamed: 0,good,quality,sound,product,bass,ear,working,boat,price,earphones
0,0,0,1,1,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,1,0,0,0,0
3,1,1,1,1,1,1,0,1,1,1
4,1,0,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1599,1,0,1,0,1,0,0,0,0,0
1600,0,0,0,0,0,0,0,0,0,0
1601,0,0,0,0,0,1,0,0,1,1
1602,0,1,1,1,0,0,0,0,0,0


In [12]:
# Store one most-common-token matrix per vocabulary size as a (name, array) tuple.
top_most_common_tokens = [5, 10, 20, 50]
for top in top_most_common_tokens:
    token_matrix = ndt.most_common_tokens(data_tokens[:], top).to_numpy()
    representation_name = f'most_common_tokens_{top}'
    representations.append((representation_name, token_matrix))

#### ***Vectorizers*** ####

In [13]:
# Hyper-parameter grid for the bag-of-words / TF-IDF vectorizers.
vectorizers_list = [CountVectorizer, TfidfVectorizer]
# In scikit-learn a float min_df/max_df is a proportion of documents and an int is an
# absolute count, so the trailing `1` means "at least one document" while `1.0` below
# means "all documents".
min_df_list = [0.01, 0.1, 1]
max_df_list = [0.3, 0.5, 1.0]
# NOTE(review): 100 after 300 and 500 breaks the ascending pattern — confirm it is not meant to be 1000.
max_features_list = [300, 500, 100]

In [14]:
# Build one representation per (vectorizer, min_df, max_df, max_features) combination
# and keep it in `representations` as a (name, array) tuple, the same layout used for
# the most-common-token matrices.
vectorizer_grid = product(vectorizers_list, min_df_list, max_df_list, max_features_list)
for Vectorizer, min_df, max_df, max_features in vectorizer_grid:
    vectorizer_kwargs = {'min_df':min_df, 'max_df':max_df, 'max_features': max_features}
    embeddings = ndt.use_vectorizer(data[:], Vectorizer, vectorizer_kwargs)
    # The name encodes the vectorizer class as well, because the printed kwargs alone
    # do not distinguish CountVectorizer runs from TfidfVectorizer runs.
    representation_name = (
        f'{Vectorizer.__name__}_min_df_{min_df}_max_df_{max_df}_max_features_{max_features}'
    )
    # Previously `embeddings` was overwritten on every iteration and never stored.
    # assumes use_vectorizer returns a dense array, as the displayed output suggests — TODO confirm
    representations.append((representation_name, embeddings))
    print(vectorizer_kwargs)

{'min_df': 0.01, 'max_df': 0.3, 'max_features': 300}
{'min_df': 0.01, 'max_df': 0.3, 'max_features': 500}
{'min_df': 0.01, 'max_df': 0.3, 'max_features': 100}
{'min_df': 0.01, 'max_df': 0.5, 'max_features': 300}
{'min_df': 0.01, 'max_df': 0.5, 'max_features': 500}
{'min_df': 0.01, 'max_df': 0.5, 'max_features': 100}
{'min_df': 0.01, 'max_df': 1.0, 'max_features': 300}
{'min_df': 0.01, 'max_df': 1.0, 'max_features': 500}
{'min_df': 0.01, 'max_df': 1.0, 'max_features': 100}
{'min_df': 0.1, 'max_df': 0.3, 'max_features': 300}
{'min_df': 0.1, 'max_df': 0.3, 'max_features': 500}
{'min_df': 0.1, 'max_df': 0.3, 'max_features': 100}
{'min_df': 0.1, 'max_df': 0.5, 'max_features': 300}
{'min_df': 0.1, 'max_df': 0.5, 'max_features': 500}
{'min_df': 0.1, 'max_df': 0.5, 'max_features': 100}
{'min_df': 0.1, 'max_df': 1.0, 'max_features': 300}
{'min_df': 0.1, 'max_df': 1.0, 'max_features': 500}
{'min_df': 0.1, 'max_df': 1.0, 'max_features': 100}
{'min_df': 1, 'max_df': 0.3, 'max_features': 300}
{'min

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.27970441],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.61272165,
        0.        ]])

#### ***Word2Vec*** ####

In [67]:
# Train Word2Vec on the tokenised reviews and turn every review into one vector.
# NOTE: `size` is the gensim < 4.0 spelling of the embedding dimension; gensim >= 4.0
# renamed it to `vector_size`, so this call needs updating when gensim is upgraded.
model = Word2Vec(data_tokens[:], window = 3, size = 150)
# NOTE(review): the warnings printed by this cell look like NumPy's mean-of-empty-slice
# path, so some rows may be NaN (presumably reviews without any in-vocabulary token) —
# confirm and handle before feeding this to a classifier.
word2vec_embeddings = ndt.use_word2vec_model(model, data_tokens[:])
# Keep the result next to the other representations instead of only displaying it.
representations.append(('word2vec_window_3_size_150', word2vec_embeddings))
word2vec_embeddings

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


array([[ 0.07245913, -0.10201433, -0.09310213, ...,  0.10923203,
         0.09103949, -0.07374188],
       [ 0.06186942, -0.08502579, -0.07767051, ...,  0.0908356 ,
         0.07667576, -0.06243226],
       [ 0.07311751, -0.10194395, -0.09358924, ...,  0.10963823,
         0.09055211, -0.07266089],
       ...,
       [ 0.07827283, -0.10855212, -0.09853151, ...,  0.11672531,
         0.09624936, -0.07718303],
       [ 0.08094943, -0.11417437, -0.10395207, ...,  0.12159872,
         0.10113773, -0.08214863],
       [ 0.05016395, -0.06951857, -0.06385238, ...,  0.07266359,
         0.0616712 , -0.04873827]], dtype=float32)

#### ***Doc2Vec*** ####

In [77]:
# Prepare the Doc2Vec input from the tokenised reviews.
# NOTE(review): despite the helper's name, its result is used as the training corpus
# (it is passed straight to Doc2Vec below) — presumably a list of TaggedDocument; verify.
tagged_doccs = ndt.use_doc2vec_model(data_tokens[:])

In [None]:
# Train Doc2Vec on the tagged reviews and bind the fitted model to a name so later
# cells can reuse it; the bare expression on its own discarded the trained model.
doc2vec_model = Doc2Vec(tagged_doccs)
# Still display the model, as the original cell did.
doc2vec_model