In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# df = pd.read_csv("Ecommerce_data.csv",header=None)
df = pd.read_csv("Ecommerce_data.csv")
# df.columns=["label", 'text']
df.shape

(24000, 2)

In [14]:
df.head(5)

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [15]:
df["label"].value_counts()

Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: label, dtype: int64

In [16]:
df.isnull().sum()

Text     0
label    0
dtype: int64

In [17]:
df["label_num"] = df["label"].map({
    "Household": 0,
    'Books': 1,
    "Electronics": 2,
    "Clothing & Accessories": 3})
df.head()

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3


In [18]:
# using preprocessing text
import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [20]:
df["preprocessed_txt"] = df["Text"].apply(preprocess)

In [21]:
df

Unnamed: 0,Text,label,label_num,preprocessed_txt
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0,Urban Ladder Eisner low Study Office Computer ...
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0,contrast live Wooden Decorative Box Painted Bo...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2,IO Crest SY PCI40010 PCI raid Host Controller ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3,ISAKAA Baby Socks bear 8 Years- Pack 4 6 8 12 ...
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3,Indira Designer Women Art Mysore Silk Saree Bl...
...,...,...,...,...
23995,Marvel Physics MCQ's for MHT - CET,Books,1,Marvel Physics MCQ MHT CET
23996,Internet Download Manager | Lifetime License |...,Books,1,internet Download Manager | Lifetime License |...
23997,Sadhubela's Handcrafted Iron Degchi Handi Pot ...,Household,0,Sadhubela Handcrafted Iron Degchi Handi Pot Dh...
23998,Audio-Technica AT-LP60 Automatic Belt Driven D...,Electronics,2,Audio Technica LP60 Automatic Belt Driven dj t...


In [50]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.Text, 
    df.label_num,
    test_size = 0.2,
    random_state = 2022, 
    stratify=df.label_num
)

### attempt 1

In [51]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ("Vectorizer_tfidf", TfidfVectorizer()),
    ("Multi NB", MultinomialNB())
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

import pickle
filename = 'nive_bayes.sav'
with open(filename, 'wb') as file:
    pickle.dump(clf, file)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      1200
           1       0.98      0.92      0.95      1200
           2       0.97      0.97      0.97      1200
           3       0.97      0.99      0.98      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



### spacy word vector 

In [24]:
nlp = spacy.load("en_core_web_lg")

In [44]:
df["vector"] = df['preprocessed_txt'].apply(lambda text: nlp(text).vector)

In [26]:
df.head()

Unnamed: 0,Text,label,label_num,preprocessed_txt,vector
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0,Urban Ladder Eisner low Study Office Computer ...,"[-1.4392173, 1.2322634, -2.273993, 0.97961915,..."
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0,contrast live Wooden Decorative Box Painted Bo...,"[-2.622166, -1.3256572, -0.6779492, 0.16823775..."
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2,IO Crest SY PCI40010 PCI raid Host Controller ...,"[-0.4627019, 0.44438642, -1.134928, 2.2068477,..."
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3,ISAKAA Baby Socks bear 8 Years- Pack 4 6 8 12 ...,"[-1.8435141, 0.44474095, -3.9212053, 0.4848093..."
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3,Indira Designer Women Art Mysore Silk Saree Bl...,"[-2.1705642, -1.6045142, 1.3854641, -0.5505861..."


In [45]:
X_train, X_test, y_train, y_test = train_test_split(
    df.vector.values,
    df.label_num,
    test_size = 0.2,
    random_state=2022
)

In [46]:
X_train

array([array([-2.0110219 ,  0.29700336, -2.0252    ,  2.806578  ,  0.34821546,
              -0.5567222 ,  1.0967256 ,  2.4004376 , -1.8195044 , -2.7795434 ,
              -0.02692562, -0.9998121 , -0.99972224,  0.5445535 , -0.2385189 ,
              -2.1097655 ,  1.6331289 ,  1.7362334 , -0.13627216,  0.99688774,
              -0.4646667 ,  1.4832389 ,  0.12066444,  1.428818  , -2.1776211 ,
               0.23449887, -2.2761664 , -1.3172022 , -0.5065711 , -0.8857531 ,
               0.19751327, -0.63820446, -0.4777189 , -3.2014568 , -0.5428066 ,
              -2.4098887 ,  0.85351443, -0.06973331,  0.81136775,  0.13047218,
               1.6997164 , -0.07518331, -3.1738567 ,  1.783911  , -0.3784333 ,
              -0.9071262 , -0.39553377,  2.639501  ,  0.18568671,  0.01405891,
               1.3728933 , -0.27480447, -0.82680225,  1.5900234 ,  1.4655917 ,
               0.12182508,  0.6905112 ,  1.6347854 , -1.8865877 , -1.7591956 ,
              -0.36734048,  1.64174   , -2.6944153 ,

In [47]:
import numpy as np 
X_train_2d = np.stack(X_train)
X_test_2d  = np.stack(X_test)

In [48]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

clf =MultinomialNB()
clf.fit(scaled_train_embed, y_train)
y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.86      0.80      1228
           1       0.96      0.89      0.92      1190
           2       0.90      0.86      0.88      1179
           3       0.88      0.85      0.86      1203

    accuracy                           0.86      4800
   macro avg       0.87      0.86      0.87      4800
weighted avg       0.87      0.86      0.86      4800



### attempt 3
* Kmeans clustering

In [39]:
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans

In [55]:
list(X_test)[1]

'GOTOTOP Classical Retro Cotton & PU Leather Neck Shoulder Strap Anti-Slip for SLR DSLR Cameras (Charcoal Grey) Colour:Charcoal Grey   Specifications: Material: Cotton + PU Leather  Color: Charcoal Grey, Brown (as show in the pictures)  Weight: approx. 40g  Main Belt Length: approx. 70cm/27.55"  Width: approx. 3.5cm/1.37"  Fit For: All kind of brand SLR cameras, Part of micro single cameras  Package Included: 1 x camera shoulder neck strap (The camera is not included)'