In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Load the e-commerce text dataset from a CSV file (path is relative to the working directory).
df = pd.read_csv("Ecommerce_data.csv")

In [None]:
# Peek at the first rows to see which columns were loaded.
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [None]:
# Count rows per category to check how balanced the classes are.
df.label.value_counts()

Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: label, dtype: int64

In [None]:
# Encode each category name as an integer class id, stored in a new `label_no` column.
df["label_no"] = df["label"].map(
    {"Household": 0, "Electronics": 1, "Clothing & Accessories": 2, "Books": 3}
)

In [None]:
# Re-display the first rows to confirm the numeric label column is present.
df.head()

Unnamed: 0,Text,label,label_no
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2


In [None]:
# Create an unfitted TF-IDF vectorizer with default settings.
v = TfidfVectorizer()

In [None]:
# Look at the first four raw text entries.
df.Text[:4]

0    Urban Ladder Eisner Low Back Study-Office Comp...
1    Contrast living Wooden Decorative Box,Painted ...
2    IO Crest SY-PCI40010 PCI RAID Host Controller ...
3    ISAKAA Baby Socks from Just Born to 8 Years- P...
Name: Text, dtype: object

In [None]:
# Show the string labels next to their numeric encoding for the first four rows.
df.label[:4] , df.label_no[:4]

(0                 Household
 1                 Household
 2               Electronics
 3    Clothing & Accessories
 Name: label, dtype: object,
 0    0
 1    0
 2    1
 3    2
 Name: label_no, dtype: int64)

In [None]:
# Hold out 20% of the rows for testing; stratifying on the numeric label keeps
# the class proportions the same in both splits, and the fixed seed makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df["Text"],
    df["label_no"],
    test_size=0.2,
    random_state=42,
    stratify=df["label_no"],
)

In [None]:
# TF-IDF vectorizer (default settings) that is fitted on the training split in the next cell.
tfv = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the training text and convert that text to TF-IDF features.
x_train_tfv = tfv.fit_transform(x_train)

In [None]:
# Check what kind of container the vectorizer returned.
type(x_train_tfv)

scipy.sparse._csr.csr_matrix

In [None]:
# k-nearest-neighbours classifier that votes among the 5 closest training samples.
knn_model = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Train the KNN classifier on the TF-IDF features of the training split.
knn_model.fit(x_train_tfv,y_train)

In [None]:
# Look at the first four held-out texts.
x_test[:4]

10572    Deco Home Cotton Modern Arabesque Bedding, Kin...
23834    Sky Tech® High Speed External Memory Card Read...
13988    Allen Solly Men's Cotton Handkerchief Navy/Sky...
10777    Samriddhi Artificial Leaves Garlands/Creepers ...
Name: Text, dtype: object

In [None]:
# Vectorize the test text with the vectorizer that was fitted on the training
# split. Only `transform` is called here: refitting on the test data would
# build a different vocabulary, so the columns would no longer line up with
# the features the models were trained on (and test statistics would leak
# into the representation).
x_test_tfv = tfv.transform(x_test)


In [None]:
# Dimensions of the vectorized test matrix: (rows, features).
x_test_tfv.shape

(4800, 27377)

In [None]:
# Dimensions of the vectorized training matrix: (rows, features).
x_train_tfv.shape

(19200, 47298)

In [None]:
# Multinomial naive Bayes classifier with default settings.
md_model = MultinomialNB()

In [None]:
# Train the naive Bayes classifier on the TF-IDF features of the training split.
md_model.fit(x_train_tfv,y_train)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [None]:
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

def train_and_evaluate_model(pipeline_name, x_train, x_test, y_train, y_test, vectorizer, model_name, show_classification_report=True):
    """Fit a vectorizer + estimator pipeline and optionally report on test data.

    Args:
        pipeline_name: Not used by the function body; retained so existing
            positional calls keep working.
        x_train: Training inputs handed to ``Pipeline.fit``.
        x_test: Inputs to predict on when a report is requested.
        y_train: Training targets handed to ``Pipeline.fit``.
        y_test: True targets compared against the predictions in the report.
        vectorizer: Zero-argument callable (e.g. a vectorizer class) whose
            result becomes the ``'Vectorizer'`` pipeline step.
        model_name: Zero-argument callable (e.g. an estimator class) whose
            result becomes the ``'model_name'`` pipeline step.
        show_classification_report: When true, predict on ``x_test``, build a
            classification report against ``y_test`` and print it.

    Returns:
        A ``(model, score, model_predictions)`` tuple: the fitted pipeline,
        the classification report, and the test predictions. ``score`` and
        ``model_predictions`` are both ``None`` when
        ``show_classification_report`` is false.
    """
    model = Pipeline([
        ('Vectorizer', vectorizer()),
        ('model_name', model_name())
    ])
    model.fit(x_train, y_train)

    # Both results default to None so the return statement below never
    # touches an unbound name when the report is skipped.
    score = None
    model_predictions = None
    if show_classification_report:
        model_predictions = model.predict(x_test)
        score = classification_report(y_test, model_predictions)
        print(score)

    return model, score, model_predictions


In [None]:
# Default-configured KNN instance; it is passed as the first argument of the helper call in the next cell.
knn = KNeighborsClassifier()

In [None]:
# Train and evaluate a TF-IDF + KNN pipeline on the raw text splits. The
# vectorizer and estimator are passed as classes because the helper
# instantiates them itself; the first argument is not used by the helper.
# Note: this rebinds `knn_model` (previously the standalone classifier) to the
# fitted pipeline.
knn_model , knn_score , knn_prediction = train_and_evaluate_model(knn,x_train,x_test,y_train,y_test,TfidfVectorizer,KNeighborsClassifier)

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      1200
           1       0.97      0.96      0.96      1200
           2       0.98      0.98      0.98      1200
           3       0.98      0.96      0.97      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [None]:
# Default-configured naive Bayes instance; it is passed as the first argument of the helper call in the next cell.
mnm = MultinomialNB()

In [None]:
# Train and evaluate a TF-IDF + multinomial naive Bayes pipeline on the raw
# text splits. The vectorizer and estimator are passed as classes because the
# helper instantiates them itself; the first argument is not used by the helper.
mnm_model,mnm_score, mnm_prediction = train_and_evaluate_model(mnm,x_train,x_test,y_train,y_test,TfidfVectorizer,MultinomialNB)

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1200
           1       0.97      0.97      0.97      1200
           2       0.98      0.98      0.98      1200
           3       0.99      0.94      0.96      1200

    accuracy                           0.96      4800
   macro avg       0.97      0.96      0.96      4800
weighted avg       0.97      0.96      0.96      4800



In [None]:
# Show the first five test texts together with their true class ids.
x_test[:5],y_test[:5]

(10572    Deco Home Cotton Modern Arabesque Bedding, Kin...
 23834    Sky Tech® High Speed External Memory Card Read...
 13988    Allen Solly Men's Cotton Handkerchief Navy/Sky...
 10777    Samriddhi Artificial Leaves Garlands/Creepers ...
 11896    Seagate 2TB Backup Plus Slim (Blue) USB 3.0 Ex...
 Name: Text, dtype: object,
 10572    0
 23834    1
 13988    2
 10777    0
 11896    1
 Name: label_no, dtype: int64)

In [None]:
# First five class ids predicted by the KNN pipeline, for comparison with the true labels.
knn_prediction[:5]

array([0, 1, 2, 0, 1], dtype=int64)

In [None]:
# First five class ids predicted by the naive Bayes pipeline, for comparison with the true labels.
mnm_prediction[:5]

array([0, 1, 2, 0, 1], dtype=int64)