In [8]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, top_k_accuracy_score
from sklearn.pipeline import Pipeline

In [2]:
import gcsfs

In [3]:
# GCP project that the GCS filesystem client below is created against.
PROJECT_ID = 'groupby-development'

In [4]:
# Shared GCS client; later cells open the pickled label maps through it.
fs = gcsfs.GCSFileSystem(project=PROJECT_ID)

In [5]:
def _read_pickle(gcs_path):
    """Open `gcs_path` through the shared `fs` client and return the unpickled object."""
    # NOTE(review): pickle.load can execute arbitrary code embedded in the file;
    # only point this at buckets whose contents are trusted.
    with fs.open(gcs_path, 'rb') as handle:
        return pickle.load(handle)


# Two pickled lookup tables stored side by side under the same GCS prefix.
label_mapping = 'gs://gbi_ml/classification_hackathon/label2id.pickle'
id_mapping = 'gs://gbi_ml/classification_hackathon/id2label.pickle'

label2id = _read_pickle(label_mapping)
id2label = _read_pickle(id_mapping)
    
from collections import defaultdict


def def_value():
    """Return the sentinel id (-1) handed out for keys that are absent from `label2id`."""
    return -1


# Lookup with a fallback: keys present in `label2id` resolve to their stored id,
# anything else yields -1. The factory has to be `def_value` -- using `int`
# would give unknown keys the value 0, which would be indistinguishable from a
# genuine id of 0 if the mapping contains one.
d = defaultdict(def_value, label2id)

In [6]:
# Common GCS prefix for every file this notebook reads.
base_dir = 'gs://gbi_ml/classification_hackathon/'

# Load the "*_ready" files, which carry a `label` column.
# The sibling files `bbby_train_new.csv` / `bbby_test_new.csv` under the same
# prefix are presumably the pre-label exports -- TODO confirm; they are not
# read here.
train_path = base_dir + 'bbby_train_ready.csv'  # with labels
test_path = base_dir + 'bbby_test_ready.csv'  # with labels

df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
df.head()

Unnamed: 0,bucket_id,external_id,bucket_name,product_id,raw_product_name,raw_product_description,label
0,6597,200100276,Traps,1588676851,"Evergreen Fruit Fly Trap, Red","Evergreen Fruit Fly Trap, Red You'll be able t...",613
1,6597,200100281,Traps,1588676855,"Evergreen Fruit Fly Trap, Green","Evergreen Fruit Fly Trap, Green You'll be able...",613
2,6597,200200086,Traps,1751333665,Evertone HomeMax Chemical-Free Mosquito Killer,Tired of flying insects in your space? The mos...,613
3,6597,200207644,Traps,1872638801,"Woodstream Terro (#T300) Liquid Ant Bait, Pre-...","Terro (#T300) Liquid Ant Bait, Pre-filled, RTU...",613
4,6597,200207650,Traps,1949868418,Woodstream Safer Japanese Beetle Trap w/ Bag &...,Safer Japanese Beetle Trap w/ Bag & Bait Japan...,613


In [23]:
# One-off preparation, kept disabled for provenance: these statements show how
# the `*_ready.csv` files were written (a `label` column is added, and the test
# columns are renamed before saving).
# NOTE(review): the test half reads `test_path` expecting the columns
# 'Manual Classification Bucket' and 'External ID', i.e. presumably the
# `bbby_test_new.csv` file rather than the `*_ready` one -- confirm before
# re-running.
# df['label'] = df.bucket_name.apply(lambda x: label2id[x])

# df[
#     ['bucket_id', 'external_id', 'bucket_name','product_id','raw_product_name', 'raw_product_description', 'label']
# ].to_csv('gs://gbi_ml/classification_hackathon/bbby_train_ready.csv', index=False)

# test_df = pd.read_csv(test_path)
# test_df['label'] = test_df['Manual Classification Bucket'].apply(lambda x: d[x])

# test_df.rename(columns={'Manual Classification Bucket':'bucket_name', 'External ID':'external_id'})[
#     ['external_id', 'bucket_name','raw_product_name', 'raw_product_description', 'label']].to_csv('gs://gbi_ml/classification_hackathon/bbby_test_ready.csv', index=False)

In [52]:
# Cap every label at its first 30 rows (in frame order); labels with fewer
# rows are kept whole.
filtered = df.groupby(by='label').head(n=30)

In [None]:
# Shuffle the capped subset with a fixed seed.
# NOTE(review): this rebinds `filtered` to a shuffled copy of itself, so running
# the cell a second time shuffles the already-shuffled frame and yields a
# different row order than a single run.
filtered = filtered.sample(frac=1, random_state=42)

In [53]:
# (rows, columns) of the capped subset.
filtered.shape

(19951, 13)

In [54]:
# Alias only -- `train_df` and `filtered` refer to the same object; no copy is made.
train_df = filtered

In [9]:
# Baseline text classifier: TF-IDF features (library defaults) feeding a
# multinomial naive Bayes model.
pipe = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer()),
        ('classifier', MultinomialNB()),
    ]
)

In [61]:
# Disabled scratch experiment with a hand-configured vectorizer.
# NOTE(review): the features are built from `train_df` while the labels are the
# first 1000 rows of `df`, so the two would not line up row-for-row if this
# were re-enabled as written.
# # for testing vectorizer
# tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
# df_features = tfidf.fit_transform(train_df.raw_product_description).astype(np.float32).toarray()
# clf = MultinomialNB().fit(df_features, df.label[:1000])
# clf.predict(df_features[-10:])

In [10]:
# Fit on every row of the training frame `df` (not on the capped `train_df` subset).
pipe.fit(X=df['raw_product_description'], y=df['label'])

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier', MultinomialNB())])

In [64]:
# Spot-check: predicted label ids for the first 100 descriptions in `train_df`.
pipe.predict(train_df['raw_product_description'][:100])

array([613, 231, 613, 613, 613, 613, 613, 613, 613, 613, 613, 613, 613,
       613, 613, 613, 613, 613, 613, 613, 613, 613, 613, 613, 613, 613,
       613, 613, 613, 613, 632, 632, 632, 632, 632, 632, 632, 632, 632,
       632, 632, 632, 632, 632, 632, 632, 632, 632, 632, 632, 632, 632,
       632, 632, 493, 632, 632, 632, 632, 632, 491,  22,  22, 574,  22,
        22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,
        22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  82,
        82, 491,  82,  82,  82,  82,  82,  82,  82])

In [12]:
# Accuracy on the same rows the pipeline was fitted on, so this is a training
# score rather than a generalisation estimate. Arguments follow sklearn's
# documented (y_true, y_pred) order.
accuracy_score(df.label, pipe.predict(df.raw_product_description))

0.7940859726745468

In [13]:
# Accuracy on the labelled test file; arguments follow sklearn's documented
# (y_true, y_pred) order.
accuracy_score(test_df.label, pipe.predict(test_df.raw_product_description))

0.49766439094502335

In [18]:
# All keys of `id2label`, in the dict's iteration order.
labels = list(id2label)

In [20]:
# Top-3 accuracy on the test file. `labels` has to name the columns of the
# predict_proba matrix, and those columns follow the fitted classifier's
# `classes_` -- so take them from the pipeline itself instead of from an
# external mapping that could differ in order or content.
top_k_accuracy_score(
    test_df.label,
    pipe.predict_proba(test_df.raw_product_description),
    k=3,
    labels=pipe.classes_,
)

0.6614136851291001