# Proof-of-Concept Model

In [1]:
import pandas as pd
import numpy as np

## Read data and show sample

In [2]:
df = pd.read_csv("../data/raw/Test_Pandas.csv")
df.head()

Unnamed: 0,itemid,shopid,item_name,item_description,item_variation,price,stock,category,cb_option,is_preferred,sold_count,item_creation_date
0,1925574,210000,Golden mobile numbers,Unregistered prepaid cards1) 93355333 selling...,{},400.0,1,Mobile & Gadgets,0,0,0,1/10/15 0:24
1,1925617,210000,Golden mobile numbers,Unregistered prepaid cards1) 93355888 selling...,{},400.0,1,Mobile & Gadgets,0,0,0,1/10/15 0:26
2,943600,210000,Golden Mobile Numbers,Unregistered prepaid cards. Can port to post p...,{},8.0,1,Mobile & Gadgets,0,0,0,25/8/15 21:02
3,1064405,210000,Golden Mobile Numbers,Unregistered prepaid cards. Can port to post p...,{},8.0,1,Mobile & Gadgets,0,0,0,30/8/15 20:16
4,20046620,760000,101% AUTHENTIC BASEBALL CAPS,"PREORDER Takes about 23 weeks to arrive, will ...","{NEWERA BLACK: 35.0, NIKE SWOOSH DENIM: 35.0, ...",35.0,300,Women's Apparel,0,0,0,9/5/16 1:14


In [3]:
for item in df.iloc[4, :].to_list():
    print(item, '\n')

20046620 

760000 

101% AUTHENTIC BASEBALL CAPS 

PREORDER Takes about 23 weeks to arrive, will provide receipt of order & estimated date of arrival upon confirmationAny other designs youre looking for feel free to ask! Ill help you check the website to see if i am able to ship it for you (: Sold more than 10 caps with good reviews in Carousell, 100% authentic and can be trusted. From a denmark store which is an official rattle location for yeezyboost Check listings for other authentic baseball caps too! #Nike #baseball #baseballcap #newera #neweracap #authentic #swoosh #newyork #yankee  

{NEWERA BLACK: 35.0, NIKE SWOOSH DENIM: 35.0, NIKE SWOOSH BLACK: 35.0, NIKE SMALL SWOOSH: 35.0, NEWERA WHITE: 35.0, NEWERA MAROON: 35.0} 

35.0 

300 

Women's Apparel 

0 

0 

0 

9/5/16 1:14 



## Get rid of NaN values

In [4]:
df.loc[df['item_name'].isna(), 'item_name'] = ""
df.loc[df['item_description'].isna(), 'item_description'] = ""
df.loc[df['item_variation'].isna(), 'item_variation'] = ""
df.dropna(inplace=True, subset=['category'])

## Scale continuous values (price and stock)

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(df[['price', 'stock']].to_numpy())
df[['price', 'stock']] = scaler.transform(df[['price', 'stock']].to_numpy())

In [6]:
import pickle

with open('../scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

## Break down item creation date

In [7]:
df['item_creation_date'] = pd.to_datetime(df['item_creation_date'])
df['year'] = df['item_creation_date'].dt.year
df['month'] = df['item_creation_date'].dt.month
df['day'] = df['item_creation_date'].dt.day
df.drop(columns=['item_creation_date'], inplace=True)

## Assign labels to the target feature (category)

In [8]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(df['category'].to_numpy())
df['category'] = encoder.transform(df['category'].to_numpy())

with open('../encoder.pkl', 'wb') as file:
    pickle.dump(encoder, file)

## Prepare X and y datasets

In [9]:
numerical_features = [
    'itemid',
    'shopid',
    'price',
    'stock',
    'cb_option',
    'is_preferred',
    'sold_count',
    'year',
    'month',
    'day'
]

X = df[numerical_features].to_numpy()
y = df['category'].to_numpy()

## Vectorize text with TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

def vectorize_text(name, text, max_features=None):
    vectorizer = TfidfVectorizer(max_features=max_features)
    vectorizer.fit(text)
    with open(f'../vectorizer_{name}.pkl', 'wb') as file:
            pickle.dump(vectorizer, file)
    return vectorizer.transform(text).toarray()

In [15]:
X = np.concatenate(
    (
        X,
        vectorize_text('item_name', df['item_name'], max_features=100),
        vectorize_text('item_description', df['item_description'], max_features=100),
        vectorize_text('item_variation', df['item_variation'], max_features=100)
    ),
    axis=1
)

## Train POC model (Decision Tree)

In [46]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = DecisionTreeClassifier().fit(X_train, y_train)

## Evaluate Results

In [47]:
# print classfication report
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      5415
           1       0.79      0.78      0.79       791
           2       0.74      0.78      0.76      1107
           3       0.76      0.63      0.69       122
           4       0.77      0.72      0.74      1366
           5       0.90      0.89      0.89      5503
           6       0.83      0.84      0.83      5997
           7       0.66      0.68      0.67       500
           8       0.95      0.95      0.95     10889
           9       0.97      0.97      0.97      4861
          10       0.97      0.97      0.97     10655
          11       0.75      0.76      0.75       511
          12       0.97      0.97      0.97     12438
          13       0.78      0.74      0.76       384
          14       0.79      0.75      0.77      1592
          15       0.57      0.29      0.38        28
          16       0.89      0.89      0.89      5982
          17       0.97    

**Conclusion**  
Decision tree as a POC model has demonstrated a promising performance on the testing dataset: most of the classes have more than 0.5 F1 score. It means that it makes sense to continue the project and build a more advanced model powered by better data preparation and modeling. 