In [6]:
import numpy as np 
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [7]:
df = pd.read_csv('whole_df.csv')

In [10]:
# Here I downsampled the data to have balanced classes
from sklearn.utils import shuffle

shuffled = df
clothing = shuffled[shuffled['type'] == 'clothing'].sample(n=9000)
shoes = pd.read_csv('shoes.csv', names=['name', 'type']).sample(n=9000)
beauty = shuffled[shuffled['type'] == 'beauty'].sample(n=9000)
accessories = shuffled[shuffled['type'] == 'accessories']
watches = pd.read_csv('watches.csv', names=['name', 'type']).sample(n=9000)
#pet = shuffled[shuffled['type'] == 'pet supplies'].sample(n=5000)
tools = shuffled[shuffled['type'] == 'tools'].sample(n=9000)
houseware = shuffled[shuffled['type'] == 'houseware'].sample(n=9000)

concated = pd.concat([clothing, shoes, accessories, beauty,
                      watches, tools, houseware], ignore_index=True)
# Shuffle the dataset
concated = shuffle(concated)
concated = concated[['name', 'type']].apply(lambda x: x.astype(str).str.lower())

In [11]:
concated.type.value_counts()

beauty         9000
houseware      9000
watches        9000
clothing       9000
tools          9000
shoes          9000
accessories    8120
Name: type, dtype: int64

In [12]:
concated.isna().sum()

name    0
type    0
dtype: int64

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, 
                        min_df=5, norm='l2', 
                        ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(concated[:40000].name).toarray()
features.shape

(40000, 13940)

In [22]:
labels = concated[:40000].type

In [23]:
X = features
y = labels

In [24]:
X.shape

(40000, 13940)

In [25]:
#Dimenionality reduction. Only using the 100 best features per category
from sklearn.decomposition import PCA
pca = PCA(n_components= 100,random_state=3)
X = pca.fit_transform(X)
X.shape

(40000, 100)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

'''Metrics/Evaluation'''
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix


models = [RandomForestClassifier(n_estimators=200, random_state=0),
              XGBClassifier(),
              LogisticRegression(random_state=0),
              MultinomialNB(),
              KNeighborsClassifier(),
              LinearSVC()]
CV = 3
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features[:1000], labels[:1000], scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

              
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

recommend you that don't use Naive Bayes with SVD or other matrix factorization because Naive Bayes based on applying Bayes' theorem with strong (naive) independence assumptions between the features. Use other classifier, for example

In [26]:
# Creating a dict of the models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix



model_dict = {'Random Forest': RandomForestClassifier(n_estimators=200, random_state=0),
              'XGBClassifier': XGBClassifier(),
              'LogisticRegression': LogisticRegression(random_state=0),
              'K Nearest Neighbor': KNeighborsClassifier(),
              'linearSVM': LinearSVC()}

# Train test split 
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.2,
                                                    random_state=3)

# Function to get the scores for each model in a df
def model_score_df(model_dict):
    model_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
    for k, v in model_dict.items():
        model_name.append(k)
        v.fit(X_train, y_train)
        y_pred = v.predict(X_test)
        ac_score_list.append(accuracy_score(y_test, y_pred))
        p_score_list.append(precision_score(y_test, y_pred, average='macro'))
        r_score_list.append(recall_score(y_test, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_test, y_pred, average='macro'))
        model_comparison_df = pd.DataFrame(
            [model_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
        model_comparison_df.columns = [
            'model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
        model_comparison_df = model_comparison_df.sort_values(
            by='f1_score', ascending=False)
    return model_comparison_df


model_score_df(model_dict)

Unnamed: 0,model_name,accuracy_score,precision_score,recall_score,f1_score
0,Random Forest,0.806375,0.805148,0.804082,0.804047
1,XGBClassifier,0.77125,0.770777,0.768726,0.76865
4,linearSVM,0.77,0.767639,0.768013,0.765656
2,LogisticRegression,0.764375,0.765191,0.762002,0.761903
3,K Nearest Neighbor,0.751125,0.751922,0.749633,0.750553


In [27]:
names = pd.read_csv('ecommerce_product_names.csv')
names = names.apply(lambda x: x.astype(str).str.lower())

In [28]:
X_test = tfidf.fit_transform(names['Product Name']).toarray()
X_test = pca.fit_transform(X_test)

In [29]:
model = RandomForestClassifier()
model.fit(X, y)
y_pred = model.predict(X_test)

In [32]:
model2 = LinearSVC()
model2.fit(X, y)
y_pred2 = model2.predict(X_test)

In [30]:
names['prediction'] = y_pred

In [33]:
names['prediction_2'] = y_pred2

In [20]:
names.to_csv('pred_ml.csv', index = None)

In [34]:
names[80:]

Unnamed: 0,Product Name,prediction,prediction_2
80,t star uft-tsw-005-bk-br analog watch - for boys,tools,houseware
81,rialto boots,shoes,beauty
82,kielz ladies boots,shoes,beauty
83,"alfajr wy16b youth digital watch - for men, boys",accessories,houseware
84,la briza ashley boots,accessories,beauty
85,tag heuer cau1116.ba0858 formula 1 analog watc...,houseware,houseware
86,salt n pepper 13-019 femme black boots boots,accessories,beauty
87,shuz touch boots,clothing,beauty
88,wrangler skanders fit men's jeans,watches,houseware
89,salt n pepper 14-664 denny black boots boots,shoes,beauty
