In [47]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from sklearn.externals import joblib
import re

data = pd.read_csv("train.csv")

def newCat(x):
    if x=='m':
        return 2
    elif x=='f':
        return 1
    else:
        return 0

stop_words = set(stopwords.words('english', 'indonesia'))
data['NewCat'] = data.image_path.copy()
data.NewCat = data.NewCat.apply(lambda x: newCat(x[0]))
data.Category = data.Category.astype('category')

In [62]:
beauty_df = data[data.NewCat==0]
trainset, beauty_val = train_test_split(beauty_df, test_size=0.3, random_state=1, stratify=beauty_df.Category)
beauty_x_train = trainset.title
beauty_y_train = trainset.Category
beauty_vec=TfidfVectorizer(stop_words=stop_words, #max_features=1450, 
                           preprocessor=lambda x: re.sub(r'(\d[\d\.])+', 'NUM', x.lower()),
                           binary=False, ngram_range=(1,1))
beauty_x_train = beauty_vec.fit_transform(beauty_x_train)
beauty_x_val = beauty_vec.transform(beauty_val.title)
del beauty_df
print(beauty_x_train.shape)

(200608, 19260)


In [63]:
fashion_df = data[data.NewCat==1]
trainset, fashion_val = train_test_split(fashion_df, test_size=0.3, random_state=1, stratify=fashion_df.Category)
fashion_x_train = trainset.title
fashion_y_train = trainset.Category
fashion_vec=TfidfVectorizer(stop_words=stop_words, #max_features=1450, 
                           preprocessor=lambda x: re.sub(r'(\d[\d\.])+', 'NUM', x.lower()),
                           binary=False, ngram_range=(1,1))
fashion_x_train = fashion_vec.fit_transform(fashion_x_train)
fashion_x_val = fashion_vec.transform(fashion_val.title)
del fashion_df
print(fashion_x_train.shape)

(153791, 22133)


In [64]:
mobile_df = data[data.NewCat==2]
trainset, mobile_val = train_test_split(mobile_df, test_size=0.3, random_state=1, stratify=mobile_df.Category)
mobile_x_train = trainset.title
mobile_y_train = trainset.Category
mobile_vec=TfidfVectorizer(stop_words=stop_words, #max_features=1450, 
                           preprocessor=lambda x: re.sub(r'(\d[\d\.])+', 'NUM', x.lower()),
                           binary=False, ngram_range=(1,1))
mobile_x_train = mobile_vec.fit_transform(mobile_x_train)
mobile_x_val = mobile_vec.transform(mobile_val.title)
del mobile_df, trainset
print(mobile_x_train.shape)

(112231, 14868)


## Training

In [65]:
from sklearn.ensemble import RandomForestClassifier
beauty_rf = RandomForestClassifier(random_state=1, n_jobs=-1, n_estimators=3000)
fashion_rf = RandomForestClassifier(random_state=1, n_jobs=-1, n_estimators=3000)
mobile_rf = RandomForestClassifier(random_state=1, n_jobs=-1, n_estimators=2000)

In [66]:
beauty_rf.fit(beauty_x_train, beauty_y_train)
#joblib.dump(beauty_rf, "beauty_model.sav")
fashion_rf.fit(fashion_x_train, fashion_y_train)
#joblib.dump(fashion_rf, "fashion_model.sav")
mobile_rf.fit(mobile_x_train, mobile_y_train)
#joblib.dump(mobile_rf, "mobile_model.sav")

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

## Check Validation Scores

In [67]:
predict_beauty = beauty_rf.predict(beauty_x_val)
predict_fashion = fashion_rf.predict(fashion_x_val)
predict_mobile = mobile_rf.predict(mobile_x_val)

In [68]:
print(accuracy_score(predict_beauty, beauty_val.Category))
print(accuracy_score(predict_fashion, fashion_val.Category))
print(accuracy_score(predict_mobile, mobile_val.Category))

0.7711078801977319
0.6435496351140174
0.8071685482026654


In [26]:
testdata = pd.read_csv("test.csv")
testdata['NewCat'] = testdata.image_path.copy()
testdata['NewCat'] = testdata['NewCat'].apply(lambda x: newCat(x[0]))

beauty_test = testdata[testdata.NewCat==0]
fashion_test = testdata[testdata.NewCat==1]
mobile_test = testdata[testdata.NewCat==2]
    
beauty_x_test = beauty_vec.transform(beauty_test.title)
fashion_x_test = fashion_vec.transform(fashion_test.title)
mobile_x_test = mobile_vec.transform(mobile_test.title)

In [27]:
predict_beauty = beauty_rf.predict(beauty_x_test)
predict_fashion = fashion_rf.predict(fashion_x_test)
predict_mobile = mobile_rf.predict(mobile_x_test)
temp1 = pd.DataFrame({'itemid':beauty_test.itemid, 'predictions':predict_beauty})
temp2 = pd.DataFrame({'itemid':fashion_test.itemid, 'predictions':predict_fashion})
temp3 = pd.DataFrame({'itemid':mobile_test.itemid, 'predictions':predict_mobile})
temp = temp1.append(temp2)
temp = temp.append(temp3)
output = temp.merge(testdata, how='inner')
output.to_csv('submission3.csv')