In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from sklearn.externals import joblib
import re

data = pd.read_csv("train.csv")

def newCat(x):
    if x=='m':
        return 2
    elif x=='f':
        return 1
    else:
        return 0

stop_words = set(stopwords.words('english', 'indonesia'))
data['NewCat'] = data.image_path.copy()
data.NewCat = data.NewCat.apply(lambda x: newCat(x[0]))
data.Category = data.Category.astype('category')

fashion_df = data[data.NewCat==1]
stop_words = set(stopwords.words('english', 'indonesia'))
trainset, valset = train_test_split(fashion_df, test_size=0.3, random_state=1)
fashion_x_train = trainset.title
y_train = trainset.Category
fashion_x_val = valset.title
y_val = valset.Category
n_gram=(1,1)
vectorizer=TfidfVectorizer(stop_words=stop_words, #max_features=vocab_size, 
                           preprocessor=lambda x: re.sub(r'(\d[\d\.])+', 'NUM', x.lower()),
                           binary=True, ngram_range=n_gram)
x_train = vectorizer.fit_transform(fashion_x_train)
x_val = vectorizer.transform(fashion_x_val)

In [4]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

rf = RandomForestClassifier(random_state=1, n_estimators=100)
rf.fit(x_train, y_train)
bc = BaggingClassifier(random_state=1, n_estimators=100)
bc.fit(x_train, y_train)
gc = GradientBoostingClassifier(random_state=1, n_estimators=100)
gc.fit(x_train, y_train)
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
lg = LogisticRegression(random_state=1, max_iter=100, multi_class='auto')
lg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [5]:
predictions_rf = rf.predict(x_val)
print(accuracy_score(predictions_rf, y_val))
predictions_bc = bc.predict(x_val)
print(accuracy_score(predictions_bc, y_val))
predictions_gc = gc.predict(x_val)
print(accuracy_score(predictions_gc, y_val))
predictions_knn = knn.predict(x_val)
print(accuracy_score(predictions_knn, y_val))
predictions_lg = lg.predict(x_val)
print(accuracy_score(predictions_lg, y_val))

0.6411979790930193
0.6367222466659587
0.590963572089636
0.5472227700990729
0.6025853044256649


In [6]:
vc = VotingClassifier(estimators=[('bc',bc),('rf',rf),('lg',lg),('gc',gc),('knn',knn)])
vc.fit(x_train, y_train)
print(accuracy_score(vc.predict(x_val), y_val))



0.6413496988363095


In [8]:
test_data = pd.read_csv("test.csv")
test_data['NewCat'] = test_data.image_path.copy()
test_data.NewCat = test_data.NewCat.apply(lambda x: newCat(x[0]))
fashion_test = test_data[test_data.NewCat==1]
x_test = vectorizer.transform(fashion_test.title)

In [9]:
test_preds = vc.predict(x_test)
output = pd.DataFrame({'itemid': fashion_test.itemid,
                      'Category': test_preds})
output.to_csv('fashion.csv', index=False)