In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import re

In [2]:
data = pd.read_csv("train.csv")

## Feature Engineering {0: Beauty, 1: Fashion, 2: Mobile}

In [3]:
def newCat(x):
    if x == 'm':
        return 2
    elif x == 'f':
        return 1
    else:
        return 0

data['NewCat'] = data.image_path.copy()
data.NewCat = data.NewCat.apply(lambda x: newCat(x[0]))

## To Categorical and Label in Binary format

In [4]:
data['Category'] = data['Category'].astype('category')

## Splitting DataFrame into Mobile

In [5]:
mobile_df = data[data.NewCat==2]
del data

## Stopwords and Tokenize

In [6]:
stop_words = set(stopwords.words('english', 'indonesia'))

## Train Test Split

In [7]:
trainset, valset = train_test_split(mobile_df, test_size=0.3, random_state=1, stratify=mobile_df.Category)

## Model

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier

rf = RandomForestClassifier(random_state=1, n_estimators=100)
bc = RandomForestClassifier(random_state=1, n_estimators=100)
gc = GradientBoostingClassifier(random_state=1, n_estimators=100)
knn = KNeighborsClassifier()
lg = LogisticRegression(random_state=1, max_iter=100, multi_class='auto')

vectorizer = TfidfVectorizer(stop_words=stop_words, binary=False, ngram_range=(1,1),
                                 preprocessor=lambda x: re.sub(r'(\d[\d\.])+', 'NUM', x.lower()))
x_train = vectorizer.fit_transform(trainset.title)

In [25]:
rf.fit(x_train, trainset.Category)
bc.fit(x_train, trainset.Category)
gc.fit(x_train, trainset.Category)
knn.fit(x_train, trainset.Category)
lg.fit(x_train, trainset.Category)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [26]:
x_val = vectorizer.transform(valset.title)
print(accuracy_score(rf.predict(x_val), valset.Category))
print(accuracy_score(bc.predict(x_val), valset.Category))
print(accuracy_score(gc.predict(x_val), valset.Category))
print(accuracy_score(knn.predict(x_val), valset.Category))
print(accuracy_score(lg.predict(x_val), valset.Category))

0.8063369300817065
0.8063369300817065
0.7954843136031934
0.7542568452566581
0.7944655814050188


In [27]:
vc = VotingClassifier(estimators=[('rf', rf), ('bc', bc), ('gc', gc), ('knn', knn), ('lg', lg)])
vc.fit(x_train, trainset.Category)



VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_we...e, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [28]:
print(accuracy_score(vc.predict(x_val), valset.Category))

0.80808332813572


In [29]:
test_data = pd.read_csv("test.csv")
test_data['NewCat'] = test_data.image_path.copy()
test_data.NewCat = test_data.NewCat.apply(lambda x: newCat(x[0]))
mobile_test = test_data[test_data.NewCat==2]
x_test = vectorizer.transform(mobile_test['title'])

In [35]:
test_predictions = vc.predict(x_test)
output = pd.DataFrame({'itemid': mobile_test['itemid'],
                       'Category': test_predictions})
output.to_csv('mobile.csv', index=False)