In [1]:
from __future__ import division
import numpy as np
import xgboost as xgb
import pandas as pd



In [2]:
# Load the labelled training set: a 'label' column plus the pixel0..pixel783 feature columns.
mnist = pd.read_csv('dataset/train.csv')

In [3]:
# Preview the first five rows to confirm the column layout.
mnist.head(5)

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Target vector: the 'label' column as a NumPy array.
mnist_class = mnist['label'].values

In [5]:
# Expect a 1-D array with one label per training row.
mnist_class.shape

(42000,)

In [6]:
# Feature matrix: every column except 'label', as a NumPy array.
mnist_data = mnist.drop('label', axis=1).values

In [7]:
# Expect (rows, 784): one column per pixel feature.
mnist_data.shape

(42000, 784)

In [8]:
from sklearn.model_selection import train_test_split 

In [9]:
# Hold out 25% of the labelled rows for evaluation.
# random_state pins the shuffle so the split -- and every metric reported
# below -- is reproducible after Restart Kernel -> Run All.
# stratify keeps the per-class proportions identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    mnist_data,
    mnist_class,
    train_size=0.75,
    test_size=0.25,
    random_state=42,
    stratify=mnist_class,
)

In [10]:
# setup parameters for xgboost
param = {
    # use softmax multi-class classification
    'objective': 'multi:softmax',
    # learning rate: shrinkage applied to each boosting step
    'eta': 0.1,
    # maximum depth of each tree
    'max_depth': 6,
    'silent': 1,
    'nthread': 4,
    # number of target classes
    'num_class': 10,
    # Metric reported for each watchlist entry during training.
    # The previous version assigned "mlogloss" and then immediately overwrote
    # it with "merror", so only "merror" was ever in effect; the dead
    # assignment has been dropped to make that explicit.
    'eval_metric': 'merror',
}

In [11]:
# Sanity check: the number of training labels should equal the number of training rows.
y_train.size, X_train.shape

(31500, (31500, 784))

In [12]:
# Wrap the training features and labels in an XGBoost DMatrix.
xg_train = xgb.DMatrix(data=X_train, label=y_train)

In [13]:
# Wrap the held-out features and labels in an XGBoost DMatrix.
xg_test = xgb.DMatrix(data=X_test, label=y_test)

In [14]:
# Number of boosting rounds to run.
num_round = 150
# Datasets evaluated after every round; the names become the log prefixes.
watchlist = [
    (xg_train, 'train'),
    (xg_test, 'test'),
]
# Train the booster, logging the eval metric for each watchlist entry.
bst = xgb.train(
    params=param,
    dtrain=xg_train,
    num_boost_round=num_round,
    evals=watchlist,
)

[0]	train-merror:0.133175	test-merror:0.159048
[1]	train-merror:0.105937	test-merror:0.126571
[2]	train-merror:0.088413	test-merror:0.112
[3]	train-merror:0.080317	test-merror:0.102571
[4]	train-merror:0.074381	test-merror:0.09619
[5]	train-merror:0.069968	test-merror:0.093524
[6]	train-merror:0.066571	test-merror:0.09019
[7]	train-merror:0.063016	test-merror:0.08819
[8]	train-merror:0.060476	test-merror:0.085143
[9]	train-merror:0.057524	test-merror:0.082571
[10]	train-merror:0.054889	test-merror:0.081048
[11]	train-merror:0.052762	test-merror:0.078476
[12]	train-merror:0.050476	test-merror:0.077238
[13]	train-merror:0.048984	test-merror:0.075714
[14]	train-merror:0.047587	test-merror:0.073048
[15]	train-merror:0.045429	test-merror:0.07181
[16]	train-merror:0.044032	test-merror:0.070571
[17]	train-merror:0.041905	test-merror:0.069143
[18]	train-merror:0.040825	test-merror:0.068095
[19]	train-merror:0.039302	test-merror:0.067429
[20]	train-merror:0.037778	test-merror:0.066286
[21]	trai

In [15]:
# Load the unlabelled rows to predict on (pixel columns only, no 'label' column).
mnist_submit = pd.read_csv('dataset/test.csv')
# Preview the first five rows to confirm the column layout.
mnist_submit.head(5)

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Raw NumPy feature matrix for the submission rows.
mnist_submit_data = mnist_submit.values

In [17]:
# Guard against a schema mismatch before predicting: the submission matrix must
# have exactly as many feature columns as the training matrix.
# (This cell previously evaluated `mnist_submit_data.shape` as a non-final
# expression, so its value was silently discarded and nothing was checked.)
assert mnist_submit_data.shape[1] == mnist_data.shape[1], (
    "feature count mismatch: submission has %d columns, training data has %d"
    % (mnist_submit_data.shape[1], mnist_data.shape[1])
)
# Wrap the submission features in a DMatrix (no labels are available).
dtest = xgb.DMatrix(mnist_submit_data)

In [18]:
# Predict one value per submission row with the trained booster.
submission_pred = bst.predict(dtest)

In [19]:
# Expect a 1-D array with one prediction per submission row.
submission_pred.shape

(28000,)

In [20]:
id = mnist_submit.index.values + 1


In [21]:
final = pd.DataFrame({'ImageId': id, 'Label': submission_pred.astype(int)})

In [22]:
# Preview the first five submission rows.
final.head(5)

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,7
4,5,3


In [23]:
# Expect (rows, 2): one ImageId/Label pair per submission row.
final.shape

(28000, 2)

In [24]:
# Write the submission table to disk without the DataFrame index column.
final.to_csv(
    'dataset/submission.csv',
    index=False,
)

In [25]:
# Cross-check: the row count here should match the number of predictions.
mnist_submit.shape

(28000, 784)

In [26]:
# Peek at the first ten raw predictions (before the cast to int).
submission_pred[:10]

array([ 2.,  0.,  9.,  7.,  3.,  7.,  0.,  3.,  0.,  3.], dtype=float32)

In [27]:
# Plot the feature importances of the trained booster.
# The trailing semicolon stops the notebook from echoing the returned
# matplotlib Axes repr underneath the figure.
xgb.plot_importance(bst);

<matplotlib.axes._subplots.AxesSubplot at 0x7fbb88d15c50>

In [29]:
# Save the trained booster to 'xgboost-mnist-v1.3' (path is relative to the working directory).
bst.save_model('xgboost-mnist-v1.3')