In [33]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.utils import shuffle
import time

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

### Data extraction using pandas

In [34]:
# Read the training feature table from disk into a DataFrame.
X_train = pd.read_csv('train_data.csv')
# Preview the first ten rows as the cell's rich output.
X_train.head(n=10)

Unnamed: 0,m_label,strength,italic,m_top,m_left,originalH,originalW,r0c0,r0c1,r0c2,...,r19c10,r19c11,r19c12,r19c13,r19c14,r19c15,r19c16,r19c17,r19c18,r19c19
0,8710,0.4,1,35,21,46,39,1,1,1,...,255,255,255,255,255,255,255,255,255,182
1,99,0.4,0,48,23,33,25,1,1,1,...,255,255,225,101,41,1,1,1,1,1
2,835,0.4,0,33,16,13,8,1,1,128,...,1,1,1,1,1,1,1,1,1,1
3,48,0.4,0,0,0,18,11,27,31,49,...,249,247,244,236,212,184,127,94,41,29
4,3653,0.4,0,46,21,50,23,1,1,1,...,1,1,1,1,56,255,255,255,255,255
5,697,0.4,0,37,25,16,7,255,255,255,...,255,255,255,255,73,1,1,1,1,1
6,52,0.4,0,0,0,13,11,40,40,40,...,222,218,222,233,240,249,240,236,157,139
7,195,0.4,1,34,21,60,58,1,1,1,...,1,1,150,255,255,255,255,238,1,1
8,7534,0.4,0,37,19,47,26,1,1,1,...,255,255,229,184,1,1,1,1,1,1
9,653,0.7,1,53,20,31,43,1,1,1,...,255,161,1,1,1,1,1,202,255,255


In [35]:
# Read the training labels from disk into a DataFrame.
y_train = pd.read_csv('train_labels.csv')
# Preview the first ten labels as the cell's rich output.
y_train.head(n=10)

Unnamed: 0,Font
0,ARIAL
1,TIMES
2,TIMES
3,ARIAL
4,SERIF
5,CAMBRIA
6,ARIAL
7,ARIAL
8,CAMBRIA
9,CALIBRI


In [36]:
# Read the test feature table from disk into a DataFrame.
X_test = pd.read_csv('test_data.csv')
# Preview the first ten rows as the cell's rich output.
X_test.head(n=10)

Unnamed: 0,m_label,strength,italic,m_top,m_left,originalH,originalW,r0c0,r0c1,r0c2,...,r19c10,r19c11,r19c12,r19c13,r19c14,r19c15,r19c16,r19c17,r19c18,r19c19
0,49,0.4,0,0,0,36,36,1,1,1,...,114,240,255,255,255,255,255,224,114,51
1,8453,0.7,1,33,25,50,55,1,1,52,...,1,1,14,107,154,154,89,24,1,1
2,64421,0.7,1,32,31,55,39,1,1,1,...,1,1,49,132,222,255,255,195,1,1
3,1704,0.7,0,29,24,70,58,1,1,1,...,255,235,183,117,47,7,1,1,1,1
4,1102,0.4,0,49,24,39,45,125,125,83,...,125,241,255,255,212,125,1,1,1,1
5,598,0.7,0,39,22,57,35,1,1,1,...,1,1,1,22,140,217,255,255,255,204
6,722,0.7,1,64,28,15,7,255,255,255,...,1,1,1,1,1,1,1,1,1,1
7,57,0.4,0,0,0,14,8,1,1,8,...,1,1,1,1,1,20,20,10,1,1
8,65236,0.4,0,31,18,39,20,1,1,1,...,255,255,255,255,255,255,255,255,255,255
9,7811,0.4,0,35,20,46,47,1,1,1,...,1,1,1,56,227,11,1,1,1,1


In [37]:
# Show the dimensions of the test DataFrame as a (rows, columns) tuple.
X_test.shape

(29221, 407)

In [38]:
# Rebind each table to a NumPy array copy of itself; the names stay the same
# so every later cell keeps working with X_train / y_train / X_test.
X_train, y_train, X_test = (
    np.array(table) for table in (X_train, y_train, X_test)
)

# Print the training features, the training labels, and the test shape,
# one item per line.
print(X_train, y_train, X_test.shape, sep="\n")

[[8.710e+03 4.000e-01 1.000e+00 ... 2.550e+02 2.550e+02 1.820e+02]
 [9.900e+01 4.000e-01 0.000e+00 ... 1.000e+00 1.000e+00 1.000e+00]
 [8.350e+02 4.000e-01 0.000e+00 ... 1.000e+00 1.000e+00 1.000e+00]
 ...
 [7.761e+03 7.000e-01 0.000e+00 ... 1.000e+00 1.000e+00 1.000e+00]
 [3.617e+03 7.000e-01 0.000e+00 ... 1.250e+02 1.250e+02 1.250e+02]
 [1.784e+03 4.000e-01 0.000e+00 ... 2.390e+02 2.550e+02 1.750e+02]]
[['ARIAL']
 ['TIMES']
 ['TIMES']
 ...
 ['SERIF']
 ['TAHOMA']
 ['TAHOMA']]
(29221, 407)


In [39]:
# Report the dimensions of the training features, then of the training labels.
print(X_train.shape, y_train.shape, sep="\n")

(65000, 407)
(65000, 1)


### Data augmentation by adding shifted images of the original training set
The first 65000 entries of the augmented training set are the original training set.
The last 65000 entries of the augmented training set are the shifted training set.
Each image is shifted up by one row: row 0 (the first row) is moved to become row 19 (the last row), and every other row moves up by one index (row 1 becomes row 0, and so on). This wrap-around shift preserves most of the shape of each character.

In [40]:
# Shifting
X_train_temp = (np.hstack((X_train,X_train[:,7:27])))
print(X_train_temp.shape)

X_train_shifted = np.delete(X_train_temp,[7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26],1)
print(X_train_shifted.shape)

# Augmenting
X_train_augmented = np.vstack((X_train, X_train_shifted))
y_train_augmented = np.vstack((y_train, y_train))
X_train_augmented.shape, y_train_augmented.shape

(65000, 427)
(65000, 407)


((130000, 407), (130000, 1))

### Use XGBoost classifier

In [49]:
# Fit an XGBoost classifier on the augmented training set, predict the test
# set, and report both timings plus the accuracy on the original training rows.
# NOTE(review): fit() receives the raw label strings; recent XGBoost releases
# may require integer-encoded class labels -- TODO confirm against the
# installed version before re-running this multi-hour cell.

# perf_counter() is monotonic, so a system clock adjustment during the long
# fit cannot distort the measured duration.
start_time = time.perf_counter()
clf_xgb = XGBClassifier(n_estimators=2000, max_depth=9, objective="multi:softprob", random_state=42, learning_rate=0.2)
clf_xgb.fit(X_train_augmented, y_train_augmented.ravel())
train_time = time.perf_counter() - start_time
print("Training time %.3f seconds" % train_time)

# Time the test-set prediction on its own and report it immediately, before
# any other prediction work runs.
start_time = time.perf_counter()
y_pred_xgb_d9_n2000_lr2_aug = clf_xgb.predict(X_test)
test_time = time.perf_counter() - start_time
print("Test time %.3f seconds" % test_time)

# These rows are part of the data the model was fitted on, so this is a
# training-fit figure, not an estimate of performance on unseen data.
# accuracy_score takes (y_true, y_pred); the (n, 1) label column is flattened
# so both arguments are 1-D.
y_pred_train = clf_xgb.predict(X_train)
print("Train accuracy: ", accuracy_score(y_train.ravel(), y_pred_train))

Training time 47818.202 seconds
Test time 33.891 seconds
Train accuracy:  0.9954153846153846
