# Imports e Dados

In [33]:
import copy
import joblib
import pandas as pd
import sklearn

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier


In [2]:
train_data = pd.read_csv('data/original/train.csv')
train_labels = train_data['label']
train_data = train_data.drop('label', axis=1)

test_data = pd.read_csv('data/original/test.csv')

In [3]:
ids = range(1, 28001)

# Basic Exploration

In [4]:
train_data.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
train_data.columns

Index(['pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6',
       'pixel7', 'pixel8', 'pixel9',
       ...
       'pixel774', 'pixel775', 'pixel776', 'pixel777', 'pixel778', 'pixel779',
       'pixel780', 'pixel781', 'pixel782', 'pixel783'],
      dtype='object', length=784)

In [6]:
train_data.shape

(42000, 784)

In [7]:
train_data.iloc[0].max()

255

# Preprocessing

In [8]:
from sklearn.decomposition import PCA

In [9]:
pca = PCA(n_components = 0.95)

In [10]:
X_train = train_data/255
X_test = test_data/255

In [11]:
pca.fit(X_train)

In [12]:
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

In [13]:
print("train: ", X_train.shape)
print("test: ", X_test.shape)

train:  (42000, 154)
test:  (28000, 154)


# Training the Model

In [28]:
grid = {
 'max_depth': [22, 24, 26], # 'max_depth': [16, 18, 20], 
 'n_estimators': [450, 500, 550], # 'n_estimators': [300, 350, 400],
} 

In [29]:
my_scorer = make_scorer(f1_score, greater_is_better=True, average='micro')
model = GridSearchCV(estimator = RandomForestClassifier(), param_grid = grid, cv = 3, verbose=2, scoring=my_scorer)

In [30]:
model.fit(X_train, train_labels)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] END .....................max_depth=22, n_estimators=450; total time= 2.6min
[CV] END .....................max_depth=22, n_estimators=450; total time= 2.6min
[CV] END .....................max_depth=22, n_estimators=450; total time= 2.6min
[CV] END .....................max_depth=22, n_estimators=500; total time= 2.9min
[CV] END .....................max_depth=22, n_estimators=500; total time= 2.8min
[CV] END .....................max_depth=22, n_estimators=500; total time= 2.8min
[CV] END .....................max_depth=22, n_estimators=550; total time= 3.1min
[CV] END .....................max_depth=22, n_estimators=550; total time= 3.1min
[CV] END .....................max_depth=22, n_estimators=550; total time= 3.1min
[CV] END .....................max_depth=24, n_estimators=450; total time= 2.6min
[CV] END .....................max_depth=24, n_estimators=450; total time= 2.6min
[CV] END .....................max_depth=24, n_est

In [31]:
print(model.best_params_)
print(model.best_estimator_.score(X_train, train_labels))
model.cv_results_

{'max_depth': 24, 'n_estimators': 550}
0.9999761904761905


{'mean_fit_time': array([153.33054757, 169.99086595, 186.24424394, 153.67504621,
        170.41137258, 187.94113008, 158.50898337, 171.69078294,
        189.68114487]),
 'std_fit_time': array([0.7518508 , 0.44926074, 0.73827197, 0.78291821, 1.18216198,
        0.52705438, 5.98792912, 0.18415808, 0.16163613]),
 'mean_score_time': array([0.85815398, 0.95734986, 1.04184651, 0.85560314, 0.94776615,
        1.03718368, 0.84466449, 0.93481223, 1.03914197]),
 'std_score_time': array([0.00591048, 0.00158888, 0.00951588, 0.0103229 , 0.00484483,
        0.01012818, 0.00400043, 0.00082753, 0.00863441]),
 'param_max_depth': masked_array(data=[22, 22, 22, 24, 24, 24, 26, 26, 26],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[450, 500, 550, 450, 500, 550, 450, 500, 550],
              mask=[False, False, False, False, False, False, False, False,
    

In [171]:
rfc = RandomForestClassifier(max_depth=24, n_estimators=550)
rfc.fit(X_train, train_labels)

In [35]:
predictions = rfc.predict(X_test)
predictions

array([2, 0, 9, ..., 3, 9, 2])

# Saving Results

In [46]:
joblib.dump(model, '/home/hyades/grid.pkl')
joblib.dump(rfc, '/home/hyades/model.pkl')

['/home/hyades/model.pkl']

In [36]:
results = {
    'ImageId': ids,
    'Label': predictions,
}
submit = pd.DataFrame(results)
submit

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,4
4,5,3
...,...,...
27995,27996,9
27996,27997,7
27997,27998,3
27998,27999,9


In [37]:
submit.to_csv('submit.csv', index=False)

# Tested Params
- {'max_depth': 20, 'n_estimators': 400, 'f1-score': 0.9999523809523809} #'max_depth': [16, 18, 20] - 'n_estimators': [300, 350, 400]
- {'max_depth': 24, 'n_estimators': 550, 'f1-score': 0.9999761904761905} #'max_depth': [22, 24, 26] - 'n_estimators': [450, 500, 550]