In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

In [6]:
# Read the training table from the relative CSV path into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../mnist/data/train.csv')

In [7]:
# Preview the first ten rows (a label column followed by the pixel columns).
df.head(n=10)

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(42000, 785)

In [9]:
# Target vector: the 'label' column of the training table.
y = df.loc[:, 'label']

In [10]:
# Feature matrix: every column except the target.
X = df.drop(columns='label')

In [11]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

# DecisionTreeClassifier

In [35]:
from sklearn.tree import DecisionTreeClassifier

In [40]:
%%time # 46.3 s
params_grid = {'max_depth' : list(range(10, 16))}
tree_grid = GridSearchCV(DecisionTreeClassifier(criterion='entropy'), n_jobs=-1, param_grid=params_grid)

tree_grid.fit(X_train, y_train)

print(tree_grid.best_params_, tree_grid.best_score_, tree_grid.best_estimator_)
# {'max_depth': 13} 0.8480272108843537

print(cross_val_score(tree_grid.best_estimator_, X_test, y_test, cv=5))
# [0.82362267 0.82434576 0.81078937 0.82168388 0.82034976]

{'max_depth': 13} 0.8480272108843537 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=13,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
[0.82362267 0.82434576 0.81078937 0.82168388 0.82034976]
Wall time: 46.3 s


# KNeighborsClassifier

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [17]:
%%time # 2min 38s
# 4nn - 0.9637301587301588
# 5nn - 0.9654761904761905
# 6nn - 0.9637301587301588
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=6)
knn.fit(X_train, y_train)
print(accuracy_score(y_test, knn.predict(X_test)))

0.9637301587301588
Wall time: 2min 38s


In [None]:
# Too slow to wait for: the number of fits grows with the number of neighbour
# candidates times the number of grid-search cross-validation repeats.

# Scale the features, then classify with k-nearest neighbours.
knn_pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_jobs=-1))])

knn_params = {'knn__n_neighbors': range(4, 7)}

knn_grid = GridSearchCV(knn_pipe, knn_params, cv=2, n_jobs=-1, verbose=True)

knn_grid.fit(X_train, y_train)

# Printed explicitly: a bare expression in the middle of a cell is never
# displayed, only the last expression of a cell is.
print(knn_grid.best_params_, knn_grid.best_score_)

print(accuracy_score(y_test, knn_grid.predict(X_test)))

# RandomForest

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
%%time # 847 ms

forest = RandomForestClassifier(n_jobs=-1)

forest.fit(X_train, y_train)

print(accuracy_score(y_test, forest.predict(X_test))) # 0.9338888888888889

0.9338888888888889
Wall time: 847 ms


In [26]:
%%time  # 4min 56s
forest_params = {'n_estimators' : list(np.arange(50, 160, 10)), 'criterion' : ['gini', 'entropy']}

forest_grid = GridSearchCV(RandomForestClassifier(), n_jobs=-1, param_grid=forest_params)

forest_grid.fit(X_train, y_train)

print(forest_grid.best_params_, forest_grid.best_score_, forest_grid.best_estimator_)
# {'criterion': 'gini', 'n_estimators': 150} 0.9595578231292518

{'criterion': 'gini', 'n_estimators': 150} 0.9595578231292518 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Wall time: 4min 56s


In [25]:
# Five-fold cross-validation of the best forest configuration on the held-out split.
cross_val_score(
    estimator=forest_grid.best_estimator_,
    X=X_test,
    y=y_test,
    cv=5,
)
# [0.95005945, 0.94964314, 0.95200317, 0.94479746, 0.94594595]

array([0.95005945, 0.94964314, 0.95200317, 0.94479746, 0.94594595])

# Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

In [31]:
%%time  # 22 minutes
linreg = LogisticRegression(n_jobs=-1)

linreg.fit(X_train, y_train)

accuracy_score(y_test, linreg.predict(X_test))    # 0.8998412698412699

0.8998412698412699

In [33]:
# Too slow to wait for: a single fit already takes about 22 minutes.

# Candidate grid: ten log-spaced C values from 1e0 to 1e4, for both penalty types.
# NOTE(review): whether 'l1' is accepted depends on the default solver of the
# installed scikit-learn version — confirm before running.
linreg_params = {
    'C': list(np.logspace(0, 4, 10)),
    'penalty': ['l1', 'l2'],
}

linreg_grid = GridSearchCV(
    LogisticRegression(n_jobs=-1),
    param_grid=linreg_params,
)

linreg_grid.fit(X_train, y_train)

print(
    linreg_grid.best_params_,
    linreg_grid.best_score_,
    linreg_grid.best_estimator_,
)

# Cross-validation can be run afterwards to refine the estimate.
cross_val_score(linreg_grid.best_estimator_, X_test, y_test, cv=5)


hi


# Gradient Boosting 
### `loss='deviance'` — the same log-loss objective as logistic regression

In [57]:
from sklearn.ensemble import GradientBoostingClassifier

In [58]:
%%time # 10min 29s

gbm = GradientBoostingClassifier(loss='deviance') #  

gbm.fit(X_train, y_train)

print(accuracy_score(y_test, gbm.predict(X_test))) # 0.9414285714285714

0.9414285714285714
Wall time: 10min 29s


In [None]:
# Waiting for a grid search over gradient boosting would also take very long, so it is not covered here.