## DecisionTreeClassifier

In [1]:
from sklearn.datasets import load_digits
# импортируем набор данных с рукописными цифрами и размечаем его на признаковое 
# описание Х и целевую переменную у
data = load_digits()
X, y = data.data, data.target

# X[0,:].reshape([8,8])

In [2]:
# матрица пикселей
X[0, : ].reshape([8,8])

array([[ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.],
       [ 0.,  0., 13., 15., 10., 15.,  5.,  0.],
       [ 0.,  3., 15.,  2.,  0., 11.,  8.,  0.],
       [ 0.,  4., 12.,  0.,  0.,  8.,  8.,  0.],
       [ 0.,  5.,  8.,  0.,  0.,  9.,  8.,  0.],
       [ 0.,  4., 11.,  0.,  1., 12.,  7.,  0.],
       [ 0.,  2., 14.,  5., 10., 12.,  0.,  0.],
       [ 0.,  0.,  6., 13., 10.,  0.,  0.,  0.]])

Для того чтобы воспользоваться "train_test_split", необходимо данные data сначала разделить на матрицу признаков X и целевую переменную y. Затем мы импортируем метод из sklearn.model_selection и непосредственно используем его:

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.3,
random_state=17)

In [4]:
# импортируем из библиотеки алгоритм решающего дерева
from sklearn.tree import DecisionTreeClassifier

In [5]:
# импортируем из библиотеки метрику качества "аккуратность"
from sklearn.metrics import accuracy_score

In [6]:
tree = DecisionTreeClassifier(random_state = 12)

In [7]:
tree_params = {'max_depth' : range(1,11), 
              'max_features' : range(5, 20)}
from sklearn.model_selection import GridSearchCV
tree_grid = GridSearchCV(tree,tree_params, cv = 5, n_jobs = -1)

In [8]:
tree_grid.fit(X, y)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=12), n_jobs=-1,
             param_grid={'max_depth': range(1, 11),
                         'max_features': range(5, 20)})

In [9]:
tree_grid.best_params_, tree_grid.best_score_

({'max_depth': 10, 'max_features': 15}, 0.787440420922315)

In [10]:
tree = DecisionTreeClassifier(max_depth = 10, max_features = 15, random_state = 12)

In [11]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=10, max_features=15, random_state=12)

In [12]:
pred = tree.predict(X_holdout)

In [13]:
accuracy_score(y_holdout, pred)

0.8407407407407408

In [14]:
# accuracy_score(y_holdout, pred) 0.8407407407407408

## kNN

In [15]:
# импотируем из библиотеки алгоритм ближайших соседей
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

In [16]:
knn_params = {'n_neighbors' : range(1, 5),
             'metric' : ["minkowski", "manhattan", "euclidean", "chebyshev"]}
knn_grid = GridSearchCV(knn, knn_params, cv = 3, n_jobs = -1)

In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
x_train_scaled = scaler.transform(X_train)

scaler = StandardScaler().fit(X_holdout)
x_test_scaled = scaler.transform(X_holdout)

In [18]:
knn_grid.fit(x_train_scaled, y_train)

GridSearchCV(cv=3, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'metric': ['minkowski', 'manhattan', 'euclidean',
                                    'chebyshev'],
                         'n_neighbors': range(1, 5)})

In [19]:
knn_grid.best_params_, knn_grid.best_score_

({'metric': 'minkowski', 'n_neighbors': 3}, 0.9665871121718377)

In [20]:
knn = KNeighborsClassifier(metric='minkowski', n_neighbors = 3)

In [21]:
knn.fit(x_train_scaled, y_train)

KNeighborsClassifier(n_neighbors=3)

In [22]:
pred = knn_grid.predict(x_test_scaled)

In [23]:
accuracy_score(y_holdout, pred)

0.9740740740740741