In [1]:
import os
import urllib.request
import pandas as p
import sklearn.cross_validation as val
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import scale

In [2]:
def print_answer_to_file(question_number, *answers):
    """Write the answers to "<question_number>.txt" in the working directory.

    Args:
        question_number: Identifier used as the file name stem; converted
            with ``str``.
        *answers: Values to store. Each one is converted with ``str`` and
            the results are joined by single spaces.

    The file is opened in write mode (an existing file is replaced) and no
    trailing newline is written.
    """
    target_name = "{!s}.txt".format(question_number)
    with open(target_name, "w") as out:
        # Stringify inside the `with` so the file is opened before any
        # answer is converted, exactly as a print-to-file call would do.
        out.write(" ".join(map(str, answers)))

In [3]:
# Name of the classifier hyper-parameter being tuned; used both as the key of
# the parameter grid and to look up the selected value in `best_params_`.
NUMBER_OF_NEIGHBOURS_PARAM_NAME = "n_neighbors"

#### Загрузите выборку Wine 

In [4]:
# Directory (relative to the working directory) where the dataset is stored.
folder = "data"
# Remote location of the Wine data file in the UCI repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"

In [5]:
# Create the data directory if it is missing. `exist_ok=True` makes the cell
# safe to re-run and avoids the check-then-create race of exists() + mkdir();
# makedirs also copes with a nested `folder` path.
os.makedirs(folder, exist_ok=True)

In [6]:
# Local file path: the data folder plus the last segment of the URL.
path = os.path.join(folder, url.rsplit("/", 1)[-1])

In [7]:
# Download the dataset to `path`. The call is unconditional, so every run of
# this cell goes to the network again; the returned (path, headers) tuple is
# the cell output.
urllib.request.urlretrieve(url, path)

('data/wine.data', <http.client.HTTPMessage at 0x10a4dc940>)

#### Извлеките из данных признаки и классы.

In [8]:
# The file has no header row, so columns get integer labels starting at 0.
data = p.read_csv(path, header=None)

In [9]:
# Preview the first rows to check that the file was parsed as expected.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [10]:
# Column 0 becomes the label frame, columns 1..13 the feature frame.
# Selecting with lists (not scalars) keeps both results as DataFrames.
y = data.loc[:, [0]]
X = data.loc[:, list(range(1, 14))]

In [11]:
# Readable names, assigned positionally to the 13 feature columns.
X.columns = [
    "Alcohol",
    "Malic acid",
    "Ash",
    "Alcalinity of ash",
    "Magnesium",
    "Total phenols",
    "Flavanoids",
    "Nonflavanoid phenols",
    "Proanthocyanins",
    "Color intensity",
    "Hue",
    "OD280/OD315 of diluted wines",
    "Proline",
]
# The single label column.
y.columns = ["Class"]

In [12]:
# Preview the feature frame after renaming its columns.
X.head()

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


#### Оценку качества необходимо провести методом кросс-валидации по 5 блокам

In [13]:
# Five shuffled folds over all rows of X; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): `sklearn.cross_validation` is the legacy CV module — confirm
# the installed scikit-learn version still provides it before re-running.
folds_gen = val.KFold(
    n=len(X),
    n_folds=5,
    shuffle=True,
    random_state=42,
)

#### Найдите точность классификации на кросс-валидации

In [14]:
# k-nearest-neighbours classifier with default settings; the grid search
# below varies its number of neighbours.
classifier = KNeighborsClassifier()

In [15]:
# Search grid: candidate neighbour counts 1 through 50.
params = {
    NUMBER_OF_NEIGHBOURS_PARAM_NAME: list(range(1, 51)),
}

In [16]:
# Grid search over `params`, evaluating each candidate on the folds in
# `folds_gen`.
grid = GridSearchCV(estimator=classifier, param_grid=params, cv=folds_gen)

In [17]:
# Run the search on the unscaled features; the fitted search object returned
# by fit() is the cell output.
grid.fit(X, y["Class"])

GridSearchCV(cv=sklearn.cross_validation.KFold(n=178, n_folds=5, shuffle=True, random_state=42),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [18]:
# Parameter setting selected by the search on the unscaled features.
best_params = grid.best_params_
print(best_params)
# Answer 1: the selected number of neighbours, written to "1.txt".
print_answer_to_file(1, best_params[NUMBER_OF_NEIGHBOURS_PARAM_NAME])

{'n_neighbors': 1}


In [19]:
# Best cross-validation score reached by the search on the unscaled features.
best_score = grid.best_score_
print(best_score)
# Answer 2: that score rounded to two decimals, written to "2.txt".
print_answer_to_file(2, round(best_score, 2))

0.730337078652


#### Произведите масштабирование признаков

In [20]:
# Rescale the features before repeating the grid search.
# NOTE(review): scale() is applied to all rows at once, i.e. before the
# cross-validation split, so held-out folds also influence the scaling —
# confirm this is acceptable for the exercise.
scaled_X = scale(X)

In [21]:
# Second grid search with the same estimator, grid and folds; it is fitted on
# the scaled features.
scaled_grid = GridSearchCV(estimator=classifier, param_grid=params, cv=folds_gen)

In [22]:
# Run the search on the scaled features; the fitted search object returned by
# fit() is the cell output.
scaled_grid.fit(scaled_X, y["Class"])

GridSearchCV(cv=sklearn.cross_validation.KFold(n=178, n_folds=5, shuffle=True, random_state=42),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

#### Какое значение k получилось оптимальным после приведения признаков к одному масштабу?

In [23]:
# Parameter setting selected by the search on the scaled features.
scaled_best_params = scaled_grid.best_params_
print(scaled_best_params)
# Answer 3: the selected number of neighbours, written to "3.txt".
print_answer_to_file(3, scaled_best_params[NUMBER_OF_NEIGHBOURS_PARAM_NAME])

{'n_neighbors': 29}


In [24]:
# Best cross-validation score reached by the search on the scaled features.
scaled_best_score = scaled_grid.best_score_
print(scaled_best_score)
# Answer 4: that score rounded to two decimals, written to "4.txt".
print_answer_to_file(4, round(scaled_best_score, 2))

0.977528089888
