## Описание датасета
База данных о детях (их семьях, родителях и т.д.) для ранжирования заявок в детские сады.
Входные данные:

Parents

Has Nurse

Form

Children

Housing

Finance

Social

Health

Выходной параметр:

CLASS

In [14]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
%matplotlib inline

In [15]:
# Load the nursery table. The file is read without a header row, so the nine
# column names (eight input features followed by the CLASS target) are
# supplied explicitly.
data = pd.read_csv(
    'nursery.csv',
    sep=',',
    header=None,
    names=[
        'Parents', 'Has Nurse', 'Form', 'Children',
        'Housing', 'Finance', 'Social', 'Health',
        'CLASS',
    ],
)

In [16]:
# Preview the first five rows to check that column names line up with values.
data.head()

Unnamed: 0,Parents,Has Nurse,Form,Children,Housing,Finance,Social,Health,CLASS
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority


Выше - начало таблицы с данными, ниже - конец.

In [17]:
# Preview the last five rows of the table.
data.tail()

Unnamed: 0,Parents,Has Nurse,Form,Children,Housing,Finance,Social,Health,CLASS
12955,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,priority,spec_prior
12956,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,not_recom,not_recom
12957,great_pret,very_crit,foster,more,critical,inconv,problematic,recommended,spec_prior
12958,great_pret,very_crit,foster,more,critical,inconv,problematic,priority,spec_prior
12959,great_pret,very_crit,foster,more,critical,inconv,problematic,not_recom,not_recom


In [18]:
# Dimensions of the loaded table as (rows, columns).
print(data.shape)

(12960, 9)


In [19]:
# Print the index range, the non-null count and dtype of every column, and
# the memory footprint of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12960 entries, 0 to 12959
Data columns (total 9 columns):
Parents      12960 non-null object
Has Nurse    12960 non-null object
Form         12960 non-null object
Children     12960 non-null object
Housing      12960 non-null object
Finance      12960 non-null object
Social       12960 non-null object
Health       12960 non-null object
CLASS        12960 non-null object
dtypes: object(9)
memory usage: 1012.5+ KB


Статистические характеристики. Так как все признаки категориальные, привычных mean, std и т.д. мы не увидим: вместо них выводятся unique, top и freq. Пустых ячеек в датасете нет, поэтому значение count во всех столбцах одинаково.

In [20]:
# Summary statistics; every column here is of dtype object, so the summary
# consists of count / unique / top / freq.
data.describe()

Unnamed: 0,Parents,Has Nurse,Form,Children,Housing,Finance,Social,Health,CLASS
count,12960,12960,12960,12960,12960,12960,12960,12960,12960
unique,3,5,4,4,3,2,3,3,5
top,usual,improper,incomplete,1,convenient,convenient,problematic,not_recom,not_recom
freq,4320,2592,3240,3240,4320,6480,4320,4320,4320


In [21]:
# Encode the target: every CLASS label is mapped to an integer code 1..5.
classes = {
    'not_recom': 1,
    'recommend': 2,
    'very_recom': 3,
    'priority': 4,
    'spec_prior': 5,
}
# Plain dict indexing is deliberate: a label missing from the mapping raises
# KeyError instead of silently producing a missing value.
y = [classes[label] for label in data['CLASS'].tolist()]
# The target now lives in `y`; keep only the feature columns in `data`.
data = data.drop(['CLASS'], axis=1)

Векторизация. Отдельно выделим бинарные признаки (binary_columns) и небинарные (nonbinary_columns).

In [22]:
# Group the feature columns by dtype: 'object' columns are treated as
# categorical, everything else as numerical.
categorical_columns = [
    name for name, dtype in data.dtypes.items() if dtype.name == 'object'
]
print(categorical_columns)
numerical_columns = [
    name for name, dtype in data.dtypes.items() if dtype.name != 'object'
]
print(numerical_columns)  # expected to be empty at this point

# The 'unique' row of the summary holds the number of distinct values per
# column; use it to separate two-valued (binary) features from the rest.
data_describe = data.describe(include=[object])
binary_columns = [
    name for name in categorical_columns
    if data_describe.loc['unique', name] == 2
]
nonbinary_columns = [
    name for name in categorical_columns
    if data_describe.loc['unique', name] > 2
]
print(binary_columns, nonbinary_columns)

['Parents', 'Has Nurse', 'Form', 'Children', 'Housing', 'Finance', 'Social', 'Health']
[]
['Finance'] ['Parents', 'Has Nurse', 'Form', 'Children', 'Housing', 'Social', 'Health']


In [23]:
# One-hot encode every categorical column. The binary columns are passed
# through get_dummies as well, so each of them also yields two indicator
# columns.
new_data = pd.get_dummies(data[categorical_columns])
# Swap the original categorical columns for their indicator columns; the
# join is on the shared row index.
data = data.drop(categorical_columns, axis=1).join(new_data)
data.head()

Unnamed: 0,Parents_great_pret,Parents_pretentious,Parents_usual,Has Nurse_critical,Has Nurse_improper,Has Nurse_less_proper,Has Nurse_proper,Has Nurse_very_crit,Form_complete,Form_completed,...,Housing_critical,Housing_less_conv,Finance_convenient,Finance_inconv,Social_nonprob,Social_problematic,Social_slightly_prob,Health_not_recom,Health_priority,Health_recommended
0,0,0,1,0,0,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,1
1,0,0,1,0,0,0,1,0,1,0,...,0,0,1,0,1,0,0,0,1,0
2,0,0,1,0,0,0,1,0,1,0,...,0,0,1,0,1,0,0,1,0,0
3,0,0,1,0,0,0,1,0,1,0,...,0,0,1,0,0,0,1,0,0,1
4,0,0,1,0,0,0,1,0,1,0,...,0,0,1,0,0,0,1,0,1,0


In [24]:
# Standardise every non-object column: subtract the column mean and divide
# by the column standard deviation.
# NOTE(review): the kNN section that follows assigns `x = data`, so this
# standardised frame is not what the classifier is trained on — confirm
# whether `data_numerical` was meant to be used there.
numerical_columns = [
    name for name, dtype in data.dtypes.items() if dtype.name != 'object'
]
data_numerical = data[numerical_columns]
data_numerical = (
    data_numerical
    .sub(data_numerical.mean(axis=0), axis='columns')
    .div(data_numerical.std(axis=0), axis='columns')
)

## Метод k ближайших соседей

In [25]:
from sklearn.model_selection import train_test_split

# Feature matrix is the encoded frame; `y` holds the integer class codes.
x = data

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=33
)

size_train = x_train.shape[0]
size_test = x_test.shape[0]

# Printed order is test size first, then train size.
print('ololo: ', size_test, size_train)

ololo:  3888 9072


In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# --- Baseline: a single nearest neighbour --------------------------------
k = 1
knn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)

y_train_predict = knn.predict(x_train)
y_test_predict = knn.predict(x_test)

# Error rate = share of samples whose predicted class differs from the truth.
err_train = np.not_equal(y_train, y_train_predict).mean()
err_test = np.not_equal(y_test, y_test_predict).mean()

print('errors: ', err_train, err_test)

# --- Choose n_neighbors by 4-fold cross-validation on the training part ---
# NOTE(review): the recorded run selected k = 7, the upper edge of this
# grid, so a wider range may be worth trying.
k = range(1, 8)
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid={'n_neighbors': k}, cv=4)
grid.fit(x_train, y_train)

# best_score_ is the best mean CV score, so 1 - score is the CV error.
best_cv_err = 1 - grid.best_score_
best_k = grid.best_params_['n_neighbors']
print(best_cv_err, best_k)

# --- Refit with the selected k and report train / test error -------------
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(x_train, y_train)

err_train = np.not_equal(y_train, knn.predict(x_train)).mean()
err_test = np.not_equal(y_test, knn.predict(x_test)).mean()

print(err_train, err_test)

errors:  0.0 0.188014403292
0.0486111111111 7
0.0259038800705 0.0434670781893


