In [1]:
from sklearn import model_selection, linear_model, metrics
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

# Регрессия

Задача на kaggle: https://www.kaggle.com/c/bike-sharing-demand

По историческим данным о прокате велосипедов и погодным условиям необходимо оценить спрос на прокат велосипедов.

In [2]:
# Load the bike-sharing demand table; the first CSV row holds the column names.
raw_data = pd.read_csv('bike_sharing_demand.csv', sep=',', header=0)
raw_data.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


Предскажем кол-во велосипедов с помощью линейной регрессии

## Препроцессинг

In [3]:
# True if at least one cell of the frame is missing
raw_data.isna().values.any()

False

In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
datetime      10886 non-null object
season        10886 non-null int64
holiday       10886 non-null int64
workingday    10886 non-null int64
weather       10886 non-null int64
temp          10886 non-null float64
atemp         10886 non-null float64
humidity      10886 non-null int64
windspeed     10886 non-null float64
casual        10886 non-null int64
registered    10886 non-null int64
count         10886 non-null int64
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [5]:
# Parse the whole column with one vectorized call instead of invoking
# pd.to_datetime separately for every row via .apply.
raw_data['datetime'] = pd.to_datetime(raw_data['datetime'])

Вместо признака datetime сделаем два признака: month и hour

In [6]:
# Extract calendar features through the vectorized .dt accessor rather than
# a Python lambda called once per row.
raw_data['month'] = raw_data['datetime'].dt.month
raw_data['hour'] = raw_data['datetime'].dt.hour

# The raw timestamp column is dropped once month and hour are extracted.
raw_data = raw_data.drop(columns=['datetime'])

In [7]:
# Preview the first five rows after the datetime column was replaced
raw_data.head(n=5)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month,hour
0,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1,0
1,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1,1
2,1,0,0,1,9.02,13.635,80,0.0,5,27,32,1,2
3,1,0,0,1,9.84,14.395,75,0.0,3,10,13,1,3
4,1,0,0,1,9.84,14.395,75,0.0,0,1,1,1,4


Признаки casual и registered - кол-во незарегистрированных и зарегистрированных пользователей, что в сумме дает целевую метку - кол-во всех пользователей, взявших велосипед напрокат

In [8]:
# Check that casual + registered reproduces the target column on every row
(raw_data['casual'] + raw_data['registered']).eq(raw_data['count']).all()

True

Удалим эти признаки

In [9]:
# casual + registered sum to the target (verified above), so they are removed.
# Rebind the name instead of mutating with inplace=True, which keeps the
# step chainable and avoids silently altering a shared frame object.
raw_data = raw_data.drop(columns=['casual', 'registered'])

Сформируем выборки

In [10]:
#обучающая выборка
train_labels = raw_data.iloc[:-1000, :]['count'].values
train_data = raw_data.iloc[:-1000, :].drop(['count'], axis = 1)

#тестовая выборка
test_labels = raw_data.iloc[-1000:, :]['count'].values
test_data = raw_data.iloc[-1000:, :].drop(['count'], axis = 1)

In [11]:
# Shapes of the full frame and of the train / test feature frames
print(*(frame.shape for frame in (raw_data, train_data, test_data)))

(10886, 11) (9886, 10) (1000, 10)


In [12]:
# Shapes of the train / test target vectors
print(*(labels.shape for labels in (train_labels, test_labels)))

(9886,) (1000,)


## Обучение

In [13]:
#создаем стандартный scaler
scaler = StandardScaler()

#модель - стохастический градиентный спуск
regressor = linear_model.SGDRegressor(random_state = 0)

#создаем pipeline из двух шагов: scaling и регрессия
pipeline = Pipeline(steps = [('scaling', scaler), ('regression', regressor)])

In [14]:
# Fit the untuned pipeline, then report the mean absolute error on the held-out rows
pipeline.fit(train_data, train_labels)
baseline_predictions = pipeline.predict(test_data)
metrics.mean_absolute_error(test_labels, baseline_predictions)

121.83793258867532

### Подбор параметров

Подберем оптимальные параметры с помощью кросс-валидации

In [15]:
# List every tunable parameter name, including the nested '<step>__<param>' ones
pipeline.get_params(deep=True).keys()

dict_keys(['memory', 'steps', 'verbose', 'scaling', 'regression', 'scaling__copy', 'scaling__with_mean', 'scaling__with_std', 'regression__alpha', 'regression__average', 'regression__early_stopping', 'regression__epsilon', 'regression__eta0', 'regression__fit_intercept', 'regression__l1_ratio', 'regression__learning_rate', 'regression__loss', 'regression__max_iter', 'regression__n_iter_no_change', 'regression__penalty', 'regression__power_t', 'regression__random_state', 'regression__shuffle', 'regression__tol', 'regression__validation_fraction', 'regression__verbose', 'regression__warm_start'])

In [16]:
# Hyperparameter grid for the pipeline; keys use the '<step>__<param>' form.
# NOTE(review): newer scikit-learn releases renamed the 'squared_loss' loss to
# 'squared_error' — adjust the value if the environment is upgraded.
parameters_grid = {
    'regression__loss': ['huber', 'epsilon_insensitive', 'squared_loss'],
    'regression__max_iter': [3, 5, 10, 50],
    'regression__penalty': ['l1', 'l2', 'none'],
    'regression__alpha': [0.0001, 0.01],
    # with_mean is documented as a boolean flag, so list its two real options
    # explicitly instead of the float values 0. and 0.5.
    'scaling__with_mean': [False, True],
}

In [17]:
# Exhaustive search over parameters_grid, scored by negative MAE on 4 folds
grid_cv = model_selection.GridSearchCV(
    estimator=pipeline,
    param_grid=parameters_grid,
    scoring='neg_mean_absolute_error',
    cv=4,
)

In [18]:
# Run the cross-validated search on the training rows only
grid_cv.fit(X=train_data, y=train_labels)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaling',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('regression',
                                        SGDRegressor(alpha=0.0001,
                                                     average=False,
                                                     early_stopping=False,
                                                     epsilon=0.1, eta0=0.01,
                                                     fit_intercept=True,
                                                     l1_ratio=0.15,
                                                     learning_rate='invscaling',
                                                     loss='squared_loss',
      

In [19]:
# Best cross-validated score and the parameter combination that produced it
print(grid_cv.best_score_, grid_cv.best_params_, sep='\n')

-108.50546872459662
{'regression__alpha': 0.01, 'regression__loss': 'squared_loss', 'regression__max_iter': 3, 'regression__penalty': 'l2', 'scaling__with_mean': 0.0}


## Оценка модели

In [20]:
# Mean absolute error of the tuned pipeline on the held-out rows
tuned_predictions = grid_cv.best_estimator_.predict(test_data)
metrics.mean_absolute_error(test_labels, tuned_predictions)

121.64188924277826

Видим, что линейная регрессия на этих данных работает плохо, подбор параметров не помог. Возможно, стоит попробовать другую модель, например, случайный лес.

# Классификация

Для классификации будет использоваться датасет digits из sklearn.datasets

In [21]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

Разделим выборку на тренировочную и тестовую

In [25]:
# Load the digits dataset: a feature matrix and the class label of each sample
dig = datasets.load_digits()
data = dig.data
target = dig.target

# Ordered (unshuffled) split: the last 25% of the samples form the test set.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25, shuffle=False)

# Wrap features in DataFrames for display, but keep labels one-dimensional:
# wrapping them in pd.DataFrame would turn y into a single-column 2-D table,
# whereas estimators expect targets of shape (n_samples,).
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)
y_train, y_test = pd.Series(y_train), pd.Series(y_test)

X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [26]:
# Two candidate classifiers with default settings
knn = KNeighborsClassifier()
svm = SVC()

# Show which hyperparameters each of them exposes
knn_param_names = knn.get_params().keys()
svm_param_names = svm.get_params().keys()
print(knn_param_names, '\n')
print(svm_param_names)

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights']) 

dict_keys(['C', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])


Подберем наилучшие параметры с помощью кросс-валидации

In [27]:
# Candidate hyperparameter values searched for each classifier
param_grid_knn = dict(n_neighbors=[1, 2, 3, 5, 7])
param_grid_svm = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    C=[0.5, 1, 1.5, 2],
)

In [28]:
# Digit labels are categories, not quantities, so the numeric distance between
# a predicted and a true label is meaningless: select hyperparameters by
# classification accuracy rather than by (negative) mean absolute error.
grid_cv_knn = model_selection.GridSearchCV(knn, param_grid_knn, scoring='accuracy', cv=4)
grid_cv_svm = model_selection.GridSearchCV(svm, param_grid_svm, scoring='accuracy', cv=4)

In [29]:
# Flatten the labels to 1-D before fitting: estimators expect y of shape
# (n_samples,), and np.ravel accepts a column frame, a Series or an array alike.
grid_cv_knn.fit(X_train, np.ravel(y_train))
grid_cv_svm.fit(X_train, np.ravel(y_train))

print(grid_cv_knn.best_params_)
print(grid_cv_svm.best_params_,'\n')

# accuracy_score takes its arguments in (y_true, y_pred) order
y_true = np.ravel(y_test)
print('Точность модели KNN - {}'.format(accuracy_score(y_true, grid_cv_knn.best_estimator_.predict(X_test))))
print('Точность модели SVM - {}'.format(accuracy_score(y_true, grid_cv_svm.best_estimator_.predict(X_test))))

{'n_neighbors': 2}
{'C': 0.5, 'kernel': 'poly'} 

Точность модели KNN - 0.9622222222222222
Точность модели SVM - 0.9533333333333334


Видим, что у модели KNN результат немного лучше, чем у SVM