# Regression Example with Sklearn

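This notebook compares Linear Regression and k-Nearest Neighbors (kNN) regression against a mean-prediction baseline, evaluating each model with 10-fold cross-validation on the training split.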

## Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

## Load Data

In [2]:
# Read the dataset from the local CSV file and preview the first rows.
csv_path = 'data/diabetes.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(442, 11)

## Train / Test

In [4]:
# Separate the response column from the predictors.
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows as a test set; shuffling uses a fixed seed
# so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

## Cross-validation

In [5]:
# 10-fold cross-validation splitter, shared by every model below.
# Shuffling with a fixed seed keeps the fold assignment reproducible.
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [6]:
# Metrics reported for every model, keyed by the label used in the
# cross_validate result ('train_<label>' / 'test_<label>').
#
# NOTE: per the scikit-learn docs, make_scorer defaults to
# greater_is_better=True, so these scorers return the raw (positive) error.
# That is what we want for reporting, but they must not be reused for model
# selection (e.g. GridSearchCV) without passing greater_is_better=False.
mse_scorer = make_scorer(mean_squared_error)
mae_scorer = make_scorer(mean_absolute_error)
scoring = {'MSE': mse_scorer, 'MAE': mae_scorer}

## Baseline

In [7]:
# Baseline: a dummy model that always predicts the mean training target.
bl = DummyRegressor(strategy='mean')
cv_results = cross_validate(
    bl, X_train, y_train,
    cv=kf,
    scoring=scoring,
    return_train_score=True,
)

# Average each metric over the folds, rounded to one decimal place.
train_mse = cv_results['train_MSE'].mean().round(1)
val_mse = cv_results['test_MSE'].mean().round(1)
train_mae = cv_results['train_MAE'].mean().round(1)
val_mae = cv_results['test_MAE'].mean().round(1)

print('Train MSE:', train_mse)
print('Validation MSE:', val_mse)
print()
print('Train MAE:', train_mae)
print('Validation MAE:', val_mae)

Train MSE: 6074.5
Validation MSE: 6110.8

Train MAE: 66.4
Validation MAE: 66.7


## Linear Regression

In [8]:
# Linear regression on standardized features. Scaling lives inside the
# pipeline so it is re-fit on the training part of each fold.
lr = Pipeline([('scale', StandardScaler()), ('lr', LinearRegression())])

cv_results = cross_validate(
    lr, X_train, y_train,
    cv=kf,
    scoring=scoring,
    return_train_score=True,
)

# Mean over the folds for every array returned by cross_validate,
# rounded to one decimal place.
fold_means = {key: values.mean().round(1) for key, values in cv_results.items()}

print('Train MSE:', fold_means['train_MSE'])
print('Validation MSE:', fold_means['test_MSE'])
print()
print('Train MAE:', fold_means['train_MAE'])
print('Validation MAE:', fold_means['test_MAE'])

Train MSE: 2858.4
Validation MSE: 3067.3

Train MAE: 43.4
Validation MAE: 45.1


## k-Nearest Neighbors

In [9]:
# kNN regressor: 5 neighbors, Minkowski distance with p=2 (Euclidean),
# fitted on standardized features so no single feature dominates the distance.
knn_steps = [
    ('scale', StandardScaler()),
    ('knn', KNeighborsRegressor(
        n_neighbors=5,
        p=2,
        n_jobs=-1,
    )),
]
knn = Pipeline(knn_steps)

cv_results = cross_validate(
    knn, X_train, y_train,
    cv=kf,
    scoring=scoring,
    return_train_score=True,
)

# Report the fold-averaged errors, rounded to one decimal place.
print('Train MSE:', cv_results['train_MSE'].mean().round(1))
print('Validation MSE:', cv_results['test_MSE'].mean().round(1))
print()
print('Train MAE:', cv_results['train_MAE'].mean().round(1))
print('Validation MAE:', cv_results['test_MAE'].mean().round(1))

Train MSE: 2582.9
Validation MSE: 3870.8

Train MAE: 40.2
Validation MAE: 50.0
