In [223]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from matplotlib import pyplot as plt
import seaborn as sns

def score_regression(model, X_train, y_train, X_test, y_test):
    print('Wynik R2 na zbiorze treningowym: ', model.score(X_train, y_train))
    print('Wynik R2 na zbiorze testowym: ', model.score(X_test, y_test))
    predictions = model.predict(X_test)
    
    errors = predictions - y_test
    errors = errors.astype(float)
    errors_stand = errors/y_test.std()
    mean_error = errors.mean()
    print('Średni błąd: ', mean_error)
    
    bound = len(errors)
    y_test_conv = np.sort(y_test.astype(float))
    fitted_poly = np.polyfit(y_test_conv, errors_stand, 3)
    poly_fun = np.poly1d(fitted_poly)
    fitted_y_up = [poly_fun(x)+0.5 for x in y_test_conv]
    fitted_y_down = [poly_fun(x)-0.5 for x in y_test_conv]

    plt.figure(figsize=(16,9), dpi=100)
    plt.subplot(3, 1, 1)
    sns.distplot(errors)
    plt.title('Rozkład wartości błędów')
    plt.xlabel('Wartość błędu')

    plt.subplot(3, 1, 2)
    plt.scatter(y_test_conv, errors_stan)
    plt.fill_between(y_test_conv, fitted_y_up, fitted_y_down, alpha=0.1)
    plt.title('Wartość błędu w funkcji targetu')
    plt.ylabel('Błąd ustandaryzowany')
    plt.xlabel('Wartość rzeczywista')

    plt.subplot(3, 1, 3)
    plt.scatter([i for i in range(0, bound)], errors_stan)
    plt.plot([0, bound], [errors.mean()/y_test.std(), errors.mean()/y_test.std()], '--', lw=3)
    plt.fill_between([0, bound], [1, 1], [errors.mean()-1, errors.mean()-1], alpha=0.1)
    plt.ylabel('Błąd ustandaryzowany')
    plt.subplots_adjust(left=0.05,
                        bottom=0.05, 
                        right=0.95, 
                        top=0.95, 
                        wspace=0.1, 
                        hspace=0.3)

In [181]:
with open('column_names.txt') as f:
    columns_names = f.readlines()
columns_names = [col_nm.replace('\n', '') for col_nm in columns_names]
columns_names

['mpg',
 'cylinders',
 'displacement',
 'horsepower',
 'weight',
 'acceleration',
 'model year',
 'origin',
 'car name']

In [182]:
data = pd.read_csv('auto-mpg_data.csv', sep=',', header=None, verbose=True)
data.columns = columns_names
data

Tokenization took: 1.00 ms
Type conversion took: 1.00 ms
Parser memory cleanup took: 0.00 ms


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0,chevrolet chevelle malibu
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0,buick skylark 320
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0,plymouth satellite
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0,amc rebel sst
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0,ford torino
...,...,...,...,...,...,...,...,...,...
401,27.0,4.0,140.0,86.0,2790.0,15.6,82.0,1.0,ford mustang gl
402,44.0,4.0,97.0,52.0,2130.0,24.6,82.0,2.0,vw pickup
403,32.0,4.0,135.0,84.0,2295.0,11.6,82.0,1.0,dodge rampage
404,28.0,4.0,120.0,79.0,2625.0,18.6,82.0,1.0,ford ranger


In [183]:
data = data[data['mpg'].isnull() == False]
data.shape

(398, 9)

In [184]:
data['model year'] = data['model year'].apply(lambda year: 93 - year)
data = data.rename(columns={'model year' : 'years'})
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['model year'] = data['model year'].apply(lambda year: 93 - year)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,years,origin,car name
0,18.0,8.0,307.0,130.0,3504.0,12.0,23.0,1.0,chevrolet chevelle malibu
1,15.0,8.0,350.0,165.0,3693.0,11.5,23.0,1.0,buick skylark 320
2,18.0,8.0,318.0,150.0,3436.0,11.0,23.0,1.0,plymouth satellite
3,16.0,8.0,304.0,150.0,3433.0,12.0,23.0,1.0,amc rebel sst
4,17.0,8.0,302.0,140.0,3449.0,10.5,23.0,1.0,ford torino
...,...,...,...,...,...,...,...,...,...
401,27.0,4.0,140.0,86.0,2790.0,15.6,11.0,1.0,ford mustang gl
402,44.0,4.0,97.0,52.0,2130.0,24.6,11.0,2.0,vw pickup
403,32.0,4.0,135.0,84.0,2295.0,11.6,11.0,1.0,dodge rampage
404,28.0,4.0,120.0,79.0,2625.0,18.6,11.0,1.0,ford ranger


In [185]:
X = data.values[:, 1:-1]
y = data.values[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, shuffle=True)
X_train.shape
y_train.shape

(318,)

In [None]:
%%time
from sklearn.linear_model import ElasticNet
pipe = Pipeline([
    ('imputation', SimpleImputer(strategy='most_frequent'))
    ,('scaling', MinMaxScaler())
    ,('regression', ElasticNet(random_state=1))
])
parameters = {
    'regression__alpha' : np.logspace(-3, 0, 100)
    ,'regression__l1_ratio' : np.linspace(0, 1, 100)
}
search = GridSearchCV(pipe, parameters, n_jobs=4, cv=4, verbose=3)
search = search.fit(X_train, y_train)
search.best_params_

Fitting 4 folds for each of 10000 candidates, totalling 40000 fits


In [224]:
score_regression(search, X_train, y_train, X_test, y_test)

Wynik R2 na zbiorze treningowym:  0.8095974019783
Wynik R2 na zbiorze testowym:  0.8645119031675297
Średni błąd:  -0.01660382835651215


