In [30]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn import model_selection
from sklearn import metrics
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

# Zadanie
Proszę pobrać dane z 

http://archive.ics.uci.edu/ml/datasets/Auto+MPG

http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data

In [2]:
# Location of the raw Auto MPG data file on the UCI repository.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

# Column labels handed to the CSV reader, in file order.
names = [
    "mpg",
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "model year",
    "origin",
    "car name",
]



In [3]:
# Parse the whitespace-separated file; "?" entries are read as NaN.
# The separator is a raw string so that "\s" reaches pandas as a regex token
# instead of being treated as an (invalid) Python string escape sequence.
data = pd.read_csv(url, names=names, sep=r"\s+", na_values=["?"])
print(data.shape)

(398, 9)


In [4]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


# Zadanie
Proszę usunąć kolumny, które nie są numeryczne.

In [5]:
# Labels of the columns to remove from the frame (free-text car names).
drop_list = ["car name"]

In [6]:
# Remove the listed columns by label, then preview the remaining ones.
data = data.drop(columns=drop_list)
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


# Zadanie
Proszę usunąć brakujące dane.

In [7]:
# Count the missing entries in every column and print the per-column totals.
null_counts = data.isna().sum()
report_template = "Number of null values in each column:\n{}"
print(report_template.format(null_counts))

Number of null values in each column:
mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
dtype: int64


In [8]:
# Discard every row that contains at least one missing value, then preview.
data = data.dropna(axis="index", how="any")
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


# Zadanie
Proszę nauczyć wszystkie poznane modele regresji.
Który z nich działa najlepiej?

In [9]:
# Features are every column except "acceleration"; "acceleration" itself is
# the regression target.
auto_data = data.drop(columns=["acceleration"])
auto_target = data["acceleration"]

In [10]:
# Preview the feature frame (the "acceleration" column has been removed).
auto_data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,model year,origin
0,18.0,8,307.0,130.0,3504.0,70,1
1,15.0,8,350.0,165.0,3693.0,70,1
2,18.0,8,318.0,150.0,3436.0,70,1
3,16.0,8,304.0,150.0,3433.0,70,1
4,17.0,8,302.0,140.0,3449.0,70,1


In [11]:
# Preview the target series ("acceleration").
auto_target.head()

0    12.0
1    11.5
2    11.0
3    12.0
4    10.5
Name: acceleration, dtype: float64

In [12]:
# Short aliases used by the modelling cells: X for the features, y for the target.
X, y = auto_data, auto_target

In [19]:
from sklearn.model_selection import train_test_split

seed = 123
# Split from the untouched auto_data / auto_target rather than from X / y.
# Splitting X / y and assigning the result back to X / y meant that every
# re-run of this cell split the already reduced training set again, silently
# shrinking it. X / y still receive the 80% training part and X_test / y_test
# the 20% hold-out, so the cells below work unchanged.
X, X_test, y, y_test = train_test_split(
    auto_data, auto_target, test_size=0.2, random_state=seed
)

In [20]:
# 5-fold cross-validation splitter passed as `cv` to the grid searches below.
# shuffle=False is the library default, spelled out here for clarity.
kfold = model_selection.KFold(n_splits=5, shuffle=False)

In [25]:
%%capture --no-display
grid_4 = GridSearchCV(make_pipeline(PolynomialFeatures(degree=2), linear_model.LinearRegression()),
                    param_grid={'polynomialfeatures__degree': [1, 2, 3, 4]},
                    cv=kfold,
                    refit=True)
grid_4.fit(X, y)
grid_4.best_params_

{'polynomialfeatures__degree': 1}

In [26]:
# R^2 of the refit best linear-regression pipeline on the held-out test split.
y_pred_4 = grid_4.best_estimator_.predict(X_test)
metrics.r2_score(y_true=y_test, y_pred=y_pred_4)

0.5187301849551149

In [33]:
%%capture --no-display
grid_1 = GridSearchCV(make_pipeline(PolynomialFeatures(degree=2), ElasticNet(alpha=1, random_state=seed)),
                    param_grid={'polynomialfeatures__degree': [1, 2, 3, 4],
                    #'elasticnet__alpha': [1, 100, 1000, 10000, 100000]},
                    #'elasticnet__alpha': [0.01+0.02*i for i in range(0,11)]},
                    #'elasticnet__alpha': [0.1, 0.2, 0.25, 0.3, 0.35]},
                    'elasticnet__alpha': [1000 - 2*100*i for i in range(6)] + [1000 + 2*1000*i for i in range(1, 6)]},          
                    cv=kfold,
                    refit=True)
grid_1.fit(X, y)
grid_1.best_params_

{'elasticnet__alpha': 600, 'polynomialfeatures__degree': 3}

In [34]:
# R^2 of the refit best elastic-net pipeline on the held-out test split.
y_pred_1 = grid_1.best_estimator_.predict(X_test)
metrics.r2_score(y_true=y_test, y_pred=y_pred_1)

0.6862960748633168

In [43]:
%%capture --no-display
grid_2 = GridSearchCV(make_pipeline(PolynomialFeatures(degree=2), linear_model.Lasso(alpha=1, tol=0.1)),
                    param_grid={'polynomialfeatures__degree': [1, 2, 3, 4],
                    'lasso__alpha': [0.01, .1, 1, 10]},
                    cv=kfold,
                    refit=True)
grid_2.fit(X, y)
grid_2.best_params_

{'lasso__alpha': 0.1, 'polynomialfeatures__degree': 2}

In [44]:
# R^2 of the refit best lasso pipeline on the held-out test split.
y_pred_2 = grid_2.best_estimator_.predict(X_test)
metrics.r2_score(y_true=y_test, y_pred=y_pred_2)

0.6990021500368556

In [38]:
%%capture --no-display
grid_3 = GridSearchCV(make_pipeline(PolynomialFeatures(degree=2), linear_model.Ridge()),
                    param_grid={'polynomialfeatures__degree': [1, 2, 3],
                    #'ridge__alpha': [0.01, 0.1, 1, 10, 100, 1000]},
                    'ridge__alpha': [100 - 2*10*i for i in range(6)] + [100 + 2*10*i for i in range(1, 6)]},
                    cv=kfold,
                    refit=True)
grid_3.fit(X, y)
grid_3.best_params_

{'polynomialfeatures__degree': 2, 'ridge__alpha': 180}

In [39]:
# R^2 of the refit best ridge pipeline on the held-out test split.
y_pred_3 = grid_3.best_estimator_.predict(X_test)
metrics.r2_score(y_true=y_test, y_pred=y_pred_3)

0.6594161241455523