In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

import warnings
from sklearn.exceptions import ConvergenceWarning
from scipy.linalg import LinAlgWarning
# Silence solver convergence and ill-conditioned-matrix warnings
# (presumably raised while fitting the linear models below).
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
warnings.filterwarnings(action='ignore', category=LinAlgWarning)
# NOTE(review): this suppresses every UserWarning for the whole session,
# not only the expected ones — consider narrowing it with `module=`/`message=`.
warnings.filterwarnings(action='ignore', category=UserWarning)

# Zadanie
Proszę podzielić dane na train i test. Następnie wykonać walidację krzyżową na train i zewaluować na test:
* ElasticNet	
* Lasso	
* Ridge	
* LR	
* SVR	
* RFR	

Proszę nie zmieniać preprocessingu danych.

https://archive.ics.uci.edu/ml/datasets/automobile

https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data

In [2]:
# Location of the UCI "Automobile" data file (imports-85).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

# Column labels, in file order; passed to the CSV reader by a later cell.
names = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight',
    'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system',
    'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
    'city-mpg', 'highway-mpg', 'price',
]

In [3]:
# Read the comma-separated file using the column labels from `names`;
# "?" entries are parsed as missing values (NaN).
data = pd.read_csv(url, header=None, names=names, sep=",", na_values=["?"])
print(data.shape)

(205, 26)


In [4]:
# Preview the first rows of the freshly loaded table.
data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


# Zadanie
Proszę usunąć kolumny, które nie są numeryczne.

In [5]:
# Columns removed before modelling. Each label appears exactly once
# (the original list named 'make' twice).
# NOTE(review): 'symboling' holds integers in the preview above, yet it is
# dropped together with the text columns — kept as-is so the preprocessing
# does not change; confirm this is intended.
drop_list = [
    'symboling',
    'make',
    'fuel-type',
    'aspiration',
    'num-of-doors',
    'body-style',
    'drive-wheels',
    'engine-location',
    'engine-type',
    'num-of-cylinders',
    'fuel-system',
]

In [6]:
# Remove every column named in drop_list, then preview what is left.
data = data.drop(columns=drop_list)
data.head()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


# Zadanie
Proszę usunąć brakujące dane.

In [7]:
# Count the missing entries per column before deciding how to handle them.
null_counts = data.isna().sum()
print("Number of null values in each column:\n{}".format(null_counts))

Number of null values in each column:
normalized-losses    41
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-size           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64


In [8]:
# Keep only the rows in which no column is missing, then preview them.
data = data.dropna(axis=0, how="any")
data.head()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
3,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0
6,158.0,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110.0,5500.0,19,25,17710.0
8,158.0,105.8,192.7,71.4,55.9,3086,131,3.13,3.4,8.3,140.0,5500.0,17,20,23875.0
10,192.0,101.2,176.8,64.8,54.3,2395,108,3.5,2.8,8.8,101.0,5800.0,23,29,16430.0


# Zadanie
Proszę nauczyć wszystkie poznane modele regresji.
Który z nich działa najlepiej?

In [9]:
# Separate the cleaned table into the feature columns and the
# regression target ("price").
auto_data = data.drop(columns=["price"])
auto_target = data["price"]

In [10]:
# Preview the feature columns (everything except "price").
auto_data.head()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
3,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102.0,5500.0,24,30
4,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115.0,5500.0,18,22
6,158.0,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110.0,5500.0,19,25
8,158.0,105.8,192.7,71.4,55.9,3086,131,3.13,3.4,8.3,140.0,5500.0,17,20
10,192.0,101.2,176.8,64.8,54.3,2395,108,3.5,2.8,8.8,101.0,5800.0,23,29


In [11]:
# Preview the target column ("price").
auto_target.head()

3     13950.0
4     17450.0
6     17710.0
8     23875.0
10    16430.0
Name: price, dtype: float64

In [12]:
# Conventional short aliases for the feature matrix and target vector.
X, y = auto_data, auto_target

In [13]:
# Seed used by the cross-validation splitter defined in this cell.
seed = 123

# Hold out 40% of the rows as the test set; the remaining rows are the
# training set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

# Shuffled 5-fold splitter.
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

In [14]:
# ElasticNet on polynomial features. The degree and alpha given to the
# constructors are placeholders; the grid below overrides both.
elasticnet_pipeline = make_pipeline(
    PolynomialFeatures(degree=2),
    ElasticNet(alpha=1, random_state=seed),
)
elasticnet_param_grid = {
    'polynomialfeatures__degree': [1, 2, 3, 4],
    'elasticnet__alpha': [0.01, 0.1, 1, 10],
}

# Cross-validated search on the training split; the best candidate is
# refit on the whole training split.
grid_1 = GridSearchCV(
    elasticnet_pipeline,
    param_grid=elasticnet_param_grid,
    cv=kfold,
    refit=True,
)
grid_1.fit(X_train, y_train)
grid_1.best_params_

{'elasticnet__alpha': 10, 'polynomialfeatures__degree': 1}

In [15]:
# Lasso on polynomial features. The degree and alpha given to the
# constructors are placeholders; the grid below overrides both.
lasso_pipeline = make_pipeline(
    PolynomialFeatures(degree=2),
    Lasso(alpha=1, random_state=seed),
)
lasso_param_grid = {
    'polynomialfeatures__degree': [1, 2, 3, 4],
    'lasso__alpha': [0.01, 0.1, 1, 10],
}

# Cross-validated search on the training split; the best candidate is
# refit on the whole training split.
grid_2 = GridSearchCV(
    lasso_pipeline,
    param_grid=lasso_param_grid,
    cv=kfold,
    refit=True,
)
grid_2.fit(X_train, y_train)
grid_2.best_params_

{'lasso__alpha': 10, 'polynomialfeatures__degree': 1}

In [16]:
# Ridge on polynomial features. The degree and alpha given to the
# constructors are placeholders; the grid below overrides both.
ridge_pipeline = make_pipeline(
    PolynomialFeatures(degree=2),
    Ridge(alpha=1, random_state=seed),
)
ridge_param_grid = {
    'polynomialfeatures__degree': [1, 2, 3, 4],
    'ridge__alpha': [0.01, 0.1, 1, 10],
}

# Cross-validated search on the training split; the best candidate is
# refit on the whole training split.
grid_3 = GridSearchCV(
    ridge_pipeline,
    param_grid=ridge_param_grid,
    cv=kfold,
    refit=True,
)
grid_3.fit(X_train, y_train)
grid_3.best_params_

{'polynomialfeatures__degree': 1, 'ridge__alpha': 10}

In [17]:
# Ordinary least squares on polynomial features; only the polynomial
# degree is tuned (the degree given to the constructor is a placeholder).
linreg_pipeline = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
linreg_param_grid = {'polynomialfeatures__degree': [1, 2, 3, 4]}

# Cross-validated search on the training split; the best candidate is
# refit on the whole training split.
grid_4 = GridSearchCV(
    linreg_pipeline,
    param_grid=linreg_param_grid,
    cv=kfold,
    refit=True,
)
grid_4.fit(X_train, y_train)
grid_4.best_params_

{'polynomialfeatures__degree': 1}

In [18]:
# RBF-kernel SVR: C and gamma are searched over the same powers of ten.
# The gamma given to the constructor is a placeholder overridden by the grid.
# NOTE(review): the features are passed in unscaled, and RBF SVR is
# sensitive to feature scale; this may explain the weak test score reported
# further down. Scaling is deliberately not added here because the
# assignment asks not to change the preprocessing — confirm.
powers_of_ten = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
svr_param_grid = {
    'C': list(powers_of_ten),
    'gamma': list(powers_of_ten),
}

svr_estimator = SVR(kernel='rbf', gamma=0.1)
grid_5 = GridSearchCV(svr_estimator, param_grid=svr_param_grid, cv=kfold)
grid_5.fit(X_train, y_train)
grid_5.best_params_

{'C': 1000, 'gamma': 0.001}

In [19]:
# Random forest regressor: tune tree depth, features considered per split,
# and the number of trees.
#
# `max_features=None` (use all features) replaces the former 'auto' option.
# For RandomForestRegressor 'auto' meant exactly "all features", but the
# string was deprecated in scikit-learn 1.1 and removed in 1.3, where every
# candidate using it fails to fit. `None` is accepted by old and new
# versions alike and selects the same behaviour.
rfr_param_grid = {
    'max_depth': [1, 5, 10, 15, 20, 25],
    'max_features': [None, 'sqrt', 'log2'],
    'n_estimators': [5, 10, 15, 20, 25, 30],
}

grid_6 = GridSearchCV(
    RandomForestRegressor(random_state=3),
    param_grid=rfr_param_grid,
    cv=kfold,
)

grid_6.fit(X_train, y_train)
grid_6.best_params_

{'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 25}

In [20]:
# Best estimator found by each grid search, paired with its display label.
models = [
    ('ElasticNet', grid_1.best_estimator_),
    ('Lasso', grid_2.best_estimator_),
    ('Ridge', grid_3.best_estimator_),
    ('LR', grid_4.best_estimator_),
    ('SVR', grid_5.best_estimator_),
    ('RFR', grid_6.best_estimator_),
]

# One list per metric, filled in the same order as `models`; the summary
# table cell reads these lists. Their names mirror functions in
# sklearn.metrics, but those functions are always reached through the
# `metrics.` prefix, so nothing is shadowed.
r2 = []
explained_variance_score = []
median_absolute_error = []
mean_squared_error = []
mean_absolute_error = []

for name, model in models:
    # Predict once per model and compute each metric once, instead of
    # re-running predict() for every print and append.
    y_pred = model.predict(X_test)
    r2_value = metrics.r2_score(y_test, y_pred)
    evs_value = metrics.explained_variance_score(y_test, y_pred)
    medae_value = metrics.median_absolute_error(y_test, y_pred)
    mse_value = metrics.mean_squared_error(y_test, y_pred)
    mae_value = metrics.mean_absolute_error(y_test, y_pred)

    print(name)
    print("R^2: {}".format(r2_value))
    print("Explained variance score: {}".format(evs_value))
    print("Median absolute error: {}".format(medae_value))
    print("Mean squared error: {}".format(mse_value))
    print("Mean absolute errors: {}".format(mae_value))
    print()

    r2.append(r2_value)
    explained_variance_score.append(evs_value)
    median_absolute_error.append(medae_value)
    mean_squared_error.append(mse_value)
    mean_absolute_error.append(mae_value)

ElasticNet
R^2: 0.8180728547573288
Explained variance score: 0.8215773426590849
Median absolute error: 1141.1090471087737
Mean squared error: 4962475.461548957
Mean absolute errors: 1585.0980159798653

Lasso
R^2: 0.79233179230125
Explained variance score: 0.7944360686338403
Median absolute error: 1068.2050738633916
Mean squared error: 5664621.315715471
Mean absolute errors: 1680.9771970500378

Ridge
R^2: 0.8146541831138945
Explained variance score: 0.8156178861854784
Median absolute error: 1137.3566193485094
Mean squared error: 5055727.48349987
Mean absolute errors: 1640.4420602862156

LR
R^2: 0.778511941133442
Explained variance score: 0.7811707341356812
Median absolute error: 1075.2001844298793
Mean squared error: 6041589.097027196
Mean absolute errors: 1708.7734859070947

SVR
R^2: 0.029410100758887836
Explained variance score: 0.08794797550499767
Median absolute error: 1884.6727677110757
Mean squared error: 26475040.609176632
Mean absolute errors: 3290.706887612722

RFR
R^2: 0.88615

In [21]:
# Collect the per-model test metrics into one table (one row per model).
d = {
    'r2': r2,
    'explained_variance_score': explained_variance_score,
    'median_absolute_error': median_absolute_error,
    'mean_squared_error': mean_squared_error,
    'mean_absolute_error': mean_absolute_error,
}
df = pd.DataFrame(data=d)

# Take the row labels from `models` rather than a second hard-coded list,
# so the labels cannot drift out of order with the metric lists, which were
# filled by iterating over `models`.
method_labels = [label for label, _estimator in models]
df.insert(loc=0, column='Method', value=method_labels)
df

Unnamed: 0,Method,r2,explained_variance_score,median_absolute_error,mean_squared_error,mean_absolute_error
0,ElasticNet,0.818073,0.821577,1141.109047,4962475.0,1585.098016
1,Lasso,0.792332,0.794436,1068.205074,5664621.0,1680.977197
2,Ridge,0.814654,0.815618,1137.356619,5055727.0,1640.44206
3,LR,0.778512,0.781171,1075.200184,6041589.0,1708.773486
4,SVR,0.02941,0.087948,1884.672768,26475040.0,3290.706888
5,RFR,0.886152,0.889125,1004.433333,3105462.0,1305.736708
