In [1]:
import numpy as np

# Bonus

👇 Consider the following dataset

In [123]:
import pandas as pd

data = pd.read_csv("data.csv")

len(data)

205

👇 Build an optimal pipeline to predict the price of cars according to their specificities. Once your pipeline is ready, use `permutation_importance` to find out which feature is the most informative of the car price.

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.compose import ColumnTransformer

In [4]:
X = data.drop(columns = 'price')
y = data['price']

In [5]:
num_col = make_column_selector(dtype_include=['float64','int64'])

cat_col = make_column_selector(dtype_include=['object','bool'])

ct = ColumnTransformer([("num_t",MinMaxScaler(),num_col),
                        ("cat_t",OneHotEncoder(sparse=False),cat_col)])
X_new = ct.fit_transform(X)

X_new = pd.DataFrame(X_new)

X_new.columns = ct.get_feature_names_out()

X_new

Unnamed: 0,num_t__car_ID,num_t__symboling,num_t__wheelbase,num_t__carlength,num_t__carwidth,num_t__carheight,num_t__curbweight,num_t__enginesize,num_t__boreratio,num_t__stroke,...,cat_t__cylindernumber_twelve,cat_t__cylindernumber_two,cat_t__fuelsystem_1bbl,cat_t__fuelsystem_2bbl,cat_t__fuelsystem_4bbl,cat_t__fuelsystem_idi,cat_t__fuelsystem_mfi,cat_t__fuelsystem_mpfi,cat_t__fuelsystem_spdi,cat_t__fuelsystem_spfi
0,0.000000,1.0,0.058309,0.413433,0.316667,0.083333,0.411171,0.260377,0.664286,0.290476,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.004902,1.0,0.058309,0.413433,0.316667,0.083333,0.411171,0.260377,0.664286,0.290476,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.009804,0.6,0.230321,0.449254,0.433333,0.383333,0.517843,0.343396,0.100000,0.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.014706,0.8,0.384840,0.529851,0.491667,0.541667,0.329325,0.181132,0.464286,0.633333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.019608,0.8,0.373178,0.529851,0.508333,0.541667,0.518231,0.283019,0.464286,0.633333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,0.980392,0.2,0.655977,0.711940,0.716667,0.641667,0.567882,0.301887,0.885714,0.514286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
201,0.985294,0.2,0.655977,0.711940,0.708333,0.641667,0.605508,0.301887,0.885714,0.514286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
202,0.990196,0.2,0.655977,0.711940,0.716667,0.641667,0.591156,0.422642,0.742857,0.380952,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
203,0.995098,0.2,0.655977,0.711940,0.716667,0.641667,0.670675,0.316981,0.335714,0.633333,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [169]:
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR

model = ElasticNet().fit(X_new, y) # Fit model

permutation_score = permutation_importance(model, X_new, y, n_repeats=10, scoring='r2') # Perform Permutation

importance_df = pd.DataFrame(np.vstack((X_new.columns,
                                        permutation_score.importances_mean)).T) # Unstack results
importance_df.columns=['feature','score decrease']


In [170]:
importance_df.sort_values(by="score decrease", ascending = False)

Unnamed: 0,feature,score decrease
187,cat_t__cylindernumber_four,0.087947
175,cat_t__drivewheel_rwd,0.056957
174,cat_t__drivewheel_fwd,0.046501
197,cat_t__fuelsystem_mpfi,0.035472
193,cat_t__fuelsystem_2bbl,0.024905
...,...,...
24,cat_t__CarName_audi fox,-0.0
69,cat_t__CarName_mazda glc,-0.0
59,cat_t__CarName_honda prelude,-0.000001
90,cat_t__CarName_nissan kicks,-0.000001


In [171]:
len(importance_df[importance_df['score decrease']>0.001])

29

In [187]:
nb_features = 40
good_features = importance_df.sort_values(by="score decrease", ascending = False)['feature'][:nb_features].values

X_gd_features = X_new[good_features]

In [188]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(LinearRegression(), X_gd_features, y, cv = 10)
score = scores.mean()

In [189]:
score

-5.259201638285227e+17