In [19]:
import pandas as pd

# Read ./ready.csv (relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="./ready.csv")
# Bare last expression: the notebook displays the (rows, columns) tuple.
df.shape

(11611, 8)

In [20]:
from sklearn.model_selection import train_test_split

# Predictor columns and the regression target, both selected by column name.
selected_features = [
    "Manufacturer",
    "Model",
    "Prod. year",
    "Color",
    "Engine volume",
    "Mileage",
]
target = "Price"

X, y = df[selected_features], df[target]

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
print(X_train.shape, y_train.shape)

(8127, 6) (8127,)


In [21]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Categorical branch: one-hot encode; categories not seen during fitting are
# ignored rather than raising.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")

# Numeric branch: standardise first, then expand into polynomial terms.
# degree=5 is only the initial value -- the grid search below sets the degree
# for every candidate it fits.
numeric_transformer = Pipeline([
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=5, include_bias=False)),
])

# Route each column group to its branch by column name.
preprocessor = ColumnTransformer([
    ("cat", one_hot_encoder, ["Manufacturer", "Model", "Color"]),
    ("num", numeric_transformer, ["Engine volume", "Mileage", "Prod. year"]),
])

# Full model: preprocessing followed by ordinary least squares.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
])

# Candidate polynomial degrees, addressed through the nested step names
# preprocessor -> num -> poly.
parameters = {"preprocessor__num__poly__degree": [2, 3, 4, 5, 7, 9]}
poly_grid = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    scoring="neg_mean_squared_error",
    verbose=True,
)

poly_grid.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [22]:

# Hyper-parameter combination selected by the grid search, kept under its own
# name and echoed for the reader.
best_parameters = poly_grid.best_params_
print('best order is :', best_parameters)

best order is : {'preprocessor__num__poly__degree': 3}


In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict on both splits with the fitted grid search.
y_train_pred = poly_grid.predict(X_train)
y_pred = poly_grid.predict(X_test)

# Report the same three metrics per split: the first group printed belongs to
# the training split, the second to the test split. Once the loop finishes,
# mae / mse / r2 hold the test-split values.
for split_y, split_pred in ((y_train, y_train_pred), (y_test, y_pred)):
    mae = mean_absolute_error(split_y, split_pred)
    print('Mean absolute error: ', round(mae, 2))
    mse = mean_squared_error(split_y, split_pred)
    print('Mean squared error: ', round(mse, 2))
    r2 = r2_score(split_y, split_pred)
    print('R2 score is: ', r2)

Mean absolute error:  8646.62
Mean squared error:  144790600.29
R2 score is:  0.4007279545475949
Mean absolute error:  8742.92
Mean squared error:  145081382.37
R2 score is:  0.40796335552067053
