In [15]:
import os
import pickle

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

from jcopml.pipeline import cat_pipe, num_pipe
from jcopml.tuning import grid_search_params as gsp

In [11]:
# Read the raw CSV; its "ID" column becomes the row index of the frame.
csv_path = "data/carprice.csv"
df = pd.read_csv(csv_path, index_col="ID")

# Preview the first five rows.
df.head(5)

Unnamed: 0_level_0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage_kmpl,Engine_CC,Power_bhp,Seats,Price
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74
6,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08,1461.0,63.1,5.0,3.5


In [12]:
# Number of missing entries in each column.
df.isnull().sum()

Name                   0
Location               0
Year                   0
Kilometers_Driven      0
Fuel_Type              0
Transmission           0
Owner_Type             0
Mileage_kmpl           2
Engine_CC             36
Power_bhp            142
Seats                 42
Price                  0
dtype: int64

In [13]:
# Exclude the free-text "Name" column from the modelling frame.
# Reassigning (instead of inplace=True) together with errors="ignore" makes
# this cell idempotent: running it a second time no longer raises KeyError
# because "Name" is already gone.
df = df.drop(columns="Name", errors="ignore")

# (rows, columns) remaining after the drop.
df.shape

(5953, 11)

### Dataset Splitting

In [14]:
# Target is the "Price" column; every other remaining column is a feature.
y = df["Price"]
X = df.drop(columns=["Price"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes of the four resulting pieces, in the order train/test features, train/test target.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4762, 10), (1191, 10), (4762,), (1191,))

### Training

In [21]:
# Columns routed to each preprocessing branch.
numeric_cols = ["Year", "Kilometers_Driven", "Mileage_kmpl", "Engine_CC", "Power_bhp", "Seats"]
categoric_cols = ["Location", "Fuel_Type", "Transmission", "Owner_Type"]

# The grid below sets prep__numeric__poly__degree itself, so poly=2 here is
# only the value the numeric branch starts with.
numeric_branch = ('numeric', num_pipe(scaling='robust', poly=2), numeric_cols)
categoric_branch = ('categoric', cat_pipe(encoder='onehot'), categoric_cols)
preprocessor = ColumnTransformer([numeric_branch, categoric_branch])

# NOTE(review): max_iter=500 caps the SVR solver's iterations, presumably to
# keep the search fast -- confirm the fits actually converge within that limit.
pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('algo', SVR(max_iter=500)),
])

# Seven powers of ten from 1e-3 to 1e3, searched for both gamma and C.
# Together with 3 degrees x 2 interaction settings this yields 294 candidates.
decades = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
parameter = {
    'prep__numeric__poly__degree': [1, 2, 3],
    'prep__numeric__poly__interaction_only': [True, False],
    'algo__gamma': list(decades),
    'algo__C': list(decades),
}

# Exhaustive search with 2-fold cross-validation on all available cores.
model = GridSearchCV(estimator=pipeline, param_grid=parameter, cv=2, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print(model.best_params_)

# Train score, mean cross-validation score of the best candidate, test score.
train_score = model.score(X_train, y_train)
cv_score = model.best_score_
test_score = model.score(X_test, y_test)
print(train_score, cv_score, test_score)

Fitting 2 folds for each of 294 candidates, totalling 588 fits




{'algo__C': 100.0, 'algo__gamma': 0.01, 'prep__numeric__poly__degree': 1, 'prep__numeric__poly__interaction_only': True}
0.8093353730381811 0.8345948962656303 0.7564216743994782


### Save Model

In [22]:
# Make sure the output directory exists before writing into it.
os.makedirs("Models", exist_ok=True)

filename = "Models/svm_carPrice.pkl"

# Persist only the best pipeline found by the search (no search report).
# The `with` block guarantees the file handle is flushed and closed, even if
# pickling raises part-way through.
with open(filename, "wb") as model_file:
    pickle.dump(model.best_estimator_, model_file)

# To keep the search report as well, pickle `model` (the whole GridSearchCV
# object) instead of `model.best_estimator_`.