In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor

data = pd.read_csv('ford.csv')
df = pd.DataFrame(data)

#separate X and y
X = df.drop(columns = ['price'])
y = df['price']

#get dummies
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numerical_columns = X.select_dtypes(include=['int64','float64']).columns.tolist()

df_encoded = pd.get_dummies(X,columns=categorical_columns)

X = pd.concat([df_encoded,df[numerical_columns]],axis=1)

#scale
scaler = StandardScaler()

X = scaler.fit_transform(X)

#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#define the model
base_svr = SVR()

#define a Grid Search
param_grid = {
    'kernel' : ['linear' , 'rbf' , 'poly'],
    'C' : [0.1,1,10,100]
}

grid_search = GridSearchCV(base_svr,param_grid,cv = 5, scoring = 'neg_mean_squared_error')
grid_search.fit(X_train,y_train)

#get best parameters for model
best_params = grid_search.best_params_

svr = SVR(**best_params)
svr.fit(X_train,y_train)

#make predictions
y_train_pred = svr.predict(X_train)
y_test_pred = svr.predict(X_test)
cv = -cross_val_score(svr,X,y,cv=5,scoring = 'neg_mean_squared_error')

#calculate mse
mse_train = mean_squared_error(y_train,y_train_pred)
mse_test = mean_squared_error(y_test,y_test_pred)
mse_cv = np.mean(cv)

#define a baseline model
baseline = DummyRegressor()
baseline.fit(X_train,y_train)
baseline_train_pred = baseline.predict(X_train)
baseline_test_pred = baseline.predict(X_test)
baseline_cv = -cross_val_score(baseline,X,y,cv = 5, scoring = 'neg_mean_squared_error')
baseline_train_mse = mean_squared_error(y_train,baseline_train_pred)
baseline_test_mse = mean_squared_error(y_test,baseline_test_pred)
baseline_cv_mse = np.mean(baseline_cv)

#r squared
r_squared_train = svr.score(X_train,y_train)
r_squared_test = svr.score(X_test,y_test)

#print
print('mse train = ',mse_train)
print('mse test = ',mse_test)
print('mse cv = ',mse_cv)
print('----------------------')
print('mse baseline train = ',baseline_train_mse)
print('mse baseline test = ',baseline_test_mse)
print('mse baseline cv = ',baseline_cv_mse)
print('----------------------')
print('Ratio mse train = ',baseline_train_mse / mse_train)
print('Ratio mse test = ',baseline_test_mse / mse_test)
print('Ratio mse cv = ',baseline_cv_mse / mse_cv)
print('----------------------')
print('r_squared (train)',r_squared_train)
print('r_squared (test)',r_squared_test)

mse train =  3660221.2794294856
mse test =  3478582.2030032664
mse cv =  4055504.643148951
----------------------
mse baseline train =  22494365.559566088
mse baseline test =  22419483.48616434
mse baseline cv =  22927737.505292255
----------------------
Ratio mse train =  6.145629961222524
Ratio mse test =  6.445006090932182
Ratio mse cv =  5.653485699745052
----------------------
r_squared (train) 0.8372827511077361
r_squared (test) 0.8447896581410542
