In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor

data = pd.read_csv('ford.csv')
df = pd.DataFrame(data)

#separate X and y
X = df.drop(columns = ['price'])
y = df['price']

#get dummies
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numerical_columns = X.select_dtypes(include=['int64','float64']).columns.tolist()

df_encoded = pd.get_dummies(X,columns=categorical_columns)

X = pd.concat([df_encoded,df[numerical_columns]],axis=1)

#scale
scaler = StandardScaler()

X = scaler.fit_transform(X)

#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#define a model
regressor = RandomForestRegressor()

param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(5,20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(2, 20),
    'max_features': [1.0, 'sqrt'],
    'random_state': [42]
}

random_search = RandomizedSearchCV(regressor,param_distributions = param_dist, cv = 5, scoring = 'neg_mean_squared_error')

random_search.fit(X_train,y_train)

best_params = random_search.best_params_

best_regressor = RandomForestRegressor(**best_params)

best_regressor.fit(X_train,y_train)

#predict values
y_train_pred = best_regressor.predict(X_train)
y_test_pred = best_regressor.predict(X_test)

#cross Validation score
cv = -cross_val_score(best_regressor,X,y,cv = 5,scoring='neg_mean_squared_error')

#define a baseline model
baseline_regressor = DummyRegressor()
baseline_regressor.fit(X_train,y_train)
baseline_train_pred = baseline_regressor.predict(X_train)
baseline_test_pred = baseline_regressor.predict(X_test)
baseline_cv = -cross_val_score(baseline_regressor,X,y,cv = 5,scoring = 'neg_mean_squared_error')

#calculate mse for baseline
baseline_mse_train = mean_squared_error(y_train,baseline_train_pred)
baseline_mse_test = mean_squared_error(y_test,baseline_test_pred)
baseline_mse_cv = np.mean(baseline_cv)

#calculate mse
mse_train = mean_squared_error(y_train,y_train_pred)
mse_test = mean_squared_error(y_test,y_test_pred)
mse_cv = np.mean(cv)
r_squared_train = best_regressor.score(X_train,y_train)
r_squared_test = best_regressor.score(X_test,y_test)

#print
print('mse train = ',mse_train)
print('mse test = ',mse_test)
print('mse cv = ',mse_cv)
print('-----------')
print('baseline mse train = ',baseline_mse_train)
print('baseline mse test = ',baseline_mse_test)
print('baseline mse cv = ',baseline_mse_cv)
print('-----------')
print('Ratio baseline_train_mse / mse_train = ',baseline_mse_train/mse_train)
print('Ratio baseline_test_mse / mse_test = ',baseline_mse_test/mse_test)
print('Ratio baseline_cv_mse / mse_cv = ',baseline_mse_cv/mse_cv)
print('-----------')
print('r_squared (train)',r_squared_train)
print('r_squared (test)',r_squared_test)

mse train =  743596.5400625424
mse test =  1467904.9882805683
mse cv =  1577517.5598844509
-----------
baseline mse train =  22494365.559566088
baseline mse test =  22419483.48616434
baseline mse cv =  22927737.505292255
-----------
Ratio baseline_train_mse / mse_train =  30.250766844173498
Ratio baseline_test_mse / mse_test =  15.273116220161782
Ratio baseline_cv_mse / mse_cv =  14.534061672804235
-----------
r_squared (train) 0.9669429867629089
r_squared (test) 0.9345037656862684
