In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.dummy import DummyRegressor

# Load the Ford listings from disk. `data` keeps the full table as read;
# `df` is the working frame, limited to the first 2000 rows.
data = pd.read_csv('ford.csv')
df = pd.DataFrame(data).head(2000)

# Target is the `price` column; every remaining column is a candidate feature.
y = df['price']
X = df.drop(columns=['price'])

#split into numeric and non numeric columns
non_numeric = X.select_dtypes(include=['object']).columns.tolist()
numeric = X.select_dtypes(include=['int64','float64']).columns.tolist()

#encode data
# get_dummies only replaces the columns named in `columns` with indicator
# columns; every other column (i.e. the numeric ones) is passed through
# unchanged. The numeric columns must therefore NOT be concatenated onto the
# result again - doing so would put each numeric feature into X twice.
df_encoded = pd.get_dummies(X, columns=non_numeric)
X = df_encoded

#split data into train and test
# The split is done BEFORE scaling so that the held-out rows never influence
# the statistics the scaler learns (fitting the scaler on the full matrix
# leaks test-set information into preprocessing).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # learn mean/std from the training rows only
X_test = scaler.transform(X_test)        # apply the training statistics to the test rows

# Keep the full feature matrix available under its original name, scaled with
# the same training statistics, for code that operates on all rows.
X = scaler.transform(X)

#define a model
# A fixed random_state makes the search reproducible: the tree permutes
# features at every split and, with max_features='sqrt', samples a random
# feature subset, so without a seed the selected parameters and all reported
# scores change from run to run. 42 matches the seed used for the
# train/test split.
regressor = DecisionTreeRegressor(random_state=42)

#define a param_grid to perform a pre and post pruning
# Pre-pruning: depth / split / leaf / leaf-count / feature limits.
# Post-pruning: cost-complexity pruning strength (ccp_alpha).
param_grid = {
    'max_depth': [5,10,15],
    'min_samples_split':[5,10,15],
    'min_samples_leaf': [2,5,8],
    'max_leaf_nodes': [10,15,20],
    'max_features': [1.0,'sqrt'],
    'ccp_alpha' : [0.001,0.01,0.1,1,10]
}

#define a grid search
# Exhaustive search over the grid with 5-fold CV on the training split only;
# the scorer is negated MSE, so "higher is better" holds for the search.
grid_search = GridSearchCV(regressor,param_grid,cv=5,scoring = 'neg_mean_squared_error')
grid_search.fit(X_train,y_train)

#get the best parameters for the model
best_params = grid_search.best_params_

#make a new instance of the regression model with the new best parameters
# best_params holds exactly the keys of param_grid, so it can be unpacked
# directly; the same seed as during the search keeps the final tree
# reproducible as well.
best_regressor = DecisionTreeRegressor(random_state=42, **best_params)

#fit the new model
best_regressor.fit(X_train,y_train)

# Predictions of the tuned tree on the training rows and on the held-out rows.
y_train_pred, y_test_pred = (
    best_regressor.predict(features) for features in (X_train, X_test)
)
# Coefficient of determination on the test split, as reported by `score`.
R_squared = best_regressor.score(X_test, y_test)

# Per-fold MSE from a 5-fold cross-validation. The scorer returns negated
# MSE, hence the sign flip.
# NOTE(review): the folds are cut from the full X, y, so they also contain
# the rows that were held out as the test split.
cv = np.negative(
    cross_val_score(best_regressor, X, y, cv=5, scoring='neg_mean_squared_error')
)

# Reference point for the MSE values: a dummy model that always predicts the
# mean of the training targets.
baseline_model = DummyRegressor(strategy='mean').fit(X_train, y_train)

# Baseline predictions for both splits.
y_train_baseline_pred = baseline_model.predict(X_train)
y_test_baseline_pred = baseline_model.predict(X_test)

# Baseline errors, computed the same way as for the real model.
mse_baseline_train = mean_squared_error(y_train, y_train_baseline_pred)
mse_baseline_test = mean_squared_error(y_test, y_test_baseline_pred)

for label, value in (
    ('Baseline Model - MSE (Train):', mse_baseline_train),
    ('Baseline Model - MSE (Test):', mse_baseline_test),
):
    print(label, value)

# Mean squared error of the tuned tree on each split, plus the average of the
# per-fold cross-validation errors.
mse_test = mean_squared_error(y_test, y_test_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mse_cv = cv.mean()

# Report all metrics, one per line, in a fixed order.
for label, value in (
    ('R squared = ', R_squared),
    ('mse train = ', mse_train),
    ('mse test = ', mse_test),
    ('mse cv = ', mse_cv),
):
    print(label, value)