# Part 1 : Decision Trees (Regression)
- Hyperparameter Optimization (Grid Search)

## Import Libraries

In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error

## Import Data

In [2]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/car-price-prediction/CarPrice_Assignment.csv
/kaggle/input/car-price-prediction/Data Dictionary - carprices.xlsx


In [3]:
# Load the car-price CSV listed above into a DataFrame.
data = pd.read_csv(
    '/kaggle/input/car-price-prediction/CarPrice_Assignment.csv'
)

### Describe Data

In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

## Partition Data into X and y

In [5]:
# Partition Data into X and y
# 'price' is the regression target; 'car_ID' is dropped from the features too
# (presumably it is only a row identifier — confirm).
y = data['price']
X = data.drop(['car_ID', 'price'], axis=1)

# Create Dummy Variables
# One-hot encode the non-numeric columns so the tree receives numeric inputs.
# NOTE(review): 'CarName' is encoded here as well and may be high-cardinality —
# confirm that this is intended.
X = pd.get_dummies(X)

# train test split
# Hold out 33% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

## Decision Tree with Hyperparameter Tuning

### 1/ Instantiate the Model

In [6]:
# Base regressor handed to the grid search; the seed is fixed at 42.
dt_model = DecisionTreeRegressor(random_state=42)

### 2/ Create Parameter Grid

In [7]:
# Search space: each split criterion is combined with each tree depth 3..10.
dt_param_grid = dict(
    criterion=['squared_error', 'poisson', 'absolute_error'],
    max_depth=list(range(3, 11)),
)

### 3/ Initiate a GridSearchCV instance
- model
- parameter grid
- scoring

In [8]:
# Grid search over dt_param_grid, scored by negated RMSE
# ('neg_mean_squared_error' would be the MSE-based alternative).
# No cv argument is passed, so the library default is used.
gs_dt = GridSearchCV(
    estimator=dt_model,
    param_grid=dt_param_grid,
    scoring='neg_root_mean_squared_error',
)

### 4/ Fit the Grid Search to Get the Best Estimator

In [9]:
# Run the search on the training split and keep the winning tree.
# fit() returns the search object itself, so the attribute lookup is chained.
best_dt_estimates = gs_dt.fit(X_train, y_train).best_estimator_
best_dt_estimates

DecisionTreeRegressor(max_depth=6, random_state=42)

### 5/ Fit Model with Best Estimator

In [10]:
# Fit the selected tree on the training split.
# NOTE(review): GridSearchCV is created without a `refit` argument, and its
# documented default refits the best estimator on the full training data, so
# this call is presumably redundant — confirm before removing.
best_dt_estimates.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=6, random_state=42)

### 6/ Make predictions

In [11]:
# Predict on both splits so train and test error can be compared.
y_train_predicted, y_test_predicted = (
    best_dt_estimates.predict(features) for features in (X_train, X_test)
)

### 7/ Check Performance

#### Check train & test Performance

In [12]:
# Summarise the tuned tree's RMSE on the train and test splits.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, because
# newer scikit-learn releases deprecate and then remove that argument from
# mean_squared_error; this form works on old and new versions alike.
performance_dict = {
        'Model_Name' : gs_dt.best_estimator_,
        'Train_RMSE' : round(float(np.sqrt(mean_squared_error(y_train, y_train_predicted))), 2),
        'Test_RMSE'  : round(float(np.sqrt(mean_squared_error(y_test, y_test_predicted))), 2)
    }

In [13]:
# Wrap the metrics dict in a one-row DataFrame and display it as the cell output.
performance = pd.DataFrame([performance_dict])
performance

Unnamed: 0,Model_Name,Train_RMSE,Test_RMSE
0,"DecisionTreeRegressor(max_depth=6, random_stat...",988.61,2462.68


# How did the model perform?