# Decision tree regressor implementation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from sklearn.datasets import fetch_california_housing

In [5]:
california_df = fetch_california_housing()

In [6]:
california_df.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [7]:
california_df.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [8]:
df = pd.DataFrame(data=california_df.data,columns=california_df.feature_names)

In [9]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [10]:
california_df.target

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [11]:
X=df.iloc[:,:]
y=california_df.target

In [12]:
X.shape,y.shape

((20640, 8), (20640,))

# DecisionTree Regressor

In [19]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error,mean_absolute_error,accuracy_score,r2_score


In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

In [15]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((16512, 8), (4128, 8), (16512,), (4128,))

In [16]:
regressor = DecisionTreeRegressor()

In [17]:
regressor.fit(X_train,y_train)

In [18]:
y_pred = regressor.predict(X_test)

In [20]:
mse = mean_squared_error(y_test,y_pred)

In [21]:
mse

0.498142584722844

In [22]:
score = r2_score(y_pred,y_test)

In [23]:
score

0.625187475941244

In [None]:
plt.figure(figsize=(12,6))
tree.plot_tree(regressor,filled=True)

[Text(0.6465270326523406, 0.9857142857142858, 'x[0] <= 5.086\nsquared_error = 1.337\nsamples = 16512\nvalue = 2.072'),
 Text(0.3670152740686614, 0.9571428571428572, 'x[0] <= 3.074\nsquared_error = 0.834\nsamples = 13101\nvalue = 1.74'),
 Text(0.1636100244743965, 0.9285714285714286, 'x[2] <= 4.314\nsquared_error = 0.561\nsamples = 6268\nvalue = 1.358'),
 Text(0.07451581482734702, 0.9, 'x[0] <= 2.215\nsquared_error = 0.679\nsamples = 2624\nvalue = 1.626'),
 Text(0.0301654196117347, 0.8714285714285714, 'x[2] <= 3.422\nsquared_error = 0.575\nsamples = 1404\nvalue = 1.377'),
 Text(0.011677490485070078, 0.8428571428571429, 'x[3] <= 1.027\nsquared_error = 0.832\nsamples = 482\nvalue = 1.775'),
 Text(0.004218656596390001, 0.8142857142857143, 'x[7] <= -121.825\nsquared_error = 0.459\nsamples = 167\nvalue = 1.436'),
 Text(0.0013492607514136081, 0.7857142857142857, 'x[7] <= -121.96\nsquared_error = 1.416\nsamples = 26\nvalue = 1.777'),
 Text(0.0010748765825427137, 0.7571428571428571, 'x[5] <= 4.2

# using GridSearchCV

In [None]:
regressor = DecisionTreeRegressor()
parameters = {
    'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'splitter':['best','random'],
    'max_depth':[1,2,3,4,5
    
}
clf = GridSearchCV(regressor,param_grid=parameters,scoring='accuracy')

In [None]:
clf.