## Decision Tree Regressor

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from sklearn.datasets import fetch_california_housing

In [4]:
# Fetch the California housing dataset through scikit-learn's dataset loader.
calf=fetch_california_housing()

In [5]:
# Print the dataset's own description text (attributes, target units, provenance).
print(calf.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [6]:
# Show the name(s) of the target column.
calf.target_names

['MedHouseVal']

In [7]:
# Wrap the raw feature matrix in a DataFrame so every column carries its feature name.
x = pd.DataFrame(
    data=calf.data,
    columns=calf.feature_names,
)

In [8]:
# Preview the first rows of the feature frame.
x.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [9]:
# Target vector: median house value, expressed in units of $100,000
# (per the dataset description printed above).
y=calf.target

In [10]:
# Display the target array.
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [11]:
## train test split
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [12]:
## Decision tree regressor
from sklearn.tree import DecisionTreeRegressor

In [13]:
# Baseline tree with default hyperparameters (grown without a depth limit).
# random_state is pinned because scikit-learn permutes the candidate features
# at every split even with splitter="best"; without a seed, ties between
# equally good splits can be broken differently from run to run, so the
# reported score would not be reproducible. 42 matches the train/test split seed.
dtr=DecisionTreeRegressor(random_state=42)

In [14]:
# Fit the baseline tree on the training split only.
dtr.fit(x_train,y_train)

In [15]:
# Predict on the held-out test rows with the baseline tree.
y_pred=dtr.predict(x_test)

In [16]:
# Display the baseline predictions.
y_pred

array([0.417  , 0.521  , 5.00001, ..., 1.33   , 1.405  , 5.00001])

from sklearn import tree

# The baseline tree is grown without a depth limit, so it has far too many
# nodes to draw legibly and rendering every one of them is very slow.
# Only the top levels are plotted; deeper subtrees are collapsed by plot_tree.
fig, ax = plt.subplots(figsize=(15,10))
tree.plot_tree(
    dtr,
    max_depth=3,                             # draw the first few levels only
    feature_names=list(x_train.columns),     # label splits by feature name
    filled=True,
    fontsize=8,
    ax=ax,
)
ax.set_title("Decision tree regressor (top 3 levels)")
plt.show()  # also suppresses the long list-of-annotations repr


In [17]:
## This is a regression task, so we use the R² score to check model quality; for a decision tree classifier we would use a confusion matrix or accuracy_score instead.

In [18]:
from sklearn.metrics import r2_score

In [19]:
# R² of the baseline tree on the held-out test split.
score = r2_score(y_true=y_test, y_pred=y_pred)

In [20]:
# Display the baseline test-set R².
score

0.5907317178595219

In [21]:
## Hyperparameter Tuning
# Search space for the grid search: 4 criteria x 2 splitters x 12 depths x
# 3 max_features settings = 288 candidates.
# NOTE: max_features uses None instead of 'auto'. For a regression tree 'auto'
# meant "consider all features", which is exactly what None does; 'auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it makes every
# candidate that uses it fail to fit.
parameter={'criterion':['squared_error','friedman_mse','absolute_error','poisson'],
          'splitter':['best','random'],
          'max_depth':list(range(1,13)),
          'max_features':[None,'sqrt','log2']}
# Base estimator cloned by the grid search for every candidate.
# The grid includes splitter='random' and feature subsampling, both of which
# draw random numbers, so the seed is pinned to make the search repeatable.
reg_tree=DecisionTreeRegressor(random_state=42)

In [22]:
from sklearn.model_selection import GridSearchCV
import warnings
# Silence only deprecation-style FutureWarnings. A blanket 'ignore' would also
# hide the FitFailedWarning that GridSearchCV raises when candidates fail to
# fit, leaving failed fits (scored as NaN) unnoticed.
warnings.filterwarnings('ignore',category=FutureWarning)

In [24]:
# 5-fold cross-validated grid search; candidates are ranked by negative MSE
# (higher is better). n_jobs=-1 spreads the many independent fits across all
# CPU cores; it changes only wall-clock time, not which candidate is selected.
gsc=GridSearchCV(reg_tree,param_grid=parameter,cv=5,scoring='neg_mean_squared_error',n_jobs=-1)

In [None]:
# Run the cross-validated grid search on the training split only, so the
# test split stays untouched for the final evaluation.
gsc.fit(x_train,y_train)

In [None]:
# Predict on the held-out test rows with the fitted grid search, then
# compute R². Note: this overwrites the baseline `y_pred` and `score`.
y_pred = gsc.predict(x_test)
score = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Display the tuned model's test-set R².
score