In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.tree import _tree
from sklearn.tree._tree import TREE_LEAF


In [2]:
# Load the housing dataset from a relative CSV path into a DataFrame.
housing_data = pd.read_csv("data/housing.csv")

In [3]:
# Preview the first rows of the freshly loaded frame.
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Column dtypes and non-null counts; used to spot columns with missing values.
housing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
# Impute missing total_bedrooms with the column mean. Series.mean() skips NaN in
# both the numerator and the denominator; the previous sum()/len() divided the
# NaN-skipping sum by the full row count (NaN rows included), which biased the
# fill value low.
housing_data['total_bedrooms'] = housing_data['total_bedrooms'].fillna(
    housing_data['total_bedrooms'].mean()
)

# Drop every row whose target equals the column maximum (presumably a top-coded
# cap -- TODO confirm against the data description).
# NOTE: not idempotent -- re-running this cell strips the new maximum as well.
housing_data = housing_data[
    housing_data['median_house_value'] < housing_data['median_house_value'].max()
]


In [6]:
# One-hot encode the categorical `ocean_proximity` column.
housing_cat = housing_data['ocean_proximity']

# factorize() returns integer codes plus the category labels in order of first
# appearance, so housing_categories[i] is the label for code i.
housing_cat_encoded, housing_categories = housing_cat.factorize()

encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot = housing_cat_1hot.toarray()

# Label the indicator columns from the labels factorize() actually produced
# instead of a hard-coded {code: label} mapping, which would silently mislabel
# columns if the first-appearance order of the categories ever changed.
ocean_proximity_cat_1hot = pd.DataFrame(housing_cat_1hot, columns=housing_categories)

# Rows were filtered earlier, so realign to a positional index before the
# column-wise concat; otherwise concat would align on mismatched labels.
housing_data = housing_data.reset_index(drop=True)
housing_data = pd.concat([housing_data, ocean_proximity_cat_1hot], axis=1, sort=False)
housing_data = housing_data.drop('ocean_proximity', axis=1)


In [7]:
# Check the frame after encoding: `ocean_proximity` is replaced by indicator columns.
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,NEAR BAY,<1H OCEAN,INLAND,NEAR OCEAN,ISLAND
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,1.0,0.0,0.0,0.0,0.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,1.0,0.0,0.0,0.0,0.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,1.0,0.0,0.0,0.0,0.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,1.0,0.0,0.0,0.0,0.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,1.0,0.0,0.0,0.0,0.0


In [8]:
class Tree:
    """Fit-and-score helper around ``DecisionTreeRegressor``.

    Holds one training set and one evaluation set. Every call to
    ``fit_tree`` / ``predict_tree`` / ``evaluate_tree`` fits a brand-new
    regressor at the requested depth; nothing is cached between calls.

    Parameters
    ----------
    X_train, y_train
        Features and target the regressor is fitted on.
    X_test, y_test
        Features and target the fitted regressor is scored on.
    random_state : optional
        Forwarded to ``DecisionTreeRegressor`` so repeated fits can be made
        reproducible. Defaults to ``None`` (the estimator's own default).
    """

    def __init__(self, X_train, y_train, X_test, y_test, random_state=None):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.random_state = random_state

    def fit_tree(self, depth):
        """Return a new regressor of ``max_depth=depth`` fitted on the training data."""
        dTree = DecisionTreeRegressor(max_depth=depth, random_state=self.random_state)
        return dTree.fit(self.X_train, self.y_train)

    def predict_tree(self, depth):
        """Fit a tree of the given depth and return its predictions for ``X_test``."""
        return self.fit_tree(depth).predict(self.X_test)

    def evaluate_tree(self, depth):
        """Fit a tree of the given depth and return its RMSE against ``y_test``."""
        mse = mean_squared_error(self.y_test, self.predict_tree(depth))
        return np.sqrt(mse)
        
        
        
        

In [9]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
housing_train, housing_test = train_test_split(
    housing_data,
    test_size=0.33,
    random_state=11,
)


In [10]:
# Inspect the training target; the index keeps the shuffled row labels from the split.
housing_train['median_house_value']

11445    117300.0
7646     200200.0
15823    121500.0
3280      77300.0
19618    137500.0
943      315600.0
10228    317900.0
5424     248100.0
12193     65900.0
11073    215600.0
4119     185400.0
2629     135600.0
10034    336700.0
8084     142500.0
15218    275500.0
2396      54700.0
14358    131000.0
7327     167000.0
10312    177500.0
11826     68200.0
18007    105900.0
13710    156300.0
771      173600.0
4415     134900.0
12411    244300.0
6291     157500.0
7111     161800.0
6961     149300.0
8143     241400.0
1393     182000.0
           ...   
11527     95300.0
19598    132500.0
13557    111300.0
4908     101000.0
19545     74800.0
3095      86500.0
11968     67000.0
17778    114200.0
1350     165900.0
4753     350000.0
8600     137500.0
9307     200000.0
19537    325000.0
7636     345700.0
17639    171300.0
15397    300000.0
8332     427200.0
6690     167900.0
13548    118300.0
3885     196700.0
15776    200000.0
7933     106300.0
5724     192600.0
6765     179800.0
16967    3

In [11]:
def _train_and_test_trees(feature_columns):
    """Build the (scored-on-train, scored-on-test) pair of ``Tree`` wrappers.

    Both wrappers are fitted on ``housing_train``; they differ only in the rows
    they are evaluated on (the training rows vs. the held-out ``housing_test``
    rows). The target is always ``median_house_value``.

    Parameters
    ----------
    feature_columns : list of column labels
        Predictor columns to select from both splits.

    Returns
    -------
    tuple of (Tree, Tree)
        ``(on_train, on_test)``.
    """
    X_train = housing_train[feature_columns]
    y_train = housing_train[['median_house_value']]
    on_train = Tree(X_train, y_train, X_train, y_train)
    on_test = Tree(X_train, y_train,
                   housing_test[feature_columns],
                   housing_test[['median_house_value']])
    return on_train, on_test


# Latitude + income only.
lat_income_train, lat_income_test = _train_and_test_trees(
    ['latitude', 'median_income'])

# Full location (latitude + longitude) + income.
loc_income_train, loc_income_test = _train_and_test_trees(
    ['latitude', 'longitude', 'median_income'])

# Every column except the target.
all_train, all_test = _train_and_test_trees(
    list(housing_train.columns.drop('median_house_value')))




In [12]:
#lat_income_results=pd.DataFrame()
#train_results = []
#test_results = []
#for i in range (1,55):
#    train_results.append(lat_income_train.evaluate_tree(i))
#    test_results.append(lat_income_test.evaluate_tree(i))
#
#lat_income_results['Train']=(train_results)
#lat_income_results['Test']=(test_results)
#lat_income_results.plot();

In [13]:
#loc_income_results=pd.DataFrame()
#train_results = []
#test_results = []
#for i in range (1,55):
#    train_results.append(loc_income_train.evaluate_tree(i))
#    test_results.append(loc_income_test.evaluate_tree(i))
#
#loc_income_results['Train']=(train_results)
#loc_income_results['Test']=(test_results)
#loc_income_results.plot();

In [14]:
#all_results=pd.DataFrame()
#train_results = []
#test_results = []
#for i in range (1,100):
#    train_results.append(all_train.evaluate_tree(i))
#    test_results.append(all_test.evaluate_tree(i))
#
#all_results['Train']=(train_results)
#all_results['Test']=(test_results)
#all_results.plot();

## Let's prune this tree.

In [31]:
# Fit a shallow (depth-2) regression tree on median_income alone so the
# node structure is small enough to inspect by hand.
my_tree = DecisionTreeRegressor(max_depth=2).fit(
    housing_train[['median_income']],
    housing_train[['median_house_value']],
)

# Handles onto the fitted tree's low-level arrays, indexed by node id.
children_left = my_tree.tree_.children_left
children_right = my_tree.tree_.children_right
n_nodes = my_tree.tree_.node_count

# Number of training samples that reached each node.
my_tree.tree_.n_node_samples

array([13182,  8767,  4399,  4368,  4415,  3114,  1301])

In [32]:
# Left-child node id for every node; -1 means the node has no left child.
children_left

array([ 1,  2, -1, -1,  5, -1, -1])

In [24]:
# Right-child node id for every node; -1 means the node has no right child.
children_right

array([ 4,  3, -1, -1,  6, -1, -1])

In [25]:
# Node id that each training row lands in, as a one-column frame with a
# fresh positional index (the column is labelled 0).
tree_frame = pd.DataFrame(
    my_tree.apply(housing_train[['median_income']])
)
tree_frame

Unnamed: 0,0
0,3
1,3
2,3
3,2
4,3
5,5
6,5
7,2
8,2
9,3


In [26]:
# Target values re-indexed positionally so they line up row-for-row with
# tree_frame (which was built from an array and has a 0..n-1 index).
# reset_index returns a new frame, so nothing is mutated in place.
actual_value = housing_train[['median_house_value']].reset_index(drop=True)

# Select only the first (node-id) column before concatenating so that
# re-running this cell does not keep appending duplicate target columns.
tree_frame = pd.concat([tree_frame.iloc[:, [0]], actual_value], axis=1)
tree_frame

Unnamed: 0,0,median_house_value
0,3,117300.0
1,3,200200.0
2,3,121500.0
3,2,77300.0
4,3,137500.0
5,5,315600.0
6,5,317900.0
7,2,248100.0
8,2,65900.0
9,3,215600.0


In [27]:
# Rows per node id. Uses the Series method because the top-level
# pd.value_counts function is deprecated in recent pandas releases.
tree_frame.iloc[:, 0].value_counts()

2    4399
3    4368
5    3114
6    1301
Name: 0, dtype: int64

In [28]:
# (rows, columns) of the training split, for comparison with the per-node counts.
housing_train.shape

(13182, 14)

In [29]:
# Plain (unweighted) mean over every entry of the tree's per-node value array.
# NOTE(review): this averages all nodes equally regardless of how many samples
# reached them, and `c` is not used in the cells visible here -- confirm intent.
c = my_tree.tree_.value.mean()


In [30]:
# Per-node value array of the fitted tree, one entry per node id.
my_tree.tree_.value

array([[[192345.15976331]],

       [[156719.56165165]],

       [[128038.23528075]],

       [[185604.44139194]],

       [[263087.99524349]],

       [[236452.66506101]],

       [[326840.81475788]]])