In [1]:
#libraries
import pandas as pd
import numpy as np

In [2]:
#Import data

DF = pd.read_json("../data/Rental_Data/rentalTrend.json")

DF.head()

Unnamed: 0,FSA,bathrooms,bedrooms,description,furnished,id,image,lat,long,pet_friendly,post_published_date,postal_code,price,rental_type,source,sqft,title,url
0,M5B,0,1,1BR / 0Ba furnished apartment/ 1br -Brand new ...,True,c_7195819164,,43.6572,-79.3783,False,2020-09-21,M5B 1Y2,995,apartment,craigslist,,1bedroom+1den near Eaton Center,https://toronto.craigslist.org/tor/apa/d/toron...
1,M5R,0,1,1BR / 0Ba furnished apartment/ 1br -Spectacula...,True,c_7195700072,,43.6736,-79.4035,False,2020-09-21,M5R 2R8,1220,condo,craigslist,,Top Floor Condo For Rent,https://toronto.craigslist.org/tor/apa/d/centr...
2,M6H,2,2,2BR / 2Ba 1100ft2 available nov 1 cats are OK ...,,c_7200564066,https://images.craigslist.org/00202_iyRnxvT5sc...,43.6655,-79.4378,True,2020-09-21,M6H 4B9,3000,townhouse,craigslist,1100.0,Contemporary 2+1 bedroom townhouse w/ parking,https://toronto.craigslist.org/tor/apa/d/west-...
3,M5V,2,2,2BR / 2Ba available nov 15 loft w/d in unit at...,,c_7196251096,https://images.craigslist.org/00O0O_fAJia8pnVQ...,43.647329,-79.395794,False,2020-09-21,M5V,4500,loft,craigslist,,RICHMOND/SPADINA-2 BED/2 BATH/1 PARKING-CAMDEN ST,https://toronto.craigslist.org/tor/apa/d/toron...
4,M6G,0,1,1BR / 0Ba furnished apartment/ 1br -Newly reno...,True,c_7195818373,,43.6683,-79.4205,False,2020-09-21,M6G 3B4,977,apartment,craigslist,,1BDR Downtown Toronto apartment unit for rent,https://toronto.craigslist.org/tor/apa/d/downt...


In [3]:
DF.to_csv("../data/Rental_Data/rentalTrend.csv")

In [4]:
df = pd.read_csv("../data/Rental_Data/rentalTrend.csv")

In [5]:
#view missing data
df.isna().sum()

Unnamed: 0                0
FSA                      24
bathrooms               289
bedrooms                301
description               1
furnished              7982
id                        0
image                   670
lat                       0
long                      0
pet_friendly              0
post_published_date       0
postal_code              31
price                     0
rental_type               1
source                    0
sqft                   5149
title                     0
url                       0
dtype: int64

In [6]:
#Replace all white spaces or nothing at all to NaN
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
#Replace None with NaN
df = df.fillna(value=np.nan)

In [7]:
#update formatting

#int
df['price'] = df['price'].astype('int')

#Coding/blanks
df['furnished'] = df['furnished'].fillna(value="NOT_MENTIONED")
df['furnished'] = df['furnished'].replace(to_replace=True, value='YES')
df['furnished'] = df['furnished'].replace(to_replace=False, value='NO')
df['image'] = df['image'].notna()

df = df[(df['price']<=10000) & (df['price']>=100)]
df.drop(df[(df['sqft']<200) | (df['sqft']>3000)].index, axis=0, inplace=True)

In [8]:
#Convert post_published_date to week of the month
from datetime import datetime
df['post_published_date'] = df['post_published_date'].map(lambda x: datetime.strptime(x, '%Y-%m-%d'))
#New features
df['posted_week_of_month'] = df['post_published_date'].map(lambda x: x.day//7 +1)

In [9]:
#view no. blanks
df.isna().sum()

Unnamed: 0                 0
FSA                       23
bathrooms                281
bedrooms                 294
description                1
furnished                  0
id                         0
image                      0
lat                        0
long                       0
pet_friendly               0
post_published_date        0
postal_code               30
price                      0
rental_type                0
source                     0
sqft                    5099
title                      0
url                        0
posted_week_of_month       0
dtype: int64

In [10]:
#drop uniformative variables
df = df.drop(['description', 'title', 'url', 'source', 'postal_code', "Unnamed: 0"], axis=1)

In [11]:
#drop nulls of mandatory columns and clean
df.drop(df[df[['FSA', 'rental_type', 'bedrooms']].isna().any(axis=1)].index, inplace=True)

df = df[df['FSA'].str.startswith('M')]

In [12]:
df.isna().sum()

FSA                        0
bathrooms                 22
bedrooms                   0
furnished                  0
id                         0
image                      0
lat                        0
long                       0
pet_friendly               0
post_published_date        0
price                      0
rental_type                0
sqft                    4771
posted_week_of_month       0
dtype: int64

In [13]:
df.reset_index(inplace=True)

df = df.drop("index", axis=1)

In [14]:
df.head()

Unnamed: 0,FSA,bathrooms,bedrooms,furnished,id,image,lat,long,pet_friendly,post_published_date,price,rental_type,sqft,posted_week_of_month
0,M5B,0.0,1.0,YES,c_7195819164,False,43.6572,-79.3783,False,2020-09-21,995,apartment,,4
1,M5R,0.0,1.0,YES,c_7195700072,False,43.6736,-79.4035,False,2020-09-21,1220,condo,,4
2,M6H,2.0,2.0,NOT_MENTIONED,c_7200564066,True,43.6655,-79.4378,True,2020-09-21,3000,townhouse,1100.0,4
3,M5V,2.0,2.0,NOT_MENTIONED,c_7196251096,True,43.647329,-79.395794,False,2020-09-21,4500,loft,,4
4,M6G,0.0,1.0,YES,c_7195818373,False,43.6683,-79.4205,False,2020-09-21,977,apartment,,4


In [16]:
cat_columns = ['FSA', 'image', 'rental_type', 'furnished', 'pet_friendly']
num_columns = ['sqft', 'bedrooms', 'bathrooms', 'posted_week_of_month']
X= df[num_columns + cat_columns]
y=df['price']

In [17]:
#convert cat variables into boolean
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
##OHE
enc = OneHotEncoder()
##OHE Fit
enc.fit(X[cat_columns])
##OHE Transform
ohe_output = enc.transform(X[cat_columns]).toarray()
##Retrieve OHE labels
ohe_dict = {f'x{index}':col for index,col in enumerate(cat_columns)}
ohe_labels = [ohe_dict[feature.split('_')[0]]+'_'+feature.split('_')[1] for feature in enc.get_feature_names()]
X = pd.concat([X, pd.DataFrame(ohe_output, columns=ohe_labels)], axis=1)
X.drop(cat_columns, axis=1, inplace=True)

In [18]:
X

Unnamed: 0,sqft,bedrooms,bathrooms,posted_week_of_month,FSA_M1B,FSA_M1C,FSA_M1E,FSA_M1G,FSA_M1H,FSA_M1J,...,rental_type_house,rental_type_land,rental_type_loft,rental_type_suite,rental_type_townhouse,furnished_NO,furnished_NOT,furnished_YES,pet_friendly_False,pet_friendly_True
0,,1.0,0.0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,,1.0,0.0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,1100.0,2.0,2.0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,,2.0,2.0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,,1.0,0.0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10902,,2.0,1.0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
10903,,1.0,1.0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
10904,,2.0,1.0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
10905,,3.0,1.0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [19]:
#normalize data

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)
df_scaled

Unnamed: 0,sqft,bedrooms,bathrooms,posted_week_of_month,FSA_M1B,FSA_M1C,FSA_M1E,FSA_M1G,FSA_M1H,FSA_M1J,...,rental_type_house,rental_type_land,rental_type_loft,rental_type_suite,rental_type_townhouse,furnished_NO,furnished_NOT,furnished_YES,pet_friendly_False,pet_friendly_True
0,,0.125,0.000000,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,,0.125,0.000000,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.321429,0.250,0.235294,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,,0.250,0.235294,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,,0.125,0.000000,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10902,,0.250,0.117647,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
10903,,0.125,0.117647,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
10904,,0.250,0.117647,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
10905,,0.375,0.117647,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [20]:
from sklearn.impute import KNNImputer

In [295]:
#convert cat columns to boolean
#cat_columns = ['image', 'rental_type', 'furnished', 'pet_friendly']

#cat_dummies = pd.get_dummies(cat_columns, drop_first=True)
#cat_dummies.head()

In [296]:
#df_hot = df.drop(['image', 'FSA', 'rental_type', 'furnished', 'pet_friendly'], axis=1)
#df_hot = pd.concat([df_hot, cat_dummies], axis=1)
#df_hot.head()

In [297]:
#df_hot.drop(columns=["id"], inplace=True)

In [298]:
#df_hot.drop(columns=["post_published_date"], inplace=True)

In [21]:
#impute missing non-mandatory values
imputer = KNNImputer(n_neighbors=3)
X = pd.DataFrame(imputer.fit_transform(df_scaled),columns = df_scaled.columns)

In [22]:
X.head()

Unnamed: 0,sqft,bedrooms,bathrooms,posted_week_of_month,FSA_M1B,FSA_M1C,FSA_M1E,FSA_M1G,FSA_M1H,FSA_M1J,...,rental_type_house,rental_type_land,rental_type_loft,rental_type_suite,rental_type_townhouse,furnished_NO,furnished_NOT,furnished_YES,pet_friendly_False,pet_friendly_True
0,0.163929,0.125,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.229167,0.125,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.321429,0.25,0.235294,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,0.274881,0.25,0.235294,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.129762,0.125,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [402]:
#verify
X.isna().sum().sum()

0

In [23]:
#setup ML modelling
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [24]:
#split into train vs not train
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# scan for all variables 
All = DecisionTreeRegressor()
All = All.fit(X_train, y_train)
All.score(X_train, y_train)
All.score(X_test, y_test)


0.5145443376843262

In [25]:
top_35 = ['sqft', 'bedrooms', 'bathrooms', 'image_False', 'FSA_M1B', 'FSA_M1M',
       'FSA_M1P', 'FSA_M1V', 'FSA_M1W', 'FSA_M2M', 'FSA_M3C', 'FSA_M3K',
       'FSA_M4E', 'FSA_M4V', 'FSA_M4W', 'FSA_M5G', 'FSA_M5J', 'FSA_M5R',
       'FSA_M5S', 'FSA_M5V', 'FSA_M6B', 'FSA_M6E', 'FSA_M6G', 'FSA_M6J',
       'FSA_M6K', 'FSA_M6M', 'FSA_M6P', 'rental_type_apartment',
       'rental_type_condo', 'rental_type_house', 'rental_type_loft',
       'rental_type_townhouse', 'furnished_NOT', 'furnished_YES',
       'pet_friendly_False']

In [26]:
X = X[['sqft', 'bedrooms', 'bathrooms', 'image_False', 'FSA_M1B', 'FSA_M1M',
       'FSA_M1P', 'FSA_M1V', 'FSA_M1W', 'FSA_M2M', 'FSA_M3C', 'FSA_M3K',
       'FSA_M4E', 'FSA_M4V', 'FSA_M4W', 'FSA_M5G', 'FSA_M5J', 'FSA_M5R',
       'FSA_M5S', 'FSA_M5V', 'FSA_M6B', 'FSA_M6E', 'FSA_M6G', 'FSA_M6J',
       'FSA_M6K', 'FSA_M6M', 'FSA_M6P', 'rental_type_apartment',
       'rental_type_condo', 'rental_type_house', 'rental_type_loft',
       'rental_type_townhouse', 'furnished_NOT', 'furnished_YES',
       'pet_friendly_False']]

In [27]:
X.head()

Unnamed: 0,sqft,bedrooms,bathrooms,image_False,FSA_M1B,FSA_M1M,FSA_M1P,FSA_M1V,FSA_M1W,FSA_M2M,...,FSA_M6M,FSA_M6P,rental_type_apartment,rental_type_condo,rental_type_house,rental_type_loft,rental_type_townhouse,furnished_NOT,furnished_YES,pet_friendly_False
0,0.163929,0.125,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,0.229167,0.125,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
2,0.321429,0.25,0.235294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,0.274881,0.25,0.235294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,0.129762,0.125,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [28]:
#split into train vs not train
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=32)

# scan accuracy with top 35 variables/no parameter optimization
topNoOpt = DecisionTreeRegressor()
topNoOpt = topNoOpt.fit(X_train, y_train)
topNoOpt.score(X_train, y_train)

0.9517944694281891

In [30]:
topNoOpt.score(X_test, y_test)

0.5548327758565973

In [34]:
#setup gridsearch 
from sklearn.model_selection import GridSearchCV
params = {'max_leaf_nodes': [190, 200, 205, 210, 215, 220], 'min_samples_split': [85, 90, 95, 100], 'max_depth': [30, 35, 40, 45, 50]}

grid_search_cv = GridSearchCV(DecisionTreeRegressor(random_state=99), params, cv=5, scoring = 'r2', verbose=5)

In [35]:
#scan for best parameters
grid_search_cv.fit(X_train, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] max_depth=30, max_leaf_nodes=190, min_samples_split=85 ..........
[CV]  max_depth=30, max_leaf_nodes=190, min_samples_split=85, score=0.529, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=190, min_samples_split=85 ..........
[CV]  max_depth=30, max_leaf_nodes=190, min_samples_split=85, score=0.620, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=190, min_samples_split=85 ..........
[CV]  max_depth=30, max_leaf_nodes=190, min_samples_split=85, score=0.433, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=190, min_samples_split=85 ..........
[CV]  max_depth=30, max_leaf_nodes=190, min_samples_split=85, score=0.517, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=190, min_samples_split=85 ..........
[CV]  max_depth=30, max_leaf_nodes=190, min_samples_split=85, score=0.609, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=190, min_samples_split=90 ..........
[CV]  max_depth=30, max_leaf_nodes=190, min_samples_split=90, scor

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s


[CV]  max_depth=30, max_leaf_nodes=190, min_samples_split=90, score=0.437, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=190, min_samples_split=90 ..........
[CV]  max_depth=30, max_leaf_nodes=190, min_samples_split=90, score=0.521, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=190, min_samples_split=90 ..........
[CV]  max_depth=30, max_leaf_nodes=190, min_samples_split=90, score=0.607, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=190, min_samples_split=95 ..........
[CV]  max_depth=30, max_leaf_nodes=190, min_samples_split=95, score=0.524, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=190, min_samples_split=95 ..........
[CV]  max_depth=30, max_leaf_nodes=190, min_samples_split=95, score=0.612, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=190, min_samples_split=95 ..........
[CV]  max_depth=30, max_leaf_nodes=190, min_samples_split=95, score=0.437, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=190, min_samples_split=95 ..........
[CV]  max_depth=30, max_leaf_nodes=190, 

[CV]  max_depth=30, max_leaf_nodes=205, min_samples_split=100, score=0.605, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=210, min_samples_split=85 ..........
[CV]  max_depth=30, max_leaf_nodes=210, min_samples_split=85, score=0.530, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=210, min_samples_split=85 ..........
[CV]  max_depth=30, max_leaf_nodes=210, min_samples_split=85, score=0.621, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=210, min_samples_split=85 ..........
[CV]  max_depth=30, max_leaf_nodes=210, min_samples_split=85, score=0.433, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=210, min_samples_split=85 ..........
[CV]  max_depth=30, max_leaf_nodes=210, min_samples_split=85, score=0.519, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=210, min_samples_split=85 ..........
[CV]  max_depth=30, max_leaf_nodes=210, min_samples_split=85, score=0.610, total=   0.0s
[CV] max_depth=30, max_leaf_nodes=210, min_samples_split=90 ..........
[CV]  max_depth=30, max_leaf_nodes=210,

[CV]  max_depth=30, max_leaf_nodes=220, min_samples_split=100, score=0.605, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=190, min_samples_split=85 ..........
[CV]  max_depth=35, max_leaf_nodes=190, min_samples_split=85, score=0.529, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=190, min_samples_split=85 ..........
[CV]  max_depth=35, max_leaf_nodes=190, min_samples_split=85, score=0.620, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=190, min_samples_split=85 ..........
[CV]  max_depth=35, max_leaf_nodes=190, min_samples_split=85, score=0.433, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=190, min_samples_split=85 ..........
[CV]  max_depth=35, max_leaf_nodes=190, min_samples_split=85, score=0.517, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=190, min_samples_split=85 ..........
[CV]  max_depth=35, max_leaf_nodes=190, min_samples_split=85, score=0.609, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=190, min_samples_split=90 ..........
[CV]  max_depth=35, max_leaf_nodes=190,

[CV]  max_depth=35, max_leaf_nodes=205, min_samples_split=95, score=0.610, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=205, min_samples_split=95 ..........
[CV]  max_depth=35, max_leaf_nodes=205, min_samples_split=95, score=0.437, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=205, min_samples_split=95 ..........
[CV]  max_depth=35, max_leaf_nodes=205, min_samples_split=95, score=0.523, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=205, min_samples_split=95 ..........
[CV]  max_depth=35, max_leaf_nodes=205, min_samples_split=95, score=0.605, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=205, min_samples_split=100 .........
[CV]  max_depth=35, max_leaf_nodes=205, min_samples_split=100, score=0.524, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=205, min_samples_split=100 .........
[CV]  max_depth=35, max_leaf_nodes=205, min_samples_split=100, score=0.616, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=205, min_samples_split=100 .........
[CV]  max_depth=35, max_leaf_nodes=205

[CV]  max_depth=35, max_leaf_nodes=220, min_samples_split=95, score=0.524, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=220, min_samples_split=95 ..........
[CV]  max_depth=35, max_leaf_nodes=220, min_samples_split=95, score=0.610, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=220, min_samples_split=95 ..........
[CV]  max_depth=35, max_leaf_nodes=220, min_samples_split=95, score=0.437, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=220, min_samples_split=95 ..........
[CV]  max_depth=35, max_leaf_nodes=220, min_samples_split=95, score=0.523, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=220, min_samples_split=95 ..........
[CV]  max_depth=35, max_leaf_nodes=220, min_samples_split=95, score=0.605, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=220, min_samples_split=100 .........
[CV]  max_depth=35, max_leaf_nodes=220, min_samples_split=100, score=0.524, total=   0.0s
[CV] max_depth=35, max_leaf_nodes=220, min_samples_split=100 .........
[CV]  max_depth=35, max_leaf_nodes=220,

[CV]  max_depth=40, max_leaf_nodes=205, min_samples_split=85, score=0.621, total=   0.0s
[CV] max_depth=40, max_leaf_nodes=205, min_samples_split=85 ..........
[CV]  max_depth=40, max_leaf_nodes=205, min_samples_split=85, score=0.433, total=   0.0s
[CV] max_depth=40, max_leaf_nodes=205, min_samples_split=85 ..........
[CV]  max_depth=40, max_leaf_nodes=205, min_samples_split=85, score=0.518, total=   0.0s
[CV] max_depth=40, max_leaf_nodes=205, min_samples_split=85 ..........
[CV]  max_depth=40, max_leaf_nodes=205, min_samples_split=85, score=0.610, total=   0.0s
[CV] max_depth=40, max_leaf_nodes=205, min_samples_split=90 ..........
[CV]  max_depth=40, max_leaf_nodes=205, min_samples_split=90, score=0.527, total=   0.0s
[CV] max_depth=40, max_leaf_nodes=205, min_samples_split=90 ..........
[CV]  max_depth=40, max_leaf_nodes=205, min_samples_split=90, score=0.621, total=   0.0s
[CV] max_depth=40, max_leaf_nodes=205, min_samples_split=90 ..........
[CV]  max_depth=40, max_leaf_nodes=205, 

[CV]  max_depth=40, max_leaf_nodes=220, min_samples_split=85, score=0.530, total=   0.0s
[CV] max_depth=40, max_leaf_nodes=220, min_samples_split=85 ..........
[CV]  max_depth=40, max_leaf_nodes=220, min_samples_split=85, score=0.621, total=   0.0s
[CV] max_depth=40, max_leaf_nodes=220, min_samples_split=85 ..........
[CV]  max_depth=40, max_leaf_nodes=220, min_samples_split=85, score=0.433, total=   0.0s
[CV] max_depth=40, max_leaf_nodes=220, min_samples_split=85 ..........
[CV]  max_depth=40, max_leaf_nodes=220, min_samples_split=85, score=0.518, total=   0.0s
[CV] max_depth=40, max_leaf_nodes=220, min_samples_split=85 ..........
[CV]  max_depth=40, max_leaf_nodes=220, min_samples_split=85, score=0.609, total=   0.0s
[CV] max_depth=40, max_leaf_nodes=220, min_samples_split=90 ..........
[CV]  max_depth=40, max_leaf_nodes=220, min_samples_split=90, score=0.527, total=   0.0s
[CV] max_depth=40, max_leaf_nodes=220, min_samples_split=90 ..........
[CV]  max_depth=40, max_leaf_nodes=220, 

[CV]  max_depth=45, max_leaf_nodes=200, min_samples_split=100, score=0.616, total=   0.0s
[CV] max_depth=45, max_leaf_nodes=200, min_samples_split=100 .........
[CV]  max_depth=45, max_leaf_nodes=200, min_samples_split=100, score=0.435, total=   0.0s
[CV] max_depth=45, max_leaf_nodes=200, min_samples_split=100 .........
[CV]  max_depth=45, max_leaf_nodes=200, min_samples_split=100, score=0.528, total=   0.0s
[CV] max_depth=45, max_leaf_nodes=200, min_samples_split=100 .........
[CV]  max_depth=45, max_leaf_nodes=200, min_samples_split=100, score=0.605, total=   0.0s
[CV] max_depth=45, max_leaf_nodes=205, min_samples_split=85 ..........
[CV]  max_depth=45, max_leaf_nodes=205, min_samples_split=85, score=0.530, total=   0.0s
[CV] max_depth=45, max_leaf_nodes=205, min_samples_split=85 ..........
[CV]  max_depth=45, max_leaf_nodes=205, min_samples_split=85, score=0.621, total=   0.0s
[CV] max_depth=45, max_leaf_nodes=205, min_samples_split=85 ..........
[CV]  max_depth=45, max_leaf_nodes=2

[CV]  max_depth=45, max_leaf_nodes=215, min_samples_split=95, score=0.523, total=   0.0s
[CV] max_depth=45, max_leaf_nodes=215, min_samples_split=95 ..........
[CV]  max_depth=45, max_leaf_nodes=215, min_samples_split=95, score=0.605, total=   0.0s
[CV] max_depth=45, max_leaf_nodes=215, min_samples_split=100 .........
[CV]  max_depth=45, max_leaf_nodes=215, min_samples_split=100, score=0.524, total=   0.0s
[CV] max_depth=45, max_leaf_nodes=215, min_samples_split=100 .........
[CV]  max_depth=45, max_leaf_nodes=215, min_samples_split=100, score=0.615, total=   0.0s
[CV] max_depth=45, max_leaf_nodes=215, min_samples_split=100 .........
[CV]  max_depth=45, max_leaf_nodes=215, min_samples_split=100, score=0.435, total=   0.0s
[CV] max_depth=45, max_leaf_nodes=215, min_samples_split=100 .........
[CV]  max_depth=45, max_leaf_nodes=215, min_samples_split=100, score=0.528, total=   0.0s
[CV] max_depth=45, max_leaf_nodes=215, min_samples_split=100 .........
[CV]  max_depth=45, max_leaf_nodes=2

[CV]  max_depth=50, max_leaf_nodes=200, min_samples_split=90, score=0.438, total=   0.0s
[CV] max_depth=50, max_leaf_nodes=200, min_samples_split=90 ..........
[CV]  max_depth=50, max_leaf_nodes=200, min_samples_split=90, score=0.522, total=   0.0s
[CV] max_depth=50, max_leaf_nodes=200, min_samples_split=90 ..........
[CV]  max_depth=50, max_leaf_nodes=200, min_samples_split=90, score=0.607, total=   0.0s
[CV] max_depth=50, max_leaf_nodes=200, min_samples_split=95 ..........
[CV]  max_depth=50, max_leaf_nodes=200, min_samples_split=95, score=0.524, total=   0.0s
[CV] max_depth=50, max_leaf_nodes=200, min_samples_split=95 ..........
[CV]  max_depth=50, max_leaf_nodes=200, min_samples_split=95, score=0.611, total=   0.0s
[CV] max_depth=50, max_leaf_nodes=200, min_samples_split=95 ..........
[CV]  max_depth=50, max_leaf_nodes=200, min_samples_split=95, score=0.437, total=   0.0s
[CV] max_depth=50, max_leaf_nodes=200, min_samples_split=95 ..........
[CV]  max_depth=50, max_leaf_nodes=200, 

[CV]  max_depth=50, max_leaf_nodes=215, min_samples_split=85, score=0.610, total=   0.0s
[CV] max_depth=50, max_leaf_nodes=215, min_samples_split=90 ..........
[CV]  max_depth=50, max_leaf_nodes=215, min_samples_split=90, score=0.527, total=   0.0s
[CV] max_depth=50, max_leaf_nodes=215, min_samples_split=90 ..........
[CV]  max_depth=50, max_leaf_nodes=215, min_samples_split=90, score=0.620, total=   0.0s
[CV] max_depth=50, max_leaf_nodes=215, min_samples_split=90 ..........
[CV]  max_depth=50, max_leaf_nodes=215, min_samples_split=90, score=0.438, total=   0.0s
[CV] max_depth=50, max_leaf_nodes=215, min_samples_split=90 ..........
[CV]  max_depth=50, max_leaf_nodes=215, min_samples_split=90, score=0.522, total=   0.0s
[CV] max_depth=50, max_leaf_nodes=215, min_samples_split=90 ..........
[CV]  max_depth=50, max_leaf_nodes=215, min_samples_split=90, score=0.607, total=   0.0s
[CV] max_depth=50, max_leaf_nodes=215, min_samples_split=95 ..........
[CV]  max_depth=50, max_leaf_nodes=215, 

[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed:   15.3s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
                                             max_depth=None, max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort='deprecated',
                                             random_state=99, splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [30, 35, 40, 45, 50],
                         'max_leaf_nodes': [190, 200, 205, 210, 215, 220],
                         'min_samples_split': [85, 90, 95, 100]},
 

In [36]:
grid_search_cv.best_estimator_

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=35,
                      max_features=None, max_leaf_nodes=200,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=90,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=99, splitter='best')

In [37]:
#split into train vs not train
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=56)

# scan accuracy with top 35 variables with parameter optimization
topOpt = DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=35,
                      max_features=None, max_leaf_nodes=200,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=90,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=99, splitter='best')

topOpt = topOpt.fit(X_train, y_train)
topOpt.score(X_train, y_train)

0.674828265144982

In [38]:
topOpt.score(X_test, y_test)

0.6240732330279787