In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [68]:
df = pd.read_csv("../data/curated/domain_final.csv")

In [69]:
df.loc[df['postcode'] == 3004, 'suburb_sold'] = 596
df.loc[df['postcode'] == 3004, 'suburb_avg_day'] = 147
df.loc[df['postcode'] == 3004, 'suburb_population'] = 6423.0
df.loc[df['postcode'] == 3004, 'suburb_age'] = 20
df.loc[df['postcode'] == 3004, 'suburb_owner'] = 30
df.loc[df['postcode'] == 3004, 'suburb_renter'] = 70
df.loc[df['postcode'] == 3004, 'suburb_family'] = 24
df.loc[df['postcode'] == 3004, 'suburb_single'] = 76

In [70]:
df = df.drop(['Unnamed: 0','uri','name','cost_text','desc','avg_income',
              'coordinates','school_title','SA2_CODE21',
              'school_distance','suburb_age'], axis=1)
df

Unnamed: 0,type,suburb_sold,suburb_avg_day,suburb_population,suburb_owner,suburb_renter,suburb_family,suburb_single,postcode,cost,...,dist_nearest_trainstation,dist_nearest_primary_school,dist_nearest_secondary_school,Beds,Bath,Parking,Median_tot_fam_inc_weekly,Tot_P_P,Median_age_persons,Median_rent_weekly
0,Apartment / Unit / Flat,596.0,147.0,47279.0,30.0,70.0,24.0,76.0,3000,500.0,...,1069.7,380.6,380.6,2.0,1.0,1,2248.0,9848.0,30.0,375.0
1,Apartment / Unit / Flat,387.0,120.0,47279.0,30.0,70.0,24.0,76.0,3000,570.0,...,614.1,491.7,355.7,1.0,1.0,0,2248.0,9848.0,30.0,375.0
2,Apartment / Unit / Flat,596.0,147.0,47279.0,30.0,70.0,24.0,76.0,3000,570.0,...,588.3,1635.2,703.0,2.0,1.0,1,2248.0,9848.0,30.0,375.0
3,Apartment / Unit / Flat,387.0,120.0,47279.0,30.0,70.0,24.0,76.0,3000,475.0,...,1345.1,584.0,584.0,1.0,1.0,1,2248.0,9848.0,30.0,375.0
4,Apartment / Unit / Flat,387.0,120.0,47279.0,30.0,70.0,24.0,76.0,3000,550.0,...,619.3,969.8,497.6,1.0,1.0,0,2248.0,9848.0,30.0,375.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10153,House,81.0,49.0,5431.0,74.0,26.0,56.0,44.0,3996,400.0,...,89540.3,343.0,13431.1,3.0,1.0,2,1466.0,9446.0,54.0,250.0
10154,House,25.0,56.0,5431.0,74.0,26.0,56.0,44.0,3996,420.0,...,89954.5,1009.1,13845.3,2.0,1.0,0,1466.0,9446.0,54.0,250.0
10155,House,81.0,49.0,5431.0,74.0,26.0,56.0,44.0,3996,475.0,...,89143.3,35.3,13034.1,3.0,2.0,1,1466.0,9446.0,54.0,250.0
10156,House,81.0,49.0,5431.0,74.0,26.0,56.0,44.0,3996,400.0,...,89804.8,2704.1,13695.6,3.0,1.0,1,1466.0,9446.0,54.0,250.0


In [71]:
df = df\
                .groupby('postcode') \
                .agg(
                    {
                        'cost':'mean',
                        'Median_rent_weekly':'mean',
                        'dist_nearest_busstop':'mean',
                        'dist_nearest_trainstation':'mean',
                        'dist_nearest_primary_school':'mean',
                        'dist_nearest_secondary_school':'mean',
                        'Beds':'mean',
                        'Bath':'mean',
                        'Parking':'mean',
                        'Tot_P_P':'mean',
                        'Median_age_persons': 'mean',
                        'suburb_sold': 'mean',
                        'suburb_avg_day': 'mean',
                        'suburb_population': 'mean',
                        'suburb_owner': 'mean',
                        'suburb_renter':'mean',
                        'suburb_family':'mean',
                        'suburb_single':'mean',
                    }
                ) 
df.head()

Unnamed: 0_level_0,cost,Median_rent_weekly,dist_nearest_busstop,dist_nearest_trainstation,dist_nearest_primary_school,dist_nearest_secondary_school,Beds,Bath,Parking,Tot_P_P,Median_age_persons,suburb_sold,suburb_avg_day,suburb_population,suburb_owner,suburb_renter,suburb_family,suburb_single
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3000,460.439216,375.0,339.496471,727.642353,695.571373,568.037647,1.330709,1.066929,0.113725,9848.0,30.0,443.737255,123.211765,47279.0,30.0,70.0,24.0,76.0
3002,438.421053,480.0,445.3,819.889474,1176.052632,1192.636842,1.263158,1.0,0.684211,4896.0,42.0,51.789474,48.315789,4963.0,45.0,55.0,36.0,64.0
3003,480.394737,0.0,173.639474,705.686842,728.002632,618.268421,1.5,1.157895,0.473684,0.0,0.0,34.605263,109.473684,5498.0,36.0,64.0,27.0,73.0
3004,456.125,457.0,634.43,1505.12,628.39,517.9525,1.325,1.05,0.775,6423.0,39.0,596.0,147.0,6423.0,30.0,70.0,24.0,76.0
3006,482.083333,0.0,458.680208,1521.578125,1300.072917,982.842708,1.5,1.1875,0.489583,0.0,70.0,267.291667,100.385417,18706.0,35.0,65.0,31.0,69.0


In [73]:
#dummies = pd.get_dummies(df.postcode,prefix=['p'])
#df = pd.concat([df,dummies],axis = 1)

#dummies2 = pd.get_dummies(df.type)
#df = pd.concat([df,dummies2],axis = 1)

dummies3 = pd.get_dummies(df.Beds,prefix=['bed'])
df = pd.concat([df,dummies3],axis = 1)

dummies4 = pd.get_dummies(df.Bath,prefix=['bath'])
df = pd.concat([df,dummies4],axis = 1)

dummies4 = pd.get_dummies(df.Parking,prefix=['park'])
df = pd.concat([df,dummies4],axis = 1)


df = df.drop(['Beds','Bath','Parking'], axis=1)







In [74]:
#df[df['suburb_population'].isnull()]['postcode'].value_counts()

In [75]:
#df.loc[(df['postcode'] == 3195)&(df['suburb_population'].isnull())]

In [76]:
df

Unnamed: 0_level_0,cost,Median_rent_weekly,dist_nearest_busstop,dist_nearest_trainstation,dist_nearest_primary_school,dist_nearest_secondary_school,Tot_P_P,Median_age_persons,suburb_sold,suburb_avg_day,...,['park']_2.3333333333333335,['park']_2.3529411764705883,['park']_2.375,['park']_2.4,['park']_2.4285714285714284,['park']_2.5,['park']_3.0,['park']_3.5,['park']_4.0,['park']_6.0
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3000,460.439216,375.0,339.496471,727.642353,695.571373,568.037647,9848.0,30.0,443.737255,123.211765,...,0,0,0,0,0,0,0,0,0,0
3002,438.421053,480.0,445.300000,819.889474,1176.052632,1192.636842,4896.0,42.0,51.789474,48.315789,...,0,0,0,0,0,0,0,0,0,0
3003,480.394737,0.0,173.639474,705.686842,728.002632,618.268421,0.0,0.0,34.605263,109.473684,...,0,0,0,0,0,0,0,0,0,0
3004,456.125000,457.0,634.430000,1505.120000,628.390000,517.952500,6423.0,39.0,596.000000,147.000000,...,0,0,0,0,0,0,0,0,0,0
3006,482.083333,0.0,458.680208,1521.578125,1300.072917,982.842708,0.0,70.0,267.291667,100.385417,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3980,485.000000,370.0,249.850000,15742.150000,3539.300000,10091.050000,8024.0,40.0,14.500000,13.000000,...,0,0,0,0,0,0,1,0,0,0
3981,425.000000,311.0,637.925000,15714.075000,834.275000,606.050000,9868.0,40.0,38.000000,35.000000,...,0,0,0,0,0,0,0,0,0,0
3984,465.000000,298.0,378.128571,50366.857143,5959.985714,21767.042857,26792.0,51.0,21.857143,41.714286,...,0,0,0,0,0,0,0,0,0,0
3995,427.000000,298.0,426.615000,77790.330000,1790.670000,2181.715000,26792.0,51.0,43.500000,42.600000,...,0,0,0,0,0,0,0,0,0,0


In [77]:
for c in df.columns:
    if df[c].isnull().values.any():
        print(c)

In [78]:
df.isnull().sum()

cost                           0
Median_rent_weekly             0
dist_nearest_busstop           0
dist_nearest_trainstation      0
dist_nearest_primary_school    0
                              ..
['park']_2.5                   0
['park']_3.0                   0
['park']_3.5                   0
['park']_4.0                   0
['park']_6.0                   0
Length: 1123, dtype: int64

In [79]:
target= np.array(df['cost'])
features = df.drop('cost', axis = 1)
feature_list = list(features.columns)
features = np.array(features)

In [80]:
kf = KFold(n_splits=10,random_state=42,shuffle=True)
accuracies = []
for train_index, test_index in kf.split(features):

    data_train   = features[train_index]
    target_train = target[train_index]

    data_test    = features[test_index]
    target_test  = target[test_index]

    rf = RandomForestRegressor(n_estimators = 1000, 
                               random_state = 42, 
                               criterion='squared_error',
                               bootstrap=True)
    
    rf.fit(data_train, target_train)

    predictions = rf.predict(data_test)

    errors = abs(predictions - target_test)

    print('Mean Absolute Error:', round(np.mean(errors), 2))
    
    mape = 100 * (errors / target_test)
    accuracy = 100 - np.mean(mape)
    print('Accuracy:', round(accuracy, 2), '%.')

    accuracies.append(accuracy)

average_accuracy = np.mean(accuracies)
print('Average accuracy:', average_accuracy)

KeyboardInterrupt: 

In [None]:

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(data_train, target_train)

print(rf_random.best_params_)

In [None]:
kf = KFold(n_splits=10,random_state=42,shuffle=True)
accuracies = []
for train_index, test_index in kf.split(features):

    data_train   = features[train_index]
    target_train = target[train_index]

    data_test    = features[test_index]
    target_test  = target[test_index]

    rf = RandomForestRegressor(
                               n_estimators = 600, 
                               min_samples_split = 5, 
                               min_samples_leaf = 1, 
                               max_features = 'sqrt', 
                               max_depth = 60, 
                               bootstrap = False)
    
    rf.fit(data_train, target_train)

    predictions = rf.predict(data_test)

    errors = abs(predictions - target_test)

    print('Mean Absolute Error:', round(np.mean(errors), 2))
    
    mape = 100 * (errors / target_test)
    accuracy = 100 - np.mean(mape)
    print('Accuracy:', round(accuracy, 2), '%.')

    accuracies.append(accuracy)

average_accuracy = np.mean(accuracies)
print('Average accuracy:', average_accuracy)

In [None]:
y = rf.feature_importances_
list_y = [a for a in y if a > 0.005]

list_y.sort()

list_y = list_y[:20]

list_of_index = []
for i in list_y:
    a = np.where(y==i)
    list_of_index.append(a)
    
index = []
for i in range(int(len(list_of_index))):
    a = list_of_index[i][0][0]
    index.append(a)

col = []
for i in feature_list:
    col.append(i)
labels = []
for i in index:
    b = col[i]
    labels.append(b)

In [None]:
labels

In [None]:
index

In [17]:
list_y.sort()

NameError: name 'list_y' is not defined

In [22]:
list1.sort()