In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [None]:
df = pd.read_csv("../data/curated/domain_final.csv")

In [None]:
df.loc[df['postcode'] == 3004, 'suburb_sold'] = 596
df.loc[df['postcode'] == 3004, 'suburb_avg_day'] = 147
df.loc[df['postcode'] == 3004, 'suburb_population'] = 6423.0
df.loc[df['postcode'] == 3004, 'suburb_age'] = 20
df.loc[df['postcode'] == 3004, 'suburb_owner'] = 30
df.loc[df['postcode'] == 3004, 'suburb_renter'] = 70
df.loc[df['postcode'] == 3004, 'suburb_family'] = 24
df.loc[df['postcode'] == 3004, 'suburb_single'] = 76

In [None]:
df = df.drop(['Unnamed: 0','uri','name','cost_text','desc',
              'coordinates','school_title','SA2_CODE21','Tot_P_P','avg_income','school_distance'], axis=1)
df

In [None]:
dummies = pd.get_dummies(df.postcode,prefix=['p'])
df = pd.concat([df,dummies],axis = 1)

dummies2 = pd.get_dummies(df.type)
df = pd.concat([df,dummies2],axis = 1)

dummies3 = pd.get_dummies(df.Beds,prefix=['bed'])
df = pd.concat([df,dummies3],axis = 1)

dummies4 = pd.get_dummies(df.Bath,prefix=['bath'])
df = pd.concat([df,dummies4],axis = 1)

dummies4 = pd.get_dummies(df.Parking,prefix=['park'])
df = pd.concat([df,dummies4],axis = 1)


df = df.drop(['Beds','Bath','Parking','type','postcode'], axis=1)







In [None]:
#df[df['suburb_population'].isnull()]['postcode'].value_counts()

In [None]:
#df.loc[(df['postcode'] == 3195)&(df['suburb_population'].isnull())]

In [None]:
df

In [None]:
for c in df.columns:
    if df[c].isnull().values.any():
        print(c)

In [None]:
df[df['suburb_sold'].isnull()]['postcode'].value_counts()

In [None]:
df.isnull().sum()

In [None]:
df[df['suburb_population'].isnull()]['postcode'].value_counts()

In [None]:
target= np.array(df['cost'])
features = df.drop('cost', axis = 1)
feature_list = list(features.columns)
features = np.array(features)

In [None]:
kf = KFold(n_splits=10,random_state=42,shuffle=True)
accuracies = []
for train_index, test_index in kf.split(features):

    data_train   = features[train_index]
    target_train = target[train_index]

    data_test    = features[test_index]
    target_test  = target[test_index]

    rf = RandomForestRegressor(n_estimators = 1000, 
                               random_state = 42, 
                               criterion='mse',
                               bootstrap=True)
    
    rf.fit(data_train, target_train)

    predictions = rf.predict(data_test)

    errors = abs(predictions - target_test)

    print('Mean Absolute Error:', round(np.mean(errors), 2))
    
    mape = 100 * (errors / target_test)
    accuracy = 100 - np.mean(mape)
    print('Accuracy:', round(accuracy, 2), '%.')

    accuracies.append(accuracy)

average_accuracy = np.mean(accuracies)
print('Average accuracy:', average_accuracy)

In [None]:

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = 2, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(data_train, target_train)

print(rf_random.best_params_)

In [None]:
kf = KFold(n_splits=10,random_state=42,shuffle=True)
accuracies = []
for train_index, test_index in kf.split(features):

    data_train   = features[train_index]
    target_train = target[train_index]

    data_test    = features[test_index]
    target_test  = target[test_index]

    rf = RandomForestRegressor(
                               n_estimators = 600, 
                               min_samples_split = 5, 
                               min_samples_leaf = 1, 
                               max_features = 'sqrt', 
                               max_depth = 60, 
                               bootstrap = False)
    
    rf.fit(data_train, target_train)

    predictions = rf.predict(data_test)

    errors = abs(predictions - target_test)

    print('Mean Absolute Error:', round(np.mean(errors), 2))
    
    mape = 100 * (errors / target_test)
    accuracy = 100 - np.mean(mape)
    print('Accuracy:', round(accuracy, 2), '%.')

    accuracies.append(accuracy)

average_accuracy = np.mean(accuracies)
print('Average accuracy:', average_accuracy)

In [88]:
y = rf.feature_importances_
list_y = [a for a in y if a > 0.005]

list_of_index = []
for i in list_y:
    a = np.where(y==i)
    list_of_index.append(a)
    
index = []
for i in range(int(len(list_of_index))):
    a = list_of_index[i][0][0]
    index.append(a)

col = []
for i in feature_list:
    col.append(i)
labels = []
for i in index:
    b = col[i]
    labels.append(b)

In [89]:
labels

['suburb_sold',
 'suburb_avg_day',
 'suburb_population',
 'suburb_age',
 'suburb_owner',
 'suburb_renter',
 'suburb_family',
 'suburb_single',
 'postcode',
 'dist_nearest_busstop',
 'dist_nearest_trainstation',
 'dist_nearest_primary_school',
 'dist_nearest_secondary_school',
 'Median_tot_fam_inc_weekly',
 'Median_age_persons',
 'Median_rent_weekly',
 'Apartment / Unit / Flat',
 'House',
 'Studio',
 'Townhouse',
 "['bed']_0.0",
 "['bed']_1.0",
 "['bed']_2.0",
 "['bed']_3.0",
 "['bed']_4.0",
 "['bath']_1.0",
 "['bath']_2.0",
 "['park']_0",
 "['park']_1",
 "['park']_2"]

In [90]:
index

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 347,
 351,
 355,
 356,
 358,
 359,
 360,
 361,
 362,
 369,
 370,
 375,
 376,
 377]

In [91]:
list_y

[0.060412476133941906,
 0.04451254715569084,
 0.024052833907560762,
 0.006865880418498636,
 0.02903157783540725,
 0.0284070196930715,
 0.027616955190717916,
 0.028173212293713012,
 0.03560997824689642,
 0.04283786103778953,
 0.059372375205735364,
 0.044369476594406744,
 0.047040550519284005,
 0.04887452187633015,
 0.021238687403363515,
 0.045125028899173296,
 0.014864870382618496,
 0.011383837870983434,
 0.01960582624802722,
 0.007426735143058101,
 0.00959876107989313,
 0.06276421712335375,
 0.022192459317046977,
 0.0171812461845,
 0.016638964388753105,
 0.043597273892050856,
 0.04187150809868056,
 0.016288130309707133,
 0.01150396916424754,
 0.012632099262161196]

In [None]:
0.05