In [None]:
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.mode.chained_assignment = None  #close copy warning   

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, f_regression, RFE, SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMClassifier

In [None]:
# Read the curated dataset into `property_df`; the path is relative to the
# current working directory.
property_df = pd.read_csv("../data/curated/data.csv")

In [None]:
# Show every column name on a single line so the available features can be scanned quickly.
print(property_df.columns.tolist())

In [None]:
def convert_suburb(df):
    """Label-encode the ``suburb`` column of ``df`` in place.

    Every distinct suburb value is replaced by a 1-based integer code, assigned
    in order of first appearance in the column.

    The whole column is translated in a single pass through one lookup table.
    Replacing the values one at a time would let a freshly written code be
    mistaken for a not-yet-processed label whenever the column already holds
    integers (for example when the cell is executed twice), silently merging
    distinct suburbs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``suburb`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``suburb`` holding the integer codes.
    """
    # Lookup table: distinct value -> 1-based code in order of first appearance.
    codes = {name: code for code, name in enumerate(df['suburb'].unique(), start=1)}
    df['suburb'] = df['suburb'].map(codes)
    return df

# Encode the dwelling type as an integer code. Town houses share the code of
# houses (3), which is what the original two-step Townhouse -> House -> 3 did.
property_df['type'] = property_df['type'].replace({
    'Apartment / Unit / Flat / Penhouse': 1,
    'Studio': 2,
    'Townhouse': 3,
    'House': 3,
})

# Encode suburbs, discard the postcode column and mark every missing value with -1.
property_df = (
    convert_suburb(property_df)
    .drop(columns=['postcode'])
    .fillna(-1)
)



In [None]:
# Count the distinct row-wise patterns of missing values left in the frame.
property_df.isna().value_counts()

In [None]:
# Absolute Pearson correlation of every numeric column with the weekly rent,
# strongest first (top 20).
# The rent column is picked by label rather than by position so the ranking does
# not depend on the column order, and non-numeric columns are excluded
# explicitly because DataFrame.corr() no longer drops them silently in recent
# pandas releases.
(
    property_df.select_dtypes(include=['number', 'bool'])
    .corr()['rent_weekly']
    .abs()
    .sort_values(ascending=False)
    .to_frame()
    .head(20)
)

In [None]:
# Three column subsets of the listing table. Each one keeps `address` and
# `rent_weekly` next to one group of candidate features.
internal = property_df.loc[:, [
    'address', 'rent_weekly',
    'floor', 'suburb', 'type', 'furnitured', 'pool', 'gym',
    'num_bed', 'num_bath', 'num_car_park',
]]

external = property_df.loc[:, [
    'address', 'rent_weekly',
    'SECONDARY SCHOOL', 'PRIMARY AND SECONDARY SCHOOL', 'PRIMARY SCHOOL',
    'SHOPPING CENTRE', 'PARK', 'KINDERGARTEN', 'POST OFFICE', 'SPECIAL SCHOOL',
    'CHILD CARE', 'BAR', 'SPORTS COMPLEX', 'EDUCATION COMPLEX', 'CEMETERY',
    'LIBRARY', 'UNIVERSITY', 'GENERAL HOSPITAL', 'POLICE STATION',
    'FURTHER EDUCATION', 'PLAYGROUND', 'PLANTATION', 'WINERY', 'FARM',
    'VINEYARD', 'TENNIS COURT', 'BAY', 'PIER', 'SWIMMING POOL', 'ART GALLERY',
    'HARBOUR', 'COMMUNITY HEALTH CENTRE', 'MARKET', 'PRISON', 'BOTANIC GARDENS',
    'SPECIALISED HOSPITAL', 'BANK', 'AGED CARE', 'SURFING SPOT', 'COAST',
    'GOLF COURSE', 'HELIPORT', 'AIRPORT', 'IRON ORE PROCESSOR',
    'MILL/TIMBER OPERATIONS', 'BEACH', 'TRAIN STATION', 'TRAM STATION', 'BUS',
]]

dimension = property_df.loc[:, [
    'address', 'rent_weekly',
    'Education', 'Commercial', 'Environment', 'Medication', 'Secrity',
    'Industrial', 'Prison', 'Transportation',
]]

In [None]:
# Absolute Pearson correlation of the `internal` columns with the weekly rent,
# strongest first. The rent column is selected by label and the string
# `address` column is excluded explicitly, so the cell neither depends on
# column order nor on DataFrame.corr() silently dropping non-numeric data.
(
    internal.select_dtypes(include=['number', 'bool'])
    .corr()['rent_weekly']
    .abs()
    .sort_values(ascending=False)
    .to_frame()
)

In [None]:
# Same ranking restricted to rows with type code 1 (the code assigned to
# 'Apartment / Unit / Flat / Penhouse'). `type` is constant inside this subset,
# so its own correlation is NaN and sorts last.
# The rent column is selected by label and the string `address` column is
# excluded explicitly, so the cell neither depends on column order nor on
# DataFrame.corr() silently dropping non-numeric data.
(
    internal[internal['type'] == 1]
    .select_dtypes(include=['number', 'bool'])
    .corr()['rent_weekly']
    .abs()
    .sort_values(ascending=False)
    .to_frame()
)

In [None]:
# Same ranking restricted to rows with type code 3 (the code shared by 'House'
# and 'Townhouse'). `type` is constant inside this subset, so its own
# correlation is NaN and sorts last.
# The rent column is selected by label and the string `address` column is
# excluded explicitly, so the cell neither depends on column order nor on
# DataFrame.corr() silently dropping non-numeric data.
(
    internal[internal['type'] == 3]
    .select_dtypes(include=['number', 'bool'])
    .corr()['rent_weekly']
    .abs()
    .sort_values(ascending=False)
    .to_frame()
)

In [None]:
# Absolute Pearson correlation of the `external` columns with the weekly rent,
# strongest first (top 15). The rent column is selected by label and the string
# `address` column is excluded explicitly, so the cell neither depends on
# column order nor on DataFrame.corr() silently dropping non-numeric data.
(
    external.select_dtypes(include=['number', 'bool'])
    .corr()['rent_weekly']
    .abs()
    .sort_values(ascending=False)
    .to_frame()
    .head(15)
)

In [None]:
# Absolute Pearson correlation of the `dimension` columns with the weekly rent,
# strongest first (top 15). The rent column is selected by label and the string
# `address` column is excluded explicitly, so the cell neither depends on
# column order nor on DataFrame.corr() silently dropping non-numeric data.
(
    dimension.select_dtypes(include=['number', 'bool'])
    .corr()['rent_weekly']
    .abs()
    .sort_values(ascending=False)
    .to_frame()
    .head(15)
)

In [None]:
# Candidate feature columns, in the order used for positional look-ups by the
# selector cells below.
col_lis = [
    # property attributes
    'floor', 'suburb', 'type', 'furnitured', 'pool', 'gym',
    'num_bed', 'num_bath', 'num_car_park',
    # upper-case place-category columns
    'SECONDARY SCHOOL', 'PRIMARY AND SECONDARY SCHOOL', 'PRIMARY SCHOOL',
    'SHOPPING CENTRE', 'PARK', 'KINDERGARTEN', 'POST OFFICE', 'SPECIAL SCHOOL',
    'CHILD CARE', 'BAR', 'SPORTS COMPLEX', 'EDUCATION COMPLEX', 'CEMETERY',
    'LIBRARY', 'UNIVERSITY', 'GENERAL HOSPITAL', 'POLICE STATION',
    'FURTHER EDUCATION', 'PLAYGROUND', 'PLANTATION', 'WINERY', 'FARM',
    'VINEYARD', 'TENNIS COURT', 'BAY', 'PIER', 'SWIMMING POOL', 'ART GALLERY',
    'HARBOUR', 'COMMUNITY HEALTH CENTRE', 'MARKET', 'PRISON', 'BOTANIC GARDENS',
    'SPECIALISED HOSPITAL', 'BANK', 'AGED CARE', 'SURFING SPOT', 'COAST',
    'GOLF COURSE', 'HELIPORT', 'AIRPORT', 'IRON ORE PROCESSOR',
    'MILL/TIMBER OPERATIONS', 'BEACH', 'TRAIN STATION', 'TRAM STATION', 'BUS',
    # aggregate dimension columns
    'Education', 'Commercial', 'Environment', 'Medication', 'Secrity',
    'Industrial', 'Transportation',
    # 'cloest_'-prefixed columns
    'cloest_SECONDARY SCHOOL', 'cloest_PRIMARY AND SECONDARY SCHOOL',
    'cloest_PRIMARY SCHOOL', 'cloest_SHOPPING CENTRE', 'cloest_PARK',
    'cloest_KINDERGARTEN', 'cloest_POST OFFICE', 'cloest_SPECIAL SCHOOL',
    'cloest_CHILD CARE', 'cloest_BAR', 'cloest_SPORTS COMPLEX',
    'cloest_EDUCATION COMPLEX', 'cloest_CEMETERY', 'cloest_LIBRARY',
    'cloest_UNIVERSITY', 'cloest_GENERAL HOSPITAL', 'cloest_POLICE STATION',
    'cloest_FURTHER EDUCATION', 'cloest_PLAYGROUND', 'cloest_PLANTATION',
    'cloest_WINERY', 'cloest_FARM', 'cloest_VINEYARD', 'cloest_TENNIS COURT',
    'cloest_BAY', 'cloest_PIER', 'cloest_SWIMMING POOL', 'cloest_ART GALLERY',
    'cloest_HARBOUR', 'cloest_COMMUNITY HEALTH CENTRE',
    'cloest_MATERNAL AND CHILD HEALTH CENTRE', 'cloest_MARKET', 'cloest_PRISON',
    'cloest_BOTANIC GARDENS', 'cloest_SPECIALISED HOSPITAL', 'cloest_BANK',
    'cloest_AGED CARE', 'cloest_SURFING SPOT', 'cloest_COAST',
    'cloest_GOLF COURSE', 'cloest_HELIPORT', 'cloest_AIRPORT',
    'cloest_IRON ORE PROCESSOR', 'cloest_MILL/TIMBER OPERATIONS', 'cloest_BEACH',
    'cloest_TRAM STATION', 'cloest_BUS',
    'cloest_TRAIN_STATION_duration', 'cloest_TRAIN_STATION_distance',
]

# Feature matrix (an independent copy), target vector, and a min-max scaled
# version of the features (each column rescaled to [0, 1]).
X = property_df.loc[:, col_lis].copy()
y = property_df.loc[:, 'rent_weekly']
X_norm = MinMaxScaler().fit(X).transform(X)

In [None]:
# Rank the features by mutual information with the rent and keep the best 15.
# mutual_info_classif adds random noise to continuous features before
# estimating, so the seed is pinned (same value as the random forest cells) to
# make the selected set reproducible across runs.
# NOTE(review): `rent_weekly` looks like a continuous target; a regression
# scorer may be more appropriate than the classification one - confirm.
mi_selector = SelectKBest(
    lambda features, target: mutual_info_classif(features, target, random_state=50),
    k=15,
)
mi_selector.fit(X, y)
mi_scores = mi_selector.scores_
mi_support = mi_selector.get_support()
mi_feature = X.columns[mi_support].tolist()

# Positions of the selected columns and their scores, keyed by column name.
idx = [pos for pos, keep in enumerate(mi_support) if keep]
score = {col_lis[pos]: mi_scores[pos] for pos in idx}

pd.DataFrame({'features': list(score), 'mi_value': list(score.values())}).sort_values(by=['mi_value'], ascending=False)

In [None]:
# Score every min-max scaled feature against the target with the chi-squared
# statistic and keep the 15 highest-scoring ones.
chi_selector = SelectKBest(chi2, k=15)
chi_selector.fit(X_norm, y)
chi_scores = chi_selector.scores_
chi_pvalues = chi_selector.pvalues_
chi_support = chi_selector.get_support()
chi_feature = X.columns[chi_support].tolist()

# Positions of the selected columns inside `col_lis`.
idx = [pos for pos, keep in enumerate(chi_support) if keep]

# Chi-squared statistic of each selected feature, keyed by column name.
score = {col_lis[pos]: chi_scores[pos] for pos in idx}
print(score)

# Matching p-values.
pvalue = {col_lis[pos]: chi_pvalues[pos] for pos in idx}
print(pvalue)

pd.DataFrame({'features': list(score), 'chi2_value': list(score.values())}).sort_values(by=['chi2_value'], ascending=False)

In [None]:
def cor_selector(X, y,num_feats):
    """Select the features with the largest absolute Pearson correlation to ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one column per feature.
    y : array-like
        Target values, aligned with the rows of ``X``.
    num_feats : int
        Number of features to keep. Values below 1 select nothing; values
        above the number of columns select every column.

    Returns
    -------
    cor_support : list of bool
        One flag per column of ``X`` (in column order), True when selected.
    cor_feature : list of str
        Names of the selected columns, ordered from weakest to strongest
        absolute correlation.
    """
    feature_name = X.columns.tolist()

    # Pearson r of every column against the target. A constant column has zero
    # variance and yields NaN; that is expected and handled right below, so the
    # accompanying floating-point warnings are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        cor_list = [np.corrcoef(X[name], y)[0, 1] for name in feature_name]
    # replace NaN with 0 so undefined correlations rank last
    cor_list = [0 if np.isnan(value) else value for value in cor_list]

    # Clamp the requested count; a plain `[-num_feats:]` slice would return
    # *all* columns when num_feats is 0.
    order = np.argsort(np.abs(cor_list))
    keep = min(max(int(num_feats), 0), len(order))
    top_positions = order[len(order) - keep:]

    cor_feature = [feature_name[pos] for pos in top_positions]
    chosen = set(cor_feature)
    cor_support = [name in chosen for name in feature_name]
    return cor_support, cor_feature

# Keep the 15 features most strongly correlated with the rent.
cor_support, cor_feature = cor_selector(X, y, num_feats=15)
print(f"{len(cor_feature)} selected features")

In [None]:
# Recursive feature elimination around a logistic regression fitted on the
# scaled features: 10 features are dropped per round until 15 remain.
rfe_selector = RFE(
    estimator=LogisticRegression(max_iter=10000),
    n_features_to_select=15,
    step=10,
    verbose=5,
)
rfe_selector.fit(X_norm, y)

rfe_support = rfe_selector.get_support()
rfe_feature = X.columns[rfe_support].tolist()
rfe_feature

In [None]:
# Embedded selection with an L1-penalised logistic regression on the scaled
# features; at most 15 features are kept.
embeded_lr_selector = SelectFromModel(
    LogisticRegression(solver='liblinear', penalty="l1", max_iter=10000),
    max_features=15,
)
embeded_lr_selector.fit(X_norm, y)

embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.columns[embeded_lr_support].tolist()
embeded_lr_feature

In [None]:
# Embedded selection driven by random forest importances; at most 15 features
# are kept. The forest is seeded so the result is repeatable.
embeded_rf_selector = SelectFromModel(
    RandomForestRegressor(
        n_estimators=10000,
        max_features='sqrt',
        max_depth=50,
        random_state=50,
    ),
    max_features=15,
)
embeded_rf_selector.fit(X, y)

embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.columns[embeded_rf_support].tolist()
embeded_rf_feature


In [None]:
# Embedded selection driven by LightGBM feature importances.
# SelectFromModel and LGBMClassifier are already imported in the notebook's
# first cell, so they are not imported again here.
lgbc = LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
                      reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

# max_features equals the number of candidate columns, so the cap never binds
# and the selection is governed by SelectFromModel's importance threshold alone.
# NOTE(review): the other selectors cap at 15 features - confirm this
# difference is intended.
embeded_lgb_selector = SelectFromModel(lgbc, max_features=len(col_lis))
embeded_lgb_selector.fit(X, y)

embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.columns[embeded_lgb_support].tolist()
embeded_lgb_feature

In [None]:
# put all selection together
feature_selection_df = pd.DataFrame({'Feature':X.columns.to_list(), 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support, 'LightGBM':embeded_lgb_support, 'MI':mi_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)

In [None]:
# Feature importances from an extremely-randomised-trees classifier.
# The ensemble is randomised, so it is seeded (same value as the random forest
# cells) to keep the top-10 chart stable between runs.
model = ExtraTreesClassifier(random_state=50)
model.fit(X,y)

# Plot the ten most important features as a horizontal bar chart.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

In [None]:
# Fit a seeded random forest regressor on the raw features and chart its ten
# largest feature importances, one colour per bar.
rf = RandomForestRegressor(
    n_estimators=10000,
    max_features='sqrt',
    max_depth=50,
    random_state=50,
).fit(X, y)

rf_feat_importances = pd.Series(rf.feature_importances_, index=X.columns)
rf_top10 = rf_feat_importances.nlargest(10)
rf_top10.plot(
    kind='barh',
    color=['tomato', 'peru', 'yellow', 'olive', 'lime',
           'aquamarine', 'darkslategray', 'mediumblue', 'violet', 'purple'],
)
plt.show()

In [None]:
# Fit a LightGBM classifier on the raw features and chart its ten largest
# feature importances, one colour per bar.
lgb_selector = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=32,
    colsample_bytree=0.2,
    reg_alpha=3,
    reg_lambda=1,
    min_split_gain=0.01,
    min_child_weight=40,
)
lgb_selector.fit(X, y)

lgb_feat_importances = pd.Series(lgb_selector.feature_importances_, index=X.columns)
lgb_top10 = lgb_feat_importances.nlargest(10)
lgb_top10.plot(
    kind='barh',
    color=['tomato', 'peru', 'yellow', 'olive', 'lime',
           'aquamarine', 'darkslategray', 'mediumblue', 'violet', 'purple'],
)
plt.show()

In [None]:
# Feature importances from a single decision tree regressor.
# The tree is fitted on the full feature frame: a DataFrame has no `reshape`
# method, and collapsing the data to one column would not match the
# per-column index used for the importances below.
# The seed (same value as the random forest cells) keeps the result repeatable.
dt = DecisionTreeRegressor(max_depth=100, random_state=50)
dt.fit(X, y)

# Chart the ten largest importances, one colour per bar.
dt_feat_importances = pd.Series(dt.feature_importances_, index=X.columns)
dt_feat_importances.nlargest(10).plot(kind='barh',color=['tomato', 'peru', 'yellow', 'olive', 'lime', 'aquamarine', 'darkslategray',
                            'mediumblue', 'violet', 'purple'])
plt.show()

In [None]:
# Chi-squared ranking restricted to flats (type code 1).
# Features AND target are both taken from the filtered rows; previously the
# filter was overwritten on the next line, so the whole dataset was analysed.
# Dedicated `flat_*` names are used so the full-dataset X, y, X_norm and chi_*
# objects from the earlier cells are left untouched.
flat_df = property_df[property_df['type'] == 1]
X_flat = flat_df[col_lis]
y_flat = flat_df['rent_weekly']
X_flat_norm = MinMaxScaler().fit_transform(X_flat)

flat_chi_selector = SelectKBest(chi2, k=15)
flat_chi_selector.fit(X_flat_norm, y_flat)
flat_chi_support = flat_chi_selector.get_support()
flat_chi_feature = X_flat.columns[flat_chi_support].tolist()

# Chi-squared statistic of each selected feature, keyed by column name.
flat_score = dict(zip(flat_chi_feature, flat_chi_selector.scores_[flat_chi_support]))
print(flat_score)

# Matching p-values.
flat_pvalue = dict(zip(flat_chi_feature, flat_chi_selector.pvalues_[flat_chi_support]))
print(flat_pvalue)

# The score column carries the name the sort key refers to.
pd.DataFrame({'features': list(flat_score), 'flat_chi2_value': list(flat_score.values())}).sort_values(by=['flat_chi2_value'], ascending=False)

In [None]:
# Chi-squared ranking restricted to houses (type code 3, which also covers
# town houses).
# Features AND target are both taken from the filtered rows; previously the
# filter was overwritten on the next line, so the whole dataset was analysed.
# Dedicated `house_*` names are used so the full-dataset X, y, X_norm and chi_*
# objects from the earlier cells are left untouched.
house_df = property_df[property_df['type'] == 3]
X_house = house_df[col_lis]
y_house = house_df['rent_weekly']
X_house_norm = MinMaxScaler().fit_transform(X_house)

house_chi_selector = SelectKBest(chi2, k=15)
house_chi_selector.fit(X_house_norm, y_house)
house_chi_support = house_chi_selector.get_support()
house_chi_feature = X_house.columns[house_chi_support].tolist()

# Chi-squared statistic of each selected feature, keyed by column name.
house_score = dict(zip(house_chi_feature, house_chi_selector.scores_[house_chi_support]))
print(house_score)

# Matching p-values.
house_pvalue = dict(zip(house_chi_feature, house_chi_selector.pvalues_[house_chi_support]))
print(house_pvalue)

# The score column carries the name the sort key refers to.
pd.DataFrame({'features': list(house_score), 'house_chi2_value': list(house_score.values())}).sort_values(by=['house_chi2_value'], ascending=False)