In [1]:
import pandas as pd
import numpy as np
from numpy import arange
import os
from collections import defaultdict
import matplotlib.pyplot as plt
from itertools import chain
import statsmodels.api as sm
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_validate
from json import dump, load

In [2]:
# Input files, all located in the curated data folder.
CURATED_DIR = '../data/curated/'
POI_COUNTS_DIRECTORY = CURATED_DIR + 'poi_counts.json'
POI_TYPES_DIRECTORY = CURATED_DIR + 'poi_types.txt'
RENTAL_DIRECTORY = CURATED_DIR + 'rental_properties_cleaned.csv'

In [3]:
# Parse the POI counts JSON file into memory.
with open(POI_COUNTS_DIRECTORY) as counts_file:
    count_data = load(counts_file)

In [4]:
# Read the POI category types back in, one name per line of the text file.
with open(POI_TYPES_DIRECTORY) as types_file:
    ls = types_file.read().splitlines()

In [5]:
# Number of entries loaded from the POI counts JSON.
len(count_data)

5978

In [6]:
# Number of POI category names read from the types file.
len(ls)

270

In [7]:
# One row per key of count_data, one column per POI category name in ls.
df = pd.DataFrame.from_dict(
    data=count_data,
    orient='index',
    columns=ls,
)

In [8]:
# Move the row labels out of the index into a regular 'index' column.
df = df.reset_index()

In [9]:
# Replace every missing value with 0.
df = df.fillna(0)

In [10]:
# Preview the POI count table (coordinates in 'index', one column per category).
df

Unnamed: 0,index,pitch,water,school,swimming_pool,parking,social_facility,fuel,mall,hardware,...,milestone,volcano,garden_furniture,swimming_area,heliport,summer_camp,brewing_supplies,perfumery,pillory,fort
0,"[-36.253557, 144.944557]",12.0,16.0,2.0,8.0,6.0,4.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[-36.554159, 146.7208003]",47.0,23.0,3.0,19.0,16.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[-36.5530096, 146.7188613]",47.0,21.0,3.0,18.0,16.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"[-36.5380049, 145.9927626]",7.0,9.0,3.0,1.0,21.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"[-36.5424309, 145.9979175]",5.0,15.0,3.0,1.0,21.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5973,"[-37.8802849, 147.8299957]",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5974,"[-37.8791032, 147.8273685]",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5975,"[-38.1336473, 144.7061689]",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5976,"[-38.372703, 144.7856897]",0.0,0.0,0.0,0.0,8.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Load the rental properties CSV.
rental_data = pd.read_csv(RENTAL_DIRECTORY)

In [12]:
# Append the rental cost to the POI counts, pairing rows by index label.
cost_data = rental_data[['coordinates', 'cost_text']]
combined = pd.concat(
    [df, cost_data],
    axis=1,
    join='inner',
    ignore_index=True,  # column labels become positions 0..272
)
# df contributes labels 0..270, so 271 is the rental 'coordinates' column and
# 272 is 'cost_text'; the duplicate coordinates column is discarded.
df = combined.drop(columns=[271])

In [13]:
# Preview the combined frame; column labels are positional at this point.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,262,263,264,265,266,267,268,269,270,272
0,"[-36.253557, 144.944557]",12.0,16.0,2.0,8.0,6.0,4.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.000000
1,"[-36.554159, 146.7208003]",47.0,23.0,3.0,19.0,16.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.923077
2,"[-36.5530096, 146.7188613]",47.0,21.0,3.0,18.0,16.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.384615
3,"[-36.5380049, 145.9927626]",7.0,9.0,3.0,1.0,21.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.000000
4,"[-36.5424309, 145.9979175]",5.0,15.0,3.0,1.0,21.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5973,"[-37.8802849, 147.8299957]",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,720.000000
5974,"[-37.8791032, 147.8273685]",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,720.000000
5975,"[-38.1336473, 144.7061689]",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,725.000000
5976,"[-38.372703, 144.7856897]",0.0,0.0,0.0,0.0,8.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,725.000000


In [14]:
# Column labels for the combined frame: coordinates, every POI category, cost.
columns = ['coordinates', *ls, 'cost']

In [15]:
# Show the assembled column labels.
columns

['coordinates',
 'pitch',
 'water',
 'school',
 'swimming_pool',
 'parking',
 'social_facility',
 'fuel',
 'mall',
 'hardware',
 'police',
 'post_office',
 'pharmacy',
 'beauty',
 'dairy',
 'golf_course',
 'park',
 'hospital',
 'sports_centre',
 'community_centre',
 'caravan_site',
 'kindergarten',
 'place_of_worship',
 'raceway',
 'restaurant',
 'tyres',
 'veterinary',
 'building',
 'alcohol',
 'supermarket',
 'nursing_home',
 'pub',
 'courthouse',
 'fire_station',
 'playground',
 'bicycle',
 'garden',
 'motel',
 'information',
 'library',
 'viewpoint',
 'picnic_site',
 'bbq',
 'bank',
 'museum',
 'peak',
 'memorial',
 'variety_store',
 'monument',
 'bakery',
 'car_wash',
 'artwork',
 'townhall',
 'fast_food',
 'cafe',
 'doityourself',
 'bar',
 'picnic_table',
 'agrarian',
 'laundry',
 'bicycle_parking',
 'sports',
 'fitness_centre',
 'motorcycle',
 'antiques',
 'dentist',
 'florist',
 'clothes',
 'butcher',
 'newsagent',
 'furniture',
 'platform',
 'station',
 'gallery',
 'aerodrome'

In [16]:
# Swap the positional column labels for the descriptive names.
df = df.set_axis(columns, axis='columns')

In [17]:
# Preview the frame now that the columns carry their names.
df

Unnamed: 0,coordinates,pitch,water,school,swimming_pool,parking,social_facility,fuel,mall,hardware,...,volcano,garden_furniture,swimming_area,heliport,summer_camp,brewing_supplies,perfumery,pillory,fort,cost
0,"[-36.253557, 144.944557]",12.0,16.0,2.0,8.0,6.0,4.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.000000
1,"[-36.554159, 146.7208003]",47.0,23.0,3.0,19.0,16.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.923077
2,"[-36.5530096, 146.7188613]",47.0,21.0,3.0,18.0,16.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.384615
3,"[-36.5380049, 145.9927626]",7.0,9.0,3.0,1.0,21.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.000000
4,"[-36.5424309, 145.9979175]",5.0,15.0,3.0,1.0,21.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5973,"[-37.8802849, 147.8299957]",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,720.000000
5974,"[-37.8791032, 147.8273685]",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,720.000000
5975,"[-38.1336473, 144.7061689]",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,725.000000
5976,"[-38.372703, 144.7856897]",0.0,0.0,0.0,0.0,8.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,725.000000


In [18]:
# Write the labelled POI table (including its row index) to CSV.
POI_COUNTS_CSV = '../data/curated/poi_counts.csv'
df.to_csv(POI_COUNTS_CSV)

## Analysis

In [19]:
# Features are every column between the leading coordinates column and the
# trailing cost column; the target is the cost column itself.
# NOTE(review): X is used unscaled here, while the later cells standardise the
# features before fitting ElasticNet - confirm whether that is intended.
X, y = df.iloc[:, 1:-1], df.iloc[:, -1]
model = ElasticNet()

# 10-fold cross-validation, a single repeat, fixed seed for reproducibility.
cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)

# Hyper-parameter grid. alpha=0.0 is deliberately left out: it removes the
# penalty entirely, which scikit-learn advises against for this solver and
# which produces convergence warnings on every fold.
grid = {
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0],
    'l1_ratio': arange(0, 1, 0.1),
}

search = GridSearchCV(model, grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=2)
results = search.fit(X, y)

# The scorer is the *negated* MAE, so flip the sign to report the actual error.
print('MAE: %.3f' % -results.best_score_)
print('Config: %s' % results.best_params_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

KeyboardInterrupt: 

In [None]:
# Histogram of the cost column using 40 bins.
fig, ax = plt.subplots()
ax.hist(df['cost'], bins=40)
plt.show()

In [None]:
# Box plot of the cost column.
plt.boxplot(df['cost'])

In [None]:
def get_stats():
    """Fit an OLS regression of cost on the POI counts and print its summary.

    Reads the notebook-level ``df`` and ``ls``: the columns named in ``ls``
    are the regressors and ``df['cost']`` is the response.

    Returns:
        None. The statsmodels summary table is printed.
    """
    # sm.OLS does not add an intercept by itself; without the constant column
    # the fit is forced through the origin.
    x = sm.add_constant(df[ls])
    y = df['cost']
    results = sm.OLS(y, x).fit()
    print(results.summary())

In [None]:
# Print the OLS regression summary.
get_stats()

In [None]:
# Feature matrix (POI category columns) and target (cost).
x, y = df[ls], df['cost']

In [None]:
# Standardise every feature column; the fitted scaler is kept for reuse.
scaler = StandardScaler()
df_scaled = scaler.fit(x).transform(x)

In [None]:
# Inspect the standardised feature array.
df_scaled

In [None]:
net = ElasticNet(random_state=0, alpha=0.1, l1_ratio=0)

# cross_validate fits clones of `net`, one per fold, and leaves `net` itself
# unfitted, so keep the scores in a variable and fit `net` explicitly below.
cv_results = cross_validate(net, df_scaled, y, cv=10, n_jobs=2, scoring='r2')

# Fit on the full data so that `net.coef_` exists for the cells that follow.
net.fit(df_scaled, y)

# Display the per-fold timings and R^2 scores as the cell output.
cv_results

In [None]:
# Collect the POI categories whose coefficient magnitude is below 0.05.
coefs = net.coef_
remove_list = [
    ls[position]
    for position, weight in enumerate(coefs)
    if abs(weight) < 0.05
]

In [None]:
# Show the categories selected for removal.
remove_list

In [None]:
# Number of categories selected for removal.
len(remove_list)

In [None]:
# Rebind `columns` to the coordinates column plus every POI category (no cost).
columns = ['coordinates', *ls]

In [None]:
# Keep coordinates plus the POI categories that were not selected for removal.
df_reduce = df[columns].drop(columns=remove_list)

In [None]:
# Count the distinct values in each remaining column.
df_reduce.nunique()

In [None]:
# Write the reduced POI table (including its row index) to CSV.
REDUCED_POI_CSV = '../data/curated/reduced_POI.csv'
df_reduce.to_csv(REDUCED_POI_CSV)