In [1]:
# Use the %pip magic so the packages are installed into the environment of the
# running kernel rather than whichever `pip` happens to be first on PATH.
%pip install geopandas
%pip install mgwr
%pip install numba



In [2]:
import numpy as np
import pandas as pd
import pickle
import os
import multiprocessing
os.environ['USE_PYGEOS'] = '0'
import geopandas as gp
import multiprocessing as mp
from mgwr.gwr import GWR,MGWR
from mgwr.sel_bw import Sel_BW

In [7]:
# Load the Funda listings data from the GeoPackage file.
funda_data = gp.read_file("data/funda_buy_28-03-2023_full_distances.gpkg")

# Disabled alternative: one-hot encode the categorical columns before modelling.
# funda_data = pd.DataFrame(funda_data)

# funda_data = gp.GeoDataFrame(pd.get_dummies(funda_data, columns = ["house_type", "building_type", "energy_label", "has_balcony", "has_garden"]))

In [8]:
# Store house_type as a pandas categorical (this modifies funda_data in place),
# then preview the first rows of the data.
funda_data['house_type'] = funda_data['house_type'].astype('category')
funda_data.head()

Unnamed: 0,...1,...2,house_id,house_type,building_type,price,price_m2,room,bedroom,bathroom,...,st_perimet,bus_dist,subway_dist,train_dist,university_dist,school_dist,hospital_dist,mall_dist,supermarket_dist,geometry
0,2.0,1.0,88438538.0,huis,Resale property,515000.0,4327.7,6.0,4.0,0.0,...,661793.548709,48.287385,72823.821958,37242.168681,45737.145495,114.856918,10887.306844,10673.161006,346.305218,POINT (115415.102 562934.410)
1,3.0,2.0,42954353.0,huis,Resale property,400000.0,3669.7,5.0,4.0,1.0,...,661793.548709,2238.827293,24032.154788,4082.658441,11607.008159,534.086517,612.562468,4174.955689,229.425515,POINT (101584.001 500981.995)
2,4.0,3.0,42048957.0,huis,Resale property,449000.0,3805.1,6.0,4.0,1.0,...,661793.548709,8967.645367,24944.862763,8506.276991,8812.450336,226.402517,7989.265875,800.931212,489.383006,POINT (105924.643 507742.212)
3,6.0,5.0,42026862.0,huis,Resale property,550000.0,5000.0,4.0,3.0,1.0,...,661793.548709,2814.126342,6056.986976,3496.778087,3194.255205,256.180597,1518.237236,3800.247983,490.721764,POINT (115144.000 484579.000)
4,7.0,6.0,42046726.0,huis,Resale property,695000.0,3948.9,6.0,4.0,1.0,...,661793.548709,2282.53635,18086.155516,2811.469742,8697.159686,489.214342,1983.48371,2225.233823,678.15557,POINT (103400.326 480554.963)


In [9]:
# Dependent variable: the price column as a single-column 2-D array.
b_y = funda_data['price'].to_numpy().reshape(-1, 1)

In [10]:
# Disabled alternative design matrix that also includes the dummy-encoded
# energy label, balcony and garden columns:
# b_X = funda_data[['room', 'bedroom', 'bathroom', 'living_area',
#                   'house_age', 'bus_dist', 'subway_dist', 'train_dist',
#                   'university_dist', 'school_dist', 'mall_dist', 'supermarket_dist',
#                   'energy_label_A', 'energy_label_B', 'energy_label_C', 'energy_label_D',
#                   'energy_label_E', 'energy_label_F', 'energy_label_G', 'energy_label_na',
#                   'has_balcony_0.0', 'has_balcony_1.0', 'has_garden_0.0', 'has_garden_1.0']].values

# Independent variables. The order matters: the parameter estimates are
# extracted by column position later on.
cols = [
    'room', 'bedroom', 'bathroom', 'living_area',
    'house_age', 'bus_dist', 'subway_dist', 'train_dist',
    'university_dist', 'school_dist', 'mall_dist', 'supermarket_dist',
]

# Design matrix with one column per entry of `cols`, in that order.
b_X = funda_data[cols].to_numpy()

In [11]:
# Build the list of (x, y) coordinate tuples, one per row of funda_data,
# that is passed to the models as their coordinates.
u = funda_data['geometry'].x
v = funda_data['geometry'].y
b_coords = list(zip(u, v))

In [12]:
# Show the number of CPUs reported by the system (presumably a reference for
# choosing the pool size n_proc further down).
multiprocessing.cpu_count()

8

In [13]:
# Setting OMP_NUM_THREADS to 1 might be needed to turn off OpenMP multi-threading.
# NOTE(review): presumably this avoids oversubscribing cores alongside the
# multiprocessing pool — confirm it takes effect in the worker processes.
%env OMP_NUM_THREADS = 1

env: OMP_NUM_THREADS=1


In [16]:
# Parallelization pays off mainly when the data are large and/or the machine has many cores.
# mgwr has a soft dependency on numba; install numba for better performance (pip install numba).
n_proc = 4  # number of worker processes in the pool
pool = mp.Pool(processes=n_proc)

In [17]:
%%time
# Run basic GWR in parallel mode: select a bandwidth, print it, then fit the
# model with that bandwidth.

gwr_selector = Sel_BW(b_coords, b_y, b_X)
gwr_bw = gwr_selector.search(pool = pool) # the worker pool is passed to Sel_BW.search
print(gwr_bw)
gwr_results = GWR(b_coords, b_y, b_X, gwr_bw).fit(pool = pool)

132.0
CPU times: total: 344 ms
Wall time: 2min 3s


In [18]:
# Show the summary report of the fitted GWR model.
gwr_results.summary()

Model type                                                         Gaussian
Number of observations:                                                7211
Number of covariates:                                                    13

Global Regression Results
---------------------------------------------------------------------------
Residual sum of squares:                                       790706956935048.125
Log-likelihood:                                                 -101885.887
AIC:                                                             203797.774
AICc:                                                            203799.833
BIC:                                                           790706956871105.625
R2:                                                                   0.633
Adj. R2:                                                              0.633

Variable                              Est.         SE  t(Est/SE)    p-value
------------------------------- ---------- ---

In [None]:
%%time
# Run MGWR in parallel mode. Note: max_iter_multi needs to be specified.
# NOTE(review): the mgwr documentation examples standardise y and X before
# running MGWR; raw values are used here — confirm this is intended.

mgwr_selector = Sel_BW(b_coords, b_y, b_X, multi=True)
mgwr_bw = mgwr_selector.search(pool=pool, max_iter_multi=200, criterion = "AICc") # the worker pool is passed to Sel_BW.search
print(mgwr_bw)
# The selector that ran the search is handed to MGWR via `selector`.
mgwr_results = MGWR(b_coords, b_y, b_X, selector=mgwr_selector).fit(pool=pool)

Backfitting:   0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
# Show the summary report of the fitted MGWR model.
mgwr_results.summary()

In [24]:
#recreate R format table

def return_geopackage(mgwr_results, cols, path_name, geometry=None):
    """Write the local estimates of a fitted GWR/MGWR model to a geo file.

    One row is produced per observation. It holds the local intercept, the
    fitted value (``yhat``), the residual (``y - yhat``) and, for every name
    in ``cols``, the local coefficient together with its standard error
    (``<name>_SE``) and t-value (``<name>_TV``).

    Parameters
    ----------
    mgwr_results : object
        Fitted results exposing ``params``, ``bse``, ``tvalues``, ``predy``
        and ``y``. Column 0 of ``params``/``bse``/``tvalues`` is treated as
        the intercept; columns 1.. are matched to ``cols`` by position.
    cols : sequence of str
        Covariate names, in the same order as the columns of the design
        matrix the model was fitted on.
    path_name : str
        Output path handed to ``GeoDataFrame.to_file``.
    geometry : array-like, optional
        One geometry per observation, in row order. When omitted, the
        notebook-level ``funda_data['geometry']`` is used.

    Returns
    -------
    GeoDataFrame
        The table that was written to ``path_name``.

    Raises
    ------
    ValueError
        If the number of parameter columns is not ``len(cols) + 1`` or the
        number of geometries differs from the number of observations.
    """
    params = np.asarray(mgwr_results.params)
    bse = np.asarray(mgwr_results.bse)
    tvalues = np.asarray(mgwr_results.tvalues)
    n_obs, n_terms = params.shape

    # Names are matched to estimates purely by position, so a length mismatch
    # would silently mislabel columns (or fail with an opaque IndexError).
    if n_terms != len(cols) + 1:
        raise ValueError(
            f"expected {len(cols) + 1} parameter columns "
            f"(intercept + {len(cols)} covariates), got {n_terms}"
        )

    if geometry is None:
        # Backward-compatible default: geometry of the notebook's input data.
        geometry = funda_data['geometry']
    if len(geometry) != n_obs:
        raise ValueError(
            f"got {len(geometry)} geometries for {n_obs} observations"
        )
    # Attach geometries by row position, not by index label: a filtered or
    # re-indexed input would otherwise be misaligned with the 0..n-1 result rows.
    if hasattr(geometry, 'reset_index'):
        geometry = geometry.reset_index(drop=True)

    predy = np.asarray(mgwr_results.predy).reshape(-1)
    data = {
        'Intercept': params[:, 0],
        'intercept_SE': bse[:, 0],
        'intercept_TV': tvalues[:, 0],
        'yhat': predy,
        'residual': np.asarray(mgwr_results.y).reshape(-1) - predy,
    }
    # Column 0 is the intercept, so covariate i lives in column i (1-based).
    for i, col in enumerate(cols, start=1):
        data[col] = params[:, i]
        data[col + '_SE'] = bse[:, i]
        data[col + '_TV'] = tvalues[:, i]

    df = gp.GeoDataFrame(data, geometry=geometry)
    df.to_file(path_name)
    return df

In [25]:
# Build the table of local GWR estimates and write it to a GeoPackage file
# (the GWR results are passed as the function's first argument here).
return_geopackage(gwr_results,cols, 'data/gwr_results_full.gpkg')

In [87]:
# pickle.dump(gwr_results, open('data/models/gwr_results.pkl', 'wb'))

In [8]:
# Disabled: reload previously pickled GWR results instead of refitting.
# NOTE: unpickling can execute arbitrary code — only load files you created yourself.
# gwr_results = pickle.load(open('data/models/gwr_results.pkl', 'rb'))