In [1]:
import numpy as np
import os
import multiprocessing
os.environ['USE_PYGEOS'] = '0'
import geopandas as gp
import multiprocessing as mp
from mgwr.gwr import GWR,MGWR
from mgwr.sel_bw import Sel_BW

In [2]:
# Read the Funda sample from its GeoPackage file into a GeoDataFrame.
funda_gpkg_path = "Yúri/data/test_sample_distances(100p).gpkg"
funda_data = gp.read_file(funda_gpkg_path)

In [3]:
# Preview the first rows of the loaded Funda table to check its columns.
funda_data.head()

Unnamed: 0,...1,house_id,house_type,building_type,price,price_m2,room,bedroom,bathroom,living_area,...,zip,letters,city,addressline_city,addressline_zip,addresszip,bus_dist,subway_dist,train_dist,geometry
0,0.0,42037381.0,huis,Resale property,495000.0,3960.0,4.0,3.0,1.0,125.0,...,1944,KL,Beverwijk\r\n,Beverwijk\r\n,1944 KL,"Wildemanskruid 37, 1944 KL, Beverwijk\r\n",2639.789919,21520.120442,2015.869019,POINT (104832.002 501219.996)
1,1.0,42032180.0,huis,Resale property,450000.0,3982.3,5.0,4.0,1.0,113.0,...,2133,CD,Hoofddorp\r\n,Hoofddorp\r\n,2133 CD,"Birkholm 185, 2133 CD, Hoofddorp\r\n",2261.087447,15526.558113,5372.999947,POINT (106101.113 480140.256)
2,2.0,88477708.0,huis,Resale property,749000.0,5761.5,4.0,3.0,1.0,130.0,...,1606,ME,Venhuizen\r\n,Venhuizen\r\n,1606 ME,"Zuiderdijk 3, 1606 ME, Venhuizen\r\n",3842.606582,37268.173853,4251.110518,POINT (145231.999 521329.999)
3,3.0,42037325.0,huis,Resale property,550000.0,4198.5,5.0,4.0,1.0,131.0,...,1216,HP,Hilversum\r\n,Hilversum\r\n,1216 HP,"Gomarushof 112, 1216 HP, Hilversum\r\n",2977.139577,17977.129564,6912.008049,POINT (137976.519 470153.606)
4,4.0,42017210.0,huis,Resale property,440000.0,3728.8,5.0,4.0,1.0,118.0,...,2151,HH,Nieuw-Vennep\r\n,Nieuw-Vennep\r\n,2151 HH,"Swaenstein 24, 2151 HH, Nieuw-Vennep\r\n",6678.970647,20607.041198,6142.349266,POINT (102036.839 476175.131)


In [4]:
# Dependent variable: the price column as an (n, 1) column vector.
price_column = funda_data['price']
b_y = price_column.to_numpy().reshape(-1, 1)

In [5]:
# Independent variables as a 2-D array. The column order chosen here is the
# order in which the fitted parameters are extracted later on.
predictor_columns = ['room', 'living_area', 'house_age', 'bus_dist', 'subway_dist', 'train_dist']
b_X = funda_data[predictor_columns].to_numpy()

In [6]:
# Pair up the x and y coordinate of every point geometry; the models take
# the locations as a list of (x, y) tuples.
point_geometry = funda_data['geometry']
u, v = point_geometry.x, point_geometry.y
b_coords = [(x_coord, y_coord) for x_coord, y_coord in zip(u, v)]

In [7]:
# Number of CPUs reported for this machine (guides the pool size below).
mp.cpu_count()

8

In [8]:
# Parallelization pays off mostly when the data set is large and/or the
# machine has many cores.
# mgwr has a soft dependency on numba; install it for better performance
# (pip install numba).
n_proc = 4  # number of worker processes in the pool
pool = mp.Pool(processes=n_proc)

In [15]:
%%time
# Run basic GWR in parallel mode: the worker pool created earlier is handed
# to both the bandwidth search and the model fit. Without pool=pool these
# calls do not use the pool, which contradicted this cell's stated intent.

# Step 1: search for the GWR bandwidth.
gwr_selector = Sel_BW(b_coords, b_y, b_X)
gwr_bw = gwr_selector.search(pool=pool)  # pool passed to Sel_BW.search
print(gwr_bw)

# Step 2: fit the GWR model with the selected bandwidth.
gwr_results = GWR(b_coords, b_y, b_X, gwr_bw).fit(pool=pool)  # pool passed to GWR.fit

84.0
CPU times: total: 13.2 s
Wall time: 5.51 s


In [18]:
# Print the summary report of the fitted GWR model.
gwr_results.summary()

Model type                                                         Gaussian
Number of observations:                                                1354
Number of covariates:                                                     7

Global Regression Results
---------------------------------------------------------------------------
Residual sum of squares:                                       54226586157784.891
Log-likelihood:                                                  -18449.097
AIC:                                                              36912.194
AICc:                                                             36914.301
BIC:                                                           54226586148071.922
R2:                                                                   0.658
Adj. R2:                                                              0.656

Variable                              Est.         SE  t(Est/SE)    p-value
------------------------------- ---------- -----

In [9]:
%%time
# Run MGWR in parallel mode, handing the worker pool to both the
# multi-bandwidth search and the model fit.
# Note: max_iter_multi is set explicitly for the search.
mgwr_selector = Sel_BW(b_coords, b_y, b_X, multi=True)
mgwr_bw = mgwr_selector.search(
    criterion="AICc",
    max_iter_multi=5,
    pool=pool,  # pool passed to Sel_BW.search
)
print(mgwr_bw)

# The fitted selector (not a single bandwidth value) is given to MGWR.
mgwr_model = MGWR(b_coords, b_y, b_X, selector=mgwr_selector)
mgwr_results = mgwr_model.fit(pool=pool)


KeyboardInterrupt



In [10]:
# Print the summary report of the fitted MGWR model.
mgwr_results.summary()

KeyboardInterrupt: 

In [14]:
# Recreate the R-format table: one row per observation holding the local
# MGWR estimates, the fitted values and residuals, the standard errors,
# the t-values and the point geometry.
#
# Column 0 of params / bse / tvalues is the intercept; the remaining columns
# follow the covariate order used when b_X was built. Earlier only the first
# three covariates were exported, so the three distance covariates of the
# model were silently missing from the table.
covariate_names = ['room', 'living_area', 'house_age',
                   'bus_dist', 'subway_dist', 'train_dist']
term_names = ['Intercept'] + covariate_names

# Fail loudly if the name list and the fitted model drift apart.
n_terms = mgwr_results.params.shape[1]
assert n_terms == len(term_names), (
    f"model has {n_terms} parameter columns but {len(term_names)} names were given"
)

df = gp.GeoDataFrame()

# Local parameter estimates, one column per model term.
for term_idx, term in enumerate(term_names):
    df[term] = mgwr_results.params[:, term_idx]

# Fitted values and residuals (observed minus predicted).
df['yhat'] = mgwr_results.predy
df['residual'] = mgwr_results.y.reshape((-1,1)) - mgwr_results.predy

# Standard errors; the intercept keeps its existing lower-case column prefix.
for term_idx, term in enumerate(term_names):
    prefix = 'intercept' if term_idx == 0 else term
    df[prefix + '_SE'] = mgwr_results.bse[:, term_idx]

# t-values, using the same column prefixes as the standard errors.
for term_idx, term in enumerate(term_names):
    prefix = 'intercept' if term_idx == 0 else term
    df[prefix + '_TV'] = mgwr_results.tvalues[:, term_idx]

# Attach the point geometry of each observation.
df['geometry'] = funda_data['geometry']

In [15]:
# Write the MGWR table to a GeoPackage, then preview it.
# The target folder is created first so the write does not fail on a fresh
# checkout, and the driver is named explicitly instead of being inferred.
mgwr_out_path = 'data/test/test_mgwr.gpkg'
os.makedirs(os.path.dirname(mgwr_out_path), exist_ok=True)
df.to_file(mgwr_out_path, driver='GPKG')

# Kept as the last expression so the notebook actually displays the preview;
# before, head() ran ahead of the write and its result was discarded.
df.head()