In [1]:
!pip install geopandas
!pip install mgwr



In [1]:
import numpy as np
import os
import multiprocessing
# Must stay above the geopandas import: geopandas reads USE_PYGEOS when it is
# imported, and '0' asks it to use Shapely rather than PyGEOS as geometry backend.
os.environ['USE_PYGEOS'] = '0'
import geopandas as gp
# NOTE(review): the same module is imported twice, once plain (above) and once
# under the alias `mp`.
import multiprocessing as mp
from mgwr.gwr import GWR,MGWR
from mgwr.sel_bw import Sel_BW

In [2]:
# Read the Funda data from the GeoPackage file
funda_path = "data/funda_buy_28-03-2023_full.gpkg"
funda_data = gp.read_file(funda_path)

In [3]:
# Preview the first five rows of the Funda data
funda_data.head(n=5)

Unnamed: 0,...1,house_id,house_type,building_type,price,price_m2,room,bedroom,bathroom,living_area,...,ym_list,year_list,descrip,zip,letters,city,addressline_city,addressline_zip,addresszip,geometry
0,1.0,88438538.0,huis,Resale property,515000.0,4327.7,6.0,4.0,0.0,119.0,...,2023-01-01,2023.0,\nHeerlijk wonen aan de rand van het centru...,1791,DD,Den,Den Burg\r\n,1791 DD,"Elemert 20, 1791 DD, Den, Nederland",POINT (115415.102 562934.410)
1,2.0,42954353.0,huis,Resale property,400000.0,3669.7,5.0,4.0,1.0,109.0,...,2022-09-01,2022.0,\nEen unieke woning met garage nabij het st...,1949,CG,Wijk,Wijk aan Zee\r\n,1949 CG,"Burgemeester Rothestraat 88, 1949 CG, Wijk, Ne...",POINT (101584.001 500981.995)
2,3.0,42048957.0,huis,Resale property,449000.0,3805.1,6.0,4.0,1.0,118.0,...,2023-03-01,2023.0,"\nEen fraai uitgevoerde tussenwoning, uitgeb...",1901,TD,Castricum\r\n,Castricum\r\n,1901 TD,"Eerste Groenelaan 54, 1901 TD, Castricum\r\n, ...",POINT (105924.643 507742.212)
3,5.0,42026862.0,huis,Resale property,550000.0,5000.0,4.0,3.0,1.0,110.0,...,2023-02-01,2023.0,\nSchedule a viewing directly online? Go to...,1066,TH,Amsterdam\r\n,Amsterdam\r\n,1066 TH,"Oudenaardeplantsoen 21, 1066 TH, Amsterdam\r\n...",POINT (115144.000 484579.000)
4,6.0,42046726.0,huis,Resale property,695000.0,3948.9,6.0,4.0,1.0,176.0,...,2023-03-01,2023.0,"\nRust, ruimte, privacy en een keurig afwer...",2134,WJ,Hoofddorp\r\n,Hoofddorp\r\n,2134 WJ,"Zandkreek 21, 2134 WJ, Hoofddorp\r\n, Nederland",POINT (103400.326 480554.963)


In [4]:
# Dependent variable: the price column reshaped into an (n, 1) column vector
price_values = funda_data['price'].values
b_y = price_values.reshape(-1, 1)

In [6]:
# Independent variables. The column order matters: it fixes the order in which
# the fitted parameters are extracted later on.
predictor_cols = ['room', 'living_area', 'house_age']
b_X = funda_data[predictor_cols].values

In [7]:
# Coordinates for the model: one (x, y) tuple per row, taken from the point geometry
points = funda_data['geometry']
u, v = points.x, points.y
b_coords = [(x, y) for x, y in zip(u, v)]

In [8]:
# Number of CPUs reported for this machine (reference for choosing the pool size)
mp.cpu_count()

8

In [10]:
# Parallelisation pays off mostly when the data are large and/or the machine
# has many cores.
# mgwr has a soft dependency on numba; install it (pip install numba) if you
# need better performance.
n_proc = 4  # number of worker processes in the pool
# NOTE(review): nothing in the visible cells closes this pool; call
# pool.close() and pool.join() once the models have been fitted.
pool = mp.Pool(processes=n_proc)

In [11]:
%%time
#Run basic GWR in parrallel mode
if __name__ == '__main__':
    gwr_selector = Sel_BW(b_coords, b_y, b_X)
    gwr_bw = gwr_selector.search(pool = pool) #add pool to Sel_BW.search
    print(gwr_bw)
    gwr_results = GWR(b_coords, b_y, b_X, gwr_bw).fit(pool = pool)

51.0
CPU times: total: 2.59 s
Wall time: 47.5 s


In [12]:
# Print the summary report of the fitted GWR model
gwr_results.summary()

Model type                                                         Gaussian
Number of observations:                                                7250
Number of covariates:                                                     4

Global Regression Results
---------------------------------------------------------------------------
Residual sum of squares:                                       1011146779551883.000
Log-likelihood:                                                 -103308.809
AIC:                                                             206625.618
AICc:                                                            206627.627
BIC:                                                           1011146779487475.125
R2:                                                                   0.538
Adj. R2:                                                              0.538

Variable                              Est.         SE  t(Est/SE)    p-value
------------------------------- ---------- -

In [13]:
%%time
#run MGWR in parrallel mode. Note: max_iter_multi needs to be specified
if __name__ == '__main__':
    mgwr_selector = Sel_BW(b_coords, b_y, b_X, multi=True)
    mgwr_bw = mgwr_selector.search(pool=pool, max_iter_multi=5, criterion = "AICc") #add pool to Sel_BW.search
    print(mgwr_bw)
    mgwr_results = MGWR(b_coords, b_y, b_X, selector=mgwr_selector).fit(pool=pool)

Backfitting:   0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [13]:
# Print the summary report of the fitted MGWR model.
# NOTE(review): the saved output of this cell reports 1354 observations and 7
# covariates, whereas the GWR summary earlier reports 7250 observations and 4,
# and the saved output of the MGWR fitting cell ends in KeyboardInterrupt.
# The output shown here looks stale - re-run once the MGWR fit has completed.
mgwr_results.summary()

Model type                                                         Gaussian
Number of observations:                                                1354
Number of covariates:                                                     7

Global Regression Results
---------------------------------------------------------------------------
Residual sum of squares:                                       54226586157784.898
Log-likelihood:                                                  -18449.097
AIC:                                                              36912.194
AICc:                                                             36914.301
BIC:                                                           54226586148071.930
R2:                                                                   0.658
Adj. R2:                                                              0.656

Variable                              Est.         SE  t(Est/SE)    p-value
------------------------------- ---------- -----

  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))


In [14]:
# Rebuild the R-style result table: local coefficients, fitted values and
# residuals, then standard errors and t-values, plus the point geometry.
# Column 0 of params/bse/tvalues is the intercept; columns 1-3 follow the
# order of the predictors used to build b_X.
coef_cols = ['Intercept', 'room', 'living_area', 'house_age']
# the SE / t-value columns use a lower-case 'intercept' prefix
stat_prefixes = ['intercept', 'room', 'living_area', 'house_age']

df = gp.GeoDataFrame()

# local parameter estimates
for idx, col in enumerate(coef_cols):
    df[col] = mgwr_results.params[:, idx]

# fitted values and residuals (observed minus predicted)
df['yhat'] = mgwr_results.predy
df['residual'] = mgwr_results.y.reshape((-1,1)) - mgwr_results.predy

# local standard errors
for idx, prefix in enumerate(stat_prefixes):
    df[f'{prefix}_SE'] = mgwr_results.bse[:, idx]

# local t-values
for idx, prefix in enumerate(stat_prefixes):
    df[f'{prefix}_TV'] = mgwr_results.tvalues[:, idx]

# geometry copied over from the input data
df['geometry'] = funda_data['geometry']

In [15]:
# Write the result table to a GeoPackage, then show its first rows.
# The output folder is created first so the write does not fail when it does
# not exist yet. head() comes last because a notebook only displays the value
# of a cell's final expression (to_file returns nothing to display).
os.makedirs('data/test', exist_ok=True)
df.to_file('data/test/test_mgwr.gpkg', driver='GPKG')
df.head()