In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to the project's own modules are picked up
# without restarting the kernel.
% load_ext autoreload
% autoreload 2


In [2]:
import glob
import os
import numpy as np
import pandas as pd
from collections import defaultdict
import datetime
import src.mex_helper as mex
import folium
from src.utils.map_vis import time_slider_choropleth, geojson_per_row

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
import src.utils.gis as gis
from src.features import *

In [5]:
def grid_avgerage(tw_avg, t2g):
    """Aggregate per-tower hourly values onto grids using tower-to-grid weights.

    Parameters
    ----------
    tw_avg : pandas.DataFrame
        Indexed by tower id, with one column per hour named '0' .. '23'.
    t2g : pandas.DataFrame
        Tower-to-grid mapping holding the columns 'gtid' (tower id matched
        against the index of ``tw_avg``), 'grid' and 'weight'.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'grid' value. Each hour column holds the sum of
        ``value * weight`` over the towers mapped to that grid. Any other
        column present in ``t2g`` is summed per grid as well.
    """
    hour_cols = [str(hour) for hour in range(24)]

    # Left join: every (tower, grid) pair of t2g is kept, and towers without
    # a row in tw_avg simply carry NaN in the hour columns.
    merged = t2g.merge(tw_avg, how='left', left_on='gtid', right_index=True)

    # Scale all hour columns by the row's weight in a single row-wise multiply.
    merged[hour_cols] = merged[hour_cols].mul(merged['weight'], axis=0)

    # sum() skips NaN, so a grid whose towers have no data ends up as 0.
    return merged.drop(['gtid', 'weight'], axis=1).groupby('grid').sum()

In [6]:
# Per-tower hourly statistics for the 'out' direction, split into the
# 'wd' and 'wk' groups (weekday / weekend, going by the message printed below).
aver_out = mex.stat_tw_dow_aver_hr_uniq_user('out')

# Transposed so that the row index can be compared with mex_t2g.gtid.
tw_avg_wd, tw_avg_wk = (pd.DataFrame(aver_out[dow]).T for dow in ('wd', 'wk'))

# Tower-to-grid mapping; alternative area set: mex.tower2grid('cities', 1000)
mex_t2g = mex.tower2grid('urban_areas_16', 1000)

# Count the mapped towers that never show up in the 'wd' / 'wk' tables.
print(
    'number of towers in cities has no call at all during weekday and weekend',
    *(len(set(mex_t2g.gtid) - set(tw_avg.index)) for tw_avg in (tw_avg_wd, tw_avg_wk))
)


loading cached tw average stats/stat_tw_dow_aver_hr_uniq_user-out.pickle
reading existing t2g file: data/mex_tower/mex_t2g_urban_areas_16_1000m.csv
number of towers in cities has no call at all during weekday and weekend 146 146


In [7]:
# Grid-level tables: index = grid, columns = hour, values = weighted sum of
# the per-tower averages. Built once for 'wd' and once for 'wk'.
g_avg_wd, g_avg_wk = (
    grid_avgerage(tw_avg, mex_t2g) for tw_avg in (tw_avg_wd, tw_avg_wk)
)

In [10]:
%%time
# Feature table computed from the 'wd' grid averages for the 'urban_areas_16'
# area set. urban_dilatation_index is not defined in this notebook; presumably
# it comes from the star import of src.features (TODO confirm).
dv_cities = urban_dilatation_index(g_avg_wd,'urban_areas_16', 'urban')

reading existing grids
CPU times: user 1.79 s, sys: 4 ms, total: 1.8 s
Wall time: 1.79 s


In [9]:
%%time
# Two tables derived from the 'wd' grid averages for the same area set.
# hotspot_stats is not defined in this notebook; presumably it comes from the
# star import of src.features (TODO confirm). n_hotspot_cities is later
# sliced by column position, so its column order matters.
n_hotspot_cities, hotspot_stats_cities = hotspot_stats(g_avg_wd,'urban_areas_16', 'urban')

reading existing grids
CPU times: user 2.5 s, sys: 24 ms, total: 2.53 s
Wall time: 2.52 s


In [11]:
# Put the three feature tables side by side; rows are matched on their index.
features = pd.concat(
    (dv_cities, n_hotspot_cities, hotspot_stats_cities),
    axis='columns',
)

In [12]:
# Crime table; the first CSV column becomes the row index, which later cells
# use to select and order the feature rows.
crimes = pd.read_csv(
    'data/qbased_crime_mexico.csv',
    index_col=0,
)

In [13]:
from scipy.stats import pearsonr, spearmanr, kendalltau

In [14]:
# Correlate every feature column with total crime (Pearson, Spearman, Kendall).
#
# The statistics are computed on bare arrays, which carry no row labels, so
# the pairing between a feature value and a crime value is purely positional.
# Reorder the feature rows to follow crimes.index first; otherwise values from
# different rows could be paired whenever the two tables are sorted differently.
features_by_crime_index = features.loc[crimes.index]
total_crime = crimes.total_crime.values

sig = {}
for c in features_by_crime_index:
    feature_values = features_by_crime_index[c].values
    pear_r, pear_p = pearsonr(feature_values, total_crime)
    spea_r, spea_p = spearmanr(feature_values, total_crime)
    kend_r, kend_p = kendalltau(feature_values, total_crime)
    sig[c] = {'pear_r': pear_r, 'pear_p':pear_p, 'spea_r': spea_r, 'spea_p':spea_p,  'kend_r': kend_r, 'kend_p':kend_p}

# One row per feature, one column per statistic / p-value.
sig = pd.DataFrame(sig).T

In [15]:
# Keep only the features whose Kendall tau p-value is below 0.1.
sig.loc[sig['kend_p'] < 0.1]

Unnamed: 0,kend_p,kend_r,pear_p,pear_r,spea_p,spea_r
n_int,0.093839,0.314965,0.339695,0.255419,0.070352,0.463815


In [16]:
import statsmodels.api as sm

In [17]:
# Select and order the feature rows by the labels of crimes.index. The name is
# overwritten in place, but re-running the cell selects the same labels again.
# NOTE(review): this happens after the correlation cell above has already run.
features = features.loc[crimes.index]

In [18]:
# Regressors: hotspot-count columns at positions 8..18 (11 columns), with rows
# taken in the order of crimes.index so they line up with the response below.
X = (
    n_hotspot_cities
    .loc[crimes.index]
    .iloc[:, 8:19]
    .values
)
# Response: total crime, in the natural order of the crimes table.
Y = crimes.total_crime.values

# No intercept column is appended (sm.add_constant(X) is left disabled), and
# statsmodels OLS does not add one itself, so this is a fit through the origin.
# NOTE(review): confirm that is intended before reading the R-squared.
model = sm.OLS(endog=Y, exog=X)
results = model.fit()

results.summary()

  "anyway, n=%i" % int(n))


0,1,2,3
Dep. Variable:,y,R-squared:,0.952
Model:,OLS,Adj. R-squared:,0.845
Method:,Least Squares,F-statistic:,8.921
Date:,"Mon, 21 Jan 2019",Prob (F-statistic):,0.0128
Time:,19:49:40,Log-Likelihood:,23.408
No. Observations:,16,AIC:,-24.82
Df Residuals:,5,BIC:,-16.32
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0105,0.038,0.274,0.795,-0.088,0.109
x2,0.0668,0.062,1.071,0.333,-0.094,0.227
x3,-0.1407,0.087,-1.619,0.166,-0.364,0.083
x4,-0.0460,0.121,-0.380,0.719,-0.357,0.265
x5,0.2056,0.268,0.768,0.477,-0.483,0.894
x6,-0.0978,0.205,-0.478,0.653,-0.624,0.428
x7,-0.0907,0.145,-0.625,0.559,-0.464,0.282
x8,0.0679,0.061,1.105,0.319,-0.090,0.226
x9,0.0171,0.091,0.188,0.858,-0.218,0.252

0,1,2,3
Omnibus:,2.02,Durbin-Watson:,1.434
Prob(Omnibus):,0.364,Jarque-Bera (JB):,1.605
Skew:,0.686,Prob(JB):,0.448
Kurtosis:,2.274,Cond. No.,1690.0
