In [1]:
% load_ext autoreload
% autoreload 2


In [2]:
import glob
import os
import numpy as np
import pandas as pd
from collections import defaultdict
import datetime
import src.mex_helper as mex
import folium
from src.utils.map_vis import time_slider_choropleth, geojson_per_row

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
import src.utils.gis as gis
from src.features import *

In [5]:
def grid_avgerage(tw_avg, t2g):
    """Aggregate per-tower hourly averages into weighted per-grid hourly totals.

    Parameters
    ----------
    tw_avg : pandas.DataFrame
        Indexed by tower id, with one column per hour named '0' ... '23'.
    t2g : pandas.DataFrame
        Tower-to-grid mapping with at least the columns 'gtid', 'grid' and 'weight'.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'grid'; every hour column holds the sum over the grid's mapping
        rows of (tower value * weight).
    """
    hour_cols = [str(hour) for hour in range(24)]

    # Left join: mapping rows whose tower never appears in tw_avg keep NaN hours.
    weighted = t2g.merge(tw_avg, left_on='gtid', right_index=True, how='left')
    weighted[hour_cols] = weighted[hour_cols].mul(weighted['weight'], axis=0)

    # sum() skips NaN, so a grid made only of such towers ends up with 0.
    per_row = weighted.drop(['gtid', 'weight'], axis=1)
    return per_row.groupby('grid').sum()

In [6]:
# Per-tower hourly averages for the 'out' direction, keyed by 'wd' and 'wk'
# (weekday / weekend, going by the message printed below). After the transpose,
# each frame has one row per tower.
aver_out = mex.stat_tw_dow_aver_hr_uniq_user('out')
tw_avg_wd = pd.DataFrame(aver_out['wd']).T
tw_avg_wk = pd.DataFrame(aver_out['wk']).T

# Tower-to-grid mapping for the 'cities' extent (1000 is presumably the grid size in metres).
mex_t2g = mex.tower2grid('cities', 1000)

# Towers that appear in the mapping but have no row in the averages.
mapped_towers = set(mex_t2g.gtid)
n_silent_wd = len(mapped_towers - set(tw_avg_wd.index))
n_silent_wk = len(mapped_towers - set(tw_avg_wk.index))
print('number of towers in cities has no call at all during weekday and weekend', n_silent_wd, n_silent_wk)


loading cached tw average stats/stat_tw_dow_aver_hr_uniq_user-out.pickle
reading existing t2g file: data/mex_tower/mex_t2g_cities_1000m.csv
number of towers in cities has no call at all during weekday and weekend 233 234


In [7]:
# Roll the tower-level averages up to grid cells: one row per grid, one column per hour.
g_avg_wd, g_avg_wk = (
    grid_avgerage(tower_avg, mex_t2g) for tower_avg in (tw_avg_wd, tw_avg_wk)
)

In [46]:
%%time
# Urban dilatation index computed from the 'wd' grid averages; slow (about 77 s wall
# time in the recorded run). `urban_dilatation_index` is not defined in this notebook;
# presumably it comes from the `from src.features import *` star import.
dv_cities = urban_dilatation_index(g_avg_wd)

reading existing grids
CPU times: user 1min 4s, sys: 13.3 s, total: 1min 17s
Wall time: 1min 17s


In [39]:
%%time
# Hotspot features computed from the 'wd' grid averages. Returns two objects that are
# later joined column-wise into `features`.
# NOTE(review): `hotspot_stats` is presumably from the `src.features` star import; what
# each returned object contains is not visible here.
n_hotspot_cities, hotspot_stats_cities = hotspot_stats(g_avg_wd)

reading existing grids
CPU times: user 7.58 s, sys: 28 ms, total: 7.61 s
Wall time: 7.6 s


In [49]:
# Single feature table: the three blocks placed side by side (axis=1), aligned on their index.
features = pd.concat([dv_cities, n_hotspot_cities, hotspot_stats_cities],axis=1)

In [52]:
# Crime table; the first CSV column becomes the index, which is later used to select
# and order rows of the feature tables via `.loc[crimes.index]`.
crimes = pd.read_csv('data/qbased_crime_mexico.csv', index_col=0)

In [80]:
from scipy.stats import pearsonr, spearmanr, kendalltau

In [81]:
# Correlate every feature column with total crime (Pearson, Spearman, Kendall).
# The comparison is positional (`.values`), so the feature rows are first selected and
# ordered by `crimes.index`; this keeps the cell correct on a fresh top-to-bottom run,
# independent of whether `features` has already been realigned elsewhere.
features_aligned = features.loc[crimes.index]
total_crime = crimes.total_crime.values

sig_by_feature = {}
for c in features_aligned:
    feature_values = features_aligned[c].values
    pear_r, pear_p = pearsonr(feature_values, total_crime)
    spea_r, spea_p = spearmanr(feature_values, total_crime)
    kend_r, kend_p = kendalltau(feature_values, total_crime)
    sig_by_feature[c] = {'pear_r': pear_r, 'pear_p': pear_p, 'spea_r': spea_r, 'spea_p': spea_p, 'kend_r': kend_r, 'kend_p': kend_p}
# One row per feature, one column per coefficient / p-value.
sig = pd.DataFrame(sig_by_feature).T

In [82]:
# Show only the features whose Kendall-tau p-value is below 0.1.
sig[sig.kend_p<0.1]

Unnamed: 0,kend_p,kend_r,pear_p,pear_r,spea_p,spea_r
nhot_20,0.078805,0.326363,0.0793,0.451325,0.089267,0.438558
nhot_21,0.094513,0.312261,0.145895,0.380569,0.123537,0.401182
n_med,0.086467,0.319339,0.22686,0.320056,0.116742,0.407953


In [54]:
import statsmodels.api as sm

In [57]:
# Select and order the feature rows by the labels in `crimes.index`, so that features
# and crimes line up row by row for positional (`.values`) comparisons.
features = features.loc[crimes.index]

In [113]:
# OLS of total crime on the hotspot columns at positions 8..18, rows in crime-table order.
# NOTE: `sm.add_constant` is not applied and `sm.OLS` adds no intercept on its own, so
# the model is fitted through the origin and the reported R-squared is the uncentered one.
hotspot_rows = n_hotspot_cities.loc[crimes.index]
X = hotspot_rows.iloc[:, 8:19].values
Y = crimes.total_crime.values
model = sm.OLS(Y, X)
results = model.fit()

results.summary()

  "anyway, n=%i" % int(n))


0,1,2,3
Dep. Variable:,y,R-squared:,0.993
Model:,OLS,Adj. R-squared:,0.978
Method:,Least Squares,F-statistic:,64.3
Date:,"Thu, 10 Jan 2019",Prob (F-statistic):,0.000117
Time:,11:31:42,Log-Likelihood:,38.869
No. Observations:,16,AIC:,-55.74
Df Residuals:,5,BIC:,-47.24
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0172,0.006,3.036,0.029,0.003,0.032
x2,-0.0485,0.025,-1.978,0.105,-0.112,0.015
x3,0.0269,0.032,0.851,0.434,-0.054,0.108
x4,0.0315,0.032,0.977,0.373,-0.051,0.114
x5,-0.1824,0.034,-5.338,0.003,-0.270,-0.095
x6,0.0346,0.026,1.339,0.238,-0.032,0.101
x7,0.2448,0.039,6.198,0.002,0.143,0.346
x8,-0.0652,0.028,-2.361,0.065,-0.136,0.006
x9,-0.0253,0.032,-0.792,0.464,-0.108,0.057

0,1,2,3
Omnibus:,1.656,Durbin-Watson:,2.054
Prob(Omnibus):,0.437,Jarque-Bera (JB):,1.049
Skew:,-0.613,Prob(JB):,0.592
Kurtosis:,2.738,Cond. No.,1040.0
