In [1]:
import pandas as pd # Manejo de bases de datos
import geopandas as gpd # Manejo de bases de datos geográficas
import numpy as np # Funciones numéricas
import matplotlib.pyplot as plt # Gráficas
import seaborn as sns # Gráficas
import datetime as dt
import folium
import unicodedata
import datetime as dt

from math import ceil
from geopandas.tools import sjoin
from unicodedata import normalize

## Read Bogotá spatial characteristics by city block

In [3]:
# Load the pickled city-block table and wrap it as a GeoDataFrame.
# NOTE(review): the path is absolute and machine-specific, and pd.read_pickle
# should only ever be pointed at trusted files.
path = '/home/ubuntu/javeriana/MOTUS-PUJ/Step_2/1_spatial/Outputs/ManzanasGDF.gzip'
spatial_Bog = pd.read_pickle(path, compression='gzip')
spatial_Bog = gpd.GeoDataFrame(spatial_Bog)
# Reproject in place to EPSG:4326
spatial_Bog.to_crs(epsg=4326, inplace=True)
# Drop the columns that are not used in the rest of the notebook
spatial_Bog.drop(columns={'OBJECTID', 'GLOBALID', 'Shape_Leng', 'Shape_Area', 'SECCODIGO'}, inplace=True)

spatial_Bog.head(2)

Unnamed: 0,MANCODIGO,geometry,Hosp_Point,N_Hosp,IPS_Point,N_IPS,Col_Point,N_Col,PlazMer_Point,N_PlazMer,ITur_Point,N_ITur,SITP_Point,N_SITP,Ecomer_Point,N_Ecomer
0,1101001,"POLYGON ((-74.08180 4.58640, -74.08180 4.58640...",[],0,[],0,[],0,[],0,[],0,0,0,"[POINT (-74.0818740288 4.58465360735), POINT (...",11
1,1101002,"POLYGON ((-74.08168 4.58489, -74.08168 4.58490...",[],0,[],0,[],0,[],0,[],0,[POINT (-74.08057397553043 4.585286425284456)],1,"[POINT (-74.08039633520001 4.583938352930001),...",24


In [4]:
# Calculate every city block area on a reprojected copy, so that spatial_Bog
# itself keeps its EPSG:4326 geometry.
spatial_Bog_area = spatial_Bog.copy()
spatial_Bog_area.to_crs(epsg=3395, inplace=True)
# NOTE(review): the buffer distance is expressed in the units of EPSG:3395
# (presumably metres), which makes 0.00007 a negligible offset — confirm
# whether this is only meant as a geometry clean-up (i.e. buffer(0)).
spatial_Bog_area['geometry'] = spatial_Bog_area['geometry'].buffer(0.00007)

# Dividing by 10**6 converts the area to km2 (assuming the CRS units are metres)
spatial_Bog_area['Area_Km2'] = spatial_Bog_area['geometry'].area/(10**6)
area = spatial_Bog_area['Area_Km2'].values.tolist()

# The values are copied back positionally (plain list), not aligned by index
spatial_Bog['Area_km2'] = area

# Keep only the block code, area, geometry and the N_* count columns
spatial_Bog = spatial_Bog[ ['MANCODIGO', 'Area_km2' ,'geometry', 'N_Hosp', 'N_IPS', 'N_Col', 'N_PlazMer',
                           'N_ITur', 'N_SITP', 'N_Ecomer',] ]

# Release the temporaries; re-running this cell therefore recreates them first
del spatial_Bog_area, area
spatial_Bog.head(2)

Unnamed: 0,MANCODIGO,Area_km2,geometry,N_Hosp,N_IPS,N_Col,N_PlazMer,N_ITur,N_SITP,N_Ecomer
0,1101001,0.012103,"POLYGON ((-74.08180 4.58640, -74.08180 4.58640...",0,0,0,0,0,0,11
1,1101002,0.022616,"POLYGON ((-74.08168 4.58489, -74.08168 4.58490...",0,0,0,0,0,1,24


### Read UTAM (Bogotá administrative division)

In [5]:
## Load UTAM file (Bogotá administrative division) Base DF from clustering
# NOTE(review): absolute, machine-specific path; only unpickle trusted files.
path = "/home/ubuntu/javeriana/MOTUS-PUJ/Step_2/1_spatial/Outputs/partial_spatial_utam.pkl"

utam_bog = pd.read_pickle(path, compression='gzip')
utam_bog = gpd.GeoDataFrame(utam_bog)
# Declare the CRS of the stored coordinates as EPSG:4326 ...
utam_bog.crs = 'EPSG:4326'
# ... NOTE(review): reprojecting to the CRS that was just declared looks redundant — confirm.
utam_bog.to_crs(epsg=4326, inplace=True)
utam_bog.head(2)

Unnamed: 0,LOCNombre,PopLoc,UTAM,LOCid,ESTRATOPre,HOGARES,UTAMNombre,UTAMArea,PopDen[p/km2],geometry,originated Trips,received Trips,N_Hosp,N_IPS,N_Col,N_PlazMer,N_ITur,N_SITP,N_Ecomer
0,usaquen,571268.0,1,1,6.0,913.0,PASEO DE LOS LIBERTADORES,6301165.0,540.118849,"POLYGON ((-74.02609 4.82311, -74.02571 4.82311...",163,166,0,0,0,0,0,2,416
1,suba,1252675.0,2,11,6.0,891.0,LA ACADEMIA,6711816.0,527.173778,"POLYGON ((-74.03849 4.79952, -74.04214 4.77702...",287,286,0,1,13,0,0,12,73


In [6]:
# Total trips per UTAM = originated + received trips.
utam_bog['total_trips'] = utam_bog['originated Trips'] + utam_bog['received Trips']
# Drop the two trip columns that were just combined and the UTAM-level N_* counts
# (the city-block table carries its own columns with these names).
# NOTE: this cell mutates utam_bog in place, so it cannot be run twice in a row.
utam_bog.drop(columns={'N_PlazMer', 'N_ITur', 'N_SITP', 'N_Ecomer', 'N_Hosp', 'N_IPS', 'N_Col',
                      'originated Trips', 'received Trips'}, inplace=True)

### Perform spatial join between UTAM and city blocks

In [7]:
# Spatial join: attach to every city block the attributes of the UTAM polygon
# that contains it. how="left" keeps blocks without a containing UTAM; their
# UTAM columns come back as NaN.
# geopandas renamed the `op` keyword to `predicate` and later removed `op`, so
# try the current keyword first and fall back to the old one on older installs.
try:
    join_spatial_utam = sjoin(spatial_Bog, utam_bog, how="left", predicate="within")
except TypeError:
    join_spatial_utam = sjoin(spatial_Bog, utam_bog, how="left", op="within")
join_spatial_utam.head(2)

Unnamed: 0,MANCODIGO,Area_km2,geometry,N_Hosp,N_IPS,N_Col,N_PlazMer,N_ITur,N_SITP,N_Ecomer,...,LOCNombre,PopLoc,UTAM,LOCid,ESTRATOPre,HOGARES,UTAMNombre,UTAMArea,PopDen[p/km2],total_trips
0,1101001,0.012103,"POLYGON ((-74.08180 4.58640, -74.08180 4.58640...",0,0,0,0,0,0,11,...,,,,,,,,,,
1,1101002,0.022616,"POLYGON ((-74.08168 4.58489, -74.08168 4.58490...",0,0,0,0,0,1,24,...,,,,,,,,,,


In [8]:
# Drop city blocks that are not within any UTAM in Bogotá.
# NOTE(review): how='any' removes a row when ANY column is NaN, not only the
# columns produced by the join — confirm no matched block is lost this way.
join_spatial_utam = join_spatial_utam.dropna(how='any', axis=0)
# Remove the join bookkeeping column and the attributes not used downstream
join_spatial_utam.drop(columns={'index_right', 'HOGARES', 'UTAMArea', 'UTAMNombre',
                               'N_Col', 'N_PlazMer', 'LOCNombre'}, inplace=True)

# Store the locality id with the smallest integer dtype; it is used later as a list position
join_spatial_utam['LOCid'] = pd.to_numeric(join_spatial_utam['LOCid'], downcast='integer')
join_spatial_utam.head(3)

Unnamed: 0,MANCODIGO,Area_km2,geometry,N_Hosp,N_IPS,N_ITur,N_SITP,N_Ecomer,PopLoc,UTAM,LOCid,ESTRATOPre,PopDen[p/km2],total_trips
2,1101003,0.016304,"POLYGON ((-74.08202 4.58425, -74.08201 4.58427...",0,0,0,0,10,401060.0,33.0,4,3.0,19061.108312,1898.0
3,1101004,0.006066,"POLYGON ((-74.08215 4.58382, -74.08215 4.58382...",0,0,0,0,5,401060.0,33.0,4,3.0,19061.108312,1898.0
4,1101005,0.00995,"POLYGON ((-74.08223 4.58338, -74.08223 4.58338...",0,0,0,1,12,401060.0,33.0,4,3.0,19061.108312,1898.0


In [9]:
# Recompute one population density per locality: the locality population is
# spread over the summed area of the city blocks that survived the join, so
# that 100% of Bogotá's population is assigned to those blocks.
Loc_code = list(range(1, 20)) # ids for each locality (1..19)
Loc_Density = []
for loc_id in Loc_code:
    blocks_in_loc = join_spatial_utam[join_spatial_utam['LOCid'] == loc_id].reset_index(drop=True)
    # PopLoc is read from the first block of the locality
    loc_population = blocks_in_loc.loc[0, 'PopLoc']
    # Total (not mean) area of the locality's blocks, in km2
    loc_area_km2 = sum(blocks_in_loc['Area_km2'].values.tolist())
    Loc_Density.append(loc_population / loc_area_km2)
    
#Function to adjust density
def AdjustDensity(Loc_Density, LOCid):
    """Look up the density of a locality.

    Parameters
    ----------
    Loc_Density : sequence
        Densities ordered by locality id (position 0 holds locality 1).
    LOCid : int
        1-based locality id.

    Returns
    -------
    The element stored at position ``LOCid - 1``.
    """
    return Loc_Density[LOCid - 1]

# Replace the density of every block with the recomputed density of its locality
join_spatial_utam['PopDen[p/km2]'] = join_spatial_utam['LOCid'].map(
    lambda loc_id: AdjustDensity(Loc_Density, loc_id)
)
join_spatial_utam.head(3)

Unnamed: 0,MANCODIGO,Area_km2,geometry,N_Hosp,N_IPS,N_ITur,N_SITP,N_Ecomer,PopLoc,UTAM,LOCid,ESTRATOPre,PopDen[p/km2],total_trips
2,1101003,0.016304,"POLYGON ((-74.08202 4.58425, -74.08201 4.58427...",0,0,0,0,10,401060.0,33.0,4,3.0,27476.307416,1898.0
3,1101004,0.006066,"POLYGON ((-74.08215 4.58382, -74.08215 4.58382...",0,0,0,0,5,401060.0,33.0,4,3.0,27476.307416,1898.0
4,1101005,0.00995,"POLYGON ((-74.08223 4.58338, -74.08223 4.58338...",0,0,0,1,12,401060.0,33.0,4,3.0,27476.307416,1898.0


In [10]:
# Estimate the number of people living in every city block:
# block area (km2) times the density of the locality the block belongs to.

join_spatial_utam['ppl_block'] = join_spatial_utam['Area_km2']*join_spatial_utam['PopDen[p/km2]']

# Reorder the columns (the selection also fixes which columns are kept)
join_spatial_utam = join_spatial_utam[ ['MANCODIGO', 'ppl_block', 'Area_km2', 'geometry', 'N_Hosp', 'N_IPS', 'N_ITur',
                                       'N_SITP', 'N_Ecomer', 'UTAM', 'LOCid', 'PopLoc' ,'ESTRATOPre',
                                       'PopDen[p/km2]', 'total_trips'] ]

join_spatial_utam.head(2)

Unnamed: 0,MANCODIGO,ppl_block,Area_km2,geometry,N_Hosp,N_IPS,N_ITur,N_SITP,N_Ecomer,UTAM,LOCid,PopLoc,ESTRATOPre,PopDen[p/km2],total_trips
2,1101003,447.984151,0.016304,"POLYGON ((-74.08202 4.58425, -74.08201 4.58427...",0,0,0,0,10,33.0,4,401060.0,3.0,27476.307416,1898.0
3,1101004,166.664518,0.006066,"POLYGON ((-74.08215 4.58382, -74.08215 4.58382...",0,0,0,0,5,33.0,4,401060.0,3.0,27476.307416,1898.0


In [11]:
# Number of city blocks that were discarded because they are not inside any UTAM
len(spatial_Bog) - len(join_spatial_utam)

2758

In [12]:
# Work on an independent copy of the joined frame from here on.
spatial_block = join_spatial_utam.copy()
# Cast locality id to integer (the same cast was already applied to join_spatial_utam)
spatial_block['LOCid'] = pd.to_numeric(spatial_block['LOCid'], downcast='integer')
# Renumber the rows 0..n-1 now that unmatched blocks are gone
spatial_block.reset_index(drop=True, inplace=True)
# NOTE: deleting the source frame means the earlier cells must be re-run before this one can be repeated
del join_spatial_utam

spatial_block.head(2)

Unnamed: 0,MANCODIGO,ppl_block,Area_km2,geometry,N_Hosp,N_IPS,N_ITur,N_SITP,N_Ecomer,UTAM,LOCid,PopLoc,ESTRATOPre,PopDen[p/km2],total_trips
0,1101003,447.984151,0.016304,"POLYGON ((-74.08202 4.58425, -74.08201 4.58427...",0,0,0,0,10,33.0,4,401060.0,3.0,27476.307416,1898.0
1,1101004,166.664518,0.006066,"POLYGON ((-74.08215 4.58382, -74.08215 4.58382...",0,0,0,0,5,33.0,4,401060.0,3.0,27476.307416,1898.0


In [13]:
# Calculate the population density of every block.
# ppl_block was built as Area_km2 * PopDen[p/km2], so this quotient reproduces
# the locality-level density for every block of that locality.
spatial_block['BlockPopDen'] = (spatial_block['ppl_block'])/(spatial_block['Area_km2'])
spatial_block.drop(columns={'PopDen[p/km2]'} ,inplace=True) # no longer needed once BlockPopDen exists

# Reorder the columns
spatial_block = spatial_block[ ['MANCODIGO', 'ppl_block', 'Area_km2', 'BlockPopDen' ,'geometry', 'N_Hosp', 
                                'N_IPS','N_ITur', 'N_SITP', 'N_Ecomer', 'UTAM', 'LOCid', 'ESTRATOPre', 
                                'total_trips', 'PopLoc'] ]

spatial_block.head(3)

Unnamed: 0,MANCODIGO,ppl_block,Area_km2,BlockPopDen,geometry,N_Hosp,N_IPS,N_ITur,N_SITP,N_Ecomer,UTAM,LOCid,ESTRATOPre,total_trips,PopLoc
0,1101003,447.984151,0.016304,27476.307416,"POLYGON ((-74.08202 4.58425, -74.08201 4.58427...",0,0,0,0,10,33.0,4,3.0,1898.0,401060.0
1,1101004,166.664518,0.006066,27476.307416,"POLYGON ((-74.08215 4.58382, -74.08215 4.58382...",0,0,0,0,5,33.0,4,3.0,1898.0,401060.0
2,1101005,273.385401,0.00995,27476.307416,"POLYGON ((-74.08223 4.58338, -74.08223 4.58338...",0,0,0,1,12,33.0,4,3.0,1898.0,401060.0


In [14]:
# Save the base DataFrame for classification as a gzip-compressed pickle.
# NOTE(review): absolute, machine-specific output path.
spatial_block.to_pickle('/home/ubuntu/javeriana/MOTUS-PUJ/Step_2/2_ML_Preps/Outputs/partial_spatial_block.pkl', compression='gzip')

### Active Cases Count

In [16]:
# Load the file with reported cases; the LOCALIDAD_ASIS column holds the case counts
fechas_df = pd.read_csv('/home/ubuntu/javeriana/MOTUS-PUJ/Step_1/Outputs/fechas_sintomas.csv')
fechas_df = fechas_df.rename(columns={'LOCALIDAD_ASIS': 'CASOS_REPORTADOS'})

# Parse the symptom-onset date strings and order the rows chronologically
fechas_df['FECHA_DE_INICIO_DE_SINTOMAS'] = pd.to_datetime(fechas_df['FECHA_DE_INICIO_DE_SINTOMAS'],
                                                          format='%Y-%m-%d')
fechas_df = (fechas_df
             .sort_values(by='FECHA_DE_INICIO_DE_SINTOMAS', ascending=True)
             .reset_index(drop=True))

# Codes 0 and 21 (outside Bogotá / no data) are removed: an infection density
# cannot be computed for them because their reference population is unknown.
outside_bogota = fechas_df['CODIGO_LOCALIDAD'].isin([0, 21])
fechas_df = fechas_df[~outside_bogota].reset_index(drop=True)

In [17]:
com_index = len(fechas_df) # number of rows
# fechas_df is sorted by date, so the first and last rows bound the reported period
start_date = fechas_df.loc[0, 'FECHA_DE_INICIO_DE_SINTOMAS'] # first reported date
end_date = fechas_df.loc[com_index-1, 'FECHA_DE_INICIO_DE_SINTOMAS'] # last reported date

# Days elapsed between the first and the last reported date (x axis of the grid)
pan_days = int((end_date - start_date).days)
local_bog = 20 # number of grid rows (y axis); codes 0 and 21 are not considered

# Grid of reported cases: row = locality code - 1, column = days since start_date
# (column 0 is start_date itself, column 1 the following day, and so on).
grid = np.zeros([local_bog, pan_days+1])

#Method used to fill grid with reported cases
def fillGrid(date, code, cases):
    """Store ``cases`` in the module-level ``grid``.

    The row is ``code - 1`` and the column is the number of days between
    ``date`` and the module-level ``start_date``. An existing value in that
    cell is overwritten. Returns ``None``.
    """
    day_offset = int((date - start_date).days)
    grid[code-1][day_offset] = cases
    

# Fill the grid with the reported cases, one fillGrid call per row of fechas_df.
# apply is used only for its side effect; fillGrid returns None, which is why
# the cell displays a Series of None values.
fechas_df.apply(lambda row: fillGrid(row['FECHA_DE_INICIO_DE_SINTOMAS'], row['CODIGO_LOCALIDAD'], int(row['CASOS_REPORTADOS']) ), axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
11293    None
11294    None
11295    None
11296    None
11297    None
Length: 11298, dtype: object

In [18]:
# Function used to count active cases
def ActiveCases(date, code):
    """Return the active cases of a locality on a given date.

    Active cases are the reported cases stored in the module-level ``grid``,
    summed over the column of ``date`` and the 15 columns before it. The
    window is clipped at the first column of the grid.

    Parameters
    ----------
    date : datetime-like
        Day of interest; its offset in days from the module-level
        ``start_date`` selects the grid column.
    code : int
        1-based locality code; row ``code - 1`` of ``grid`` is read.

    Returns
    -------
    int
        Sum over the window, or 0 when ``date`` is earlier than ``start_date``.
    """
    col = int((date - start_date).days)
    if col < 0:
        # A negative offset would be read by NumPy as "count from the end" and
        # silently add up unrelated days, so report no cases instead.
        return 0
    # NOTE(review): col-15 .. col inclusive spans 16 days — confirm the intended window length.
    first_col = max(col - 15, 0)
    return int(sum(grid[code-1][first_col:col+1]))

### Estimated Rt for every locality

In [19]:
# Reading localities R_t
base_path = '/home/ubuntu/javeriana/MOTUS-PUJ/Step_1/RT_outputs/'

# One pickle per locality. The order matters: downstream code picks the column
# of a locality by position (LOCid - 1).
loc_R_list = ['usaquen.pkl', 'chapinero.pkl', 'santafe.pkl', 'sancristobal.pkl', 'usme.pkl',
             'tunjuelito.pkl', 'bosa.pkl', 'kennedy.pkl', 'fontibon.pkl', 'engativa.pkl',
             'suba.pkl', 'barriosunidos.pkl', 'teusaquillo.pkl', 'losmartires.pkl', 'antonionariño.pkl',
             'puentearanda.pkl', 'lacandelaria.pkl', 'rafaeluribeuribe.pkl', 'ciudadbolivar.pkl']

# Load every locality frame and expose its index as a regular column
R_list = []
for file_name in loc_R_list:
    loc_rt = pd.read_pickle(base_path + file_name)
    loc_rt.reset_index(drop=False, inplace=True)
    R_list.append(loc_rt)

# The 'Time Stamp' values of the first locality are used as the shared time axis
R_df = pd.DataFrame(index = R_list[0]['Time Stamp'])
R_df.reset_index(drop=False, inplace=True)

# Values are copied positionally, so every locality frame must have as many
# rows as the first one (pandas raises a ValueError otherwise).
for file_name, loc_rt in zip(loc_R_list, R_list):
    R_df[file_name] = loc_rt['R'].tolist()

# fillna returns a new object; the original call discarded it, so the NaN -> 0
# replacement never took effect. Only the R columns are filled, which leaves
# the 'Time Stamp' column untouched.
R_df[loc_R_list] = R_df[loc_R_list].fillna(0)
R_df.head(5)

Unnamed: 0,Time Stamp,usaquen.pkl,chapinero.pkl,santafe.pkl,sancristobal.pkl,usme.pkl,tunjuelito.pkl,bosa.pkl,kennedy.pkl,fontibon.pkl,engativa.pkl,suba.pkl,barriosunidos.pkl,teusaquillo.pkl,losmartires.pkl,antonionariño.pkl,puentearanda.pkl,lacandelaria.pkl,rafaeluribeuribe.pkl,ciudadbolivar.pkl
0,2020-02-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.63486,0.0,0.0
1,2020-02-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.513241,24.513231,0.0,0.0,0.0,0.0,0.0,1.825152,0.0,0.0
2,2020-02-28,25.16846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.8892,12.864529,0.0,0.0,0.0,0.0,0.0,1.825152,0.0,0.0
3,2020-02-29,13.411527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.617419,9.256936,9.060387,25.617419,0.0,0.0,0.0,0.0,0.280187,0.0,25.617427
4,2020-03-01,10.052326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.604958,7.168667,6.864376,13.604958,0.0,0.0,0.0,0.0,0.0,0.0,13.604997


### Create specific DF for different dates to perform Classification tests

In [20]:
# Peak contagion dates in Bogotá, written as day/month/year
peak_dates = [
    '01/05/2020', '01/08/2020', '01/11/2020', '10/01/2021',
    '01/03/2021', '01/05/2021', '10/06/2021', '01/09/2021',
]

# Same dates with '-' as separator; used to build column and file names
peak_dates2 = [date_str.replace('/', '-') for date_str in peak_dates]

In [21]:
k = 5 # position in peak_dates / peak_dates2 of the date to process
# Parse the chosen date (day/month/year) into a datetime object
selec_date = dt.datetime.strptime(peak_dates[k], "%d/%m/%Y")

In [22]:
# Active cases of every locality on the selected date, ordered by locality id
Loc_code = list(range(1, 20))
Active_cases = [ActiveCases(selec_date, loc_id) for loc_id in Loc_code]

#### Add active cases and granulate them

In [23]:
# Add the active cases of the locality to every block.
# Work on a copy so spatial_block stays reusable for other dates.
spatial_block_date = spatial_block.copy()

# Active_cases is ordered by locality id, hence the LOCid - 1 position
spatial_block_date['cases_'+peak_dates2[k]] = spatial_block_date.apply(lambda row: Active_cases[row['LOCid']-1], axis=1)

In [24]:
# Granulate active cases
def single_cases(ppl_block, caseLoc, PobLoc, ID):
    """Assign a share of the locality cases to one city block.

    Parameters
    ----------
    ppl_block : number
        Estimated population of the block.
    caseLoc : number
        Cases of the locality.
    PobLoc : number
        Population of the locality.
    ID :
        Accepted but not used.

    Returns
    -------
    The locality incidence in percent (``100 * caseLoc / PobLoc``) applied to
    the block population.
    """
    incidence_pct = 100 * caseLoc / PobLoc
    return incidence_pct * ppl_block / 100

# Estimated cases per block: the locality cases are distributed proportionally
# to the block's share of the locality population (see single_cases).
spatial_block_date['block_cases_'+peak_dates2[k]] = spatial_block_date.apply(lambda row: single_cases(row['ppl_block'], 
                                                                                                  row['cases_'+peak_dates2[k]], 
                                                                                                  row['PopLoc'], row['LOCid']
                                                                                                     ), axis=1)

In [25]:
# Reorder the columns so the block-level cases sit next to the block population
spatial_block_date = spatial_block_date[ ['MANCODIGO', 'ppl_block', 'block_cases_'+peak_dates2[k] ,
                                          'Area_km2', 'BlockPopDen', 'geometry', 'N_Hosp', 'N_IPS', 
                                          'N_ITur', 'N_SITP', 'N_Ecomer', 'UTAM', 'LOCid', 'PopLoc', 
                                          'ESTRATOPre', 'total_trips', 'cases_'+peak_dates2[k]] ]

spatial_block_date.head(2)

Unnamed: 0,MANCODIGO,ppl_block,block_cases_01-05-2021,Area_km2,BlockPopDen,geometry,N_Hosp,N_IPS,N_ITur,N_SITP,N_Ecomer,UTAM,LOCid,PopLoc,ESTRATOPre,total_trips,cases_01-05-2021
0,1101003,447.984151,2.455167,0.016304,27476.307416,"POLYGON ((-74.08202 4.58425, -74.08201 4.58427...",0,0,0,0,10,33.0,4,401060.0,3.0,1898.0,2198
1,1101004,166.664518,0.913401,0.006066,27476.307416,"POLYGON ((-74.08215 4.58382, -74.08215 4.58382...",0,0,0,0,5,33.0,4,401060.0,3.0,1898.0,2198


In [26]:
# Sanity check: the block-level cases of a locality, added up, must give back
# the locality's total cases. Despite its name, mean_cases holds sums.
cases_col = 'block_cases_'+peak_dates2[k]
mean_cases = [
    sum(spatial_block_date.loc[spatial_block_date['LOCid'] == loc_id, cases_col].values.tolist())
    for loc_id in Loc_code
]

In [27]:
np.array(mean_cases) # per-locality sums of block-level cases, for visual comparison with Active_cases

array([ 5064.,  1857.,   990.,  2198.,  2143.,  1386.,  5140.,  9298.,
        3434.,  7785., 10413.,  1632.,  1817.,   686.,   746.,  2788.,
         345.,  2771.,  3742.])

In [28]:
np.array(Active_cases) # locality totals the per-block sums above should match

array([ 5064,  1857,   990,  2198,  2143,  1386,  5140,  9298,  3434,
        7785, 10413,  1632,  1817,   686,   746,  2788,   345,  2771,
        3742])

#### Add and granulate Rt score

In [29]:
# Select the row of R_df whose 'Time Stamp' equals the chosen date
rt_list = R_df[ R_df['Time Stamp'] == selec_date ]
# Skip the first column and keep the remaining values of the first matching row as a list.
# NOTE(review): [0] raises IndexError when selec_date is not present in R_df.
rt_list = rt_list.iloc[:, 1:].values.tolist()[0]

# Add the locality Rt of the chosen date to every block (list position = LOCid - 1)
spatial_block_date['Rt_'+peak_dates2[k]] = spatial_block_date.apply(lambda row: rt_list[row['LOCid']-1], axis=1)
# The locality-level cases and population columns are no longer needed.
# NOTE: in-place drop, so this cell cannot be run twice in a row.
spatial_block_date.drop(columns={'cases_'+peak_dates2[k], 'PopLoc'}, inplace=True)
spatial_block_date.head(3)

Unnamed: 0,MANCODIGO,ppl_block,block_cases_01-05-2021,Area_km2,BlockPopDen,geometry,N_Hosp,N_IPS,N_ITur,N_SITP,N_Ecomer,UTAM,LOCid,ESTRATOPre,total_trips,Rt_01-05-2021
0,1101003,447.984151,2.455167,0.016304,27476.307416,"POLYGON ((-74.08202 4.58425, -74.08201 4.58427...",0,0,0,0,10,33.0,4,3.0,1898.0,1.042084
1,1101004,166.664518,0.913401,0.006066,27476.307416,"POLYGON ((-74.08215 4.58382, -74.08215 4.58382...",0,0,0,0,5,33.0,4,3.0,1898.0,1.042084
2,1101005,273.385401,1.498282,0.00995,27476.307416,"POLYGON ((-74.08223 4.58338, -74.08223 4.58338...",0,0,0,1,12,33.0,4,3.0,1898.0,1.042084


In [30]:
# Scale the features on a copy that is only used to build the block-level Rt
spatial_block_date_rt = spatial_block_date.copy()

# Every feature is divided by 4 times its own maximum ...
# NOTE(review): the rationale for the factors 4 and 3 is not documented here.
for feature in ['ESTRATOPre', 'BlockPopDen', 'N_Hosp', 'N_IPS', 'N_ITur',
                'N_SITP', 'N_Ecomer', 'total_trips']:
    spatial_block_date_rt[feature] = spatial_block_date_rt[feature]/(spatial_block_date_rt[feature].max()*4)

# ... except the block-level cases, which are divided by 3 times their maximum
cases_col = 'block_cases_'+peak_dates2[k]
spatial_block_date_rt[cases_col] = spatial_block_date_rt[cases_col]/(spatial_block_date_rt[cases_col].max()*3)

In [31]:
#Function used to granulate RT
def singularizeRT(estrato, popden, hosp, ips, itur, sitp, comer, trips, cases, RT):
    """Adjust a locality Rt with the features of one block.

    ``estrato``, ``hosp`` and ``ips`` are subtracted from ``RT``; ``popden``,
    ``itur``, ``sitp``, ``comer``, ``trips`` and ``cases`` are added.
    Returns the adjusted value.
    """
    return RT - estrato + popden - hosp - ips + itur + sitp + comer + trips + cases

In [32]:
# Granulate RT: compute a block-level Rt from the locality Rt and the scaled
# block features (see singularizeRT for the sign of each term).
spatial_block_date_rt['BlockRT_'+peak_dates2[k]] = spatial_block_date_rt.apply(lambda row: singularizeRT(row['ESTRATOPre'], 
                                                                                       row['BlockPopDen'], row['N_Hosp'],
                                                                                       row['N_IPS'], row['N_ITur'],
                                                                                       row['N_SITP'], row['N_Ecomer'],
                                                                                       row['total_trips'], 
                                                                                       row['block_cases_'+peak_dates2[k]],
                                                                                       row['Rt_'+peak_dates2[k]]), axis=1)

In [33]:
# Mean block-level Rt of every locality, to be compared with the locality Rt

mean_rt = []
rt_col = 'BlockRT_'+peak_dates2[k]
for loc_id in Loc_code:
    block_rts = spatial_block_date_rt.loc[spatial_block_date_rt['LOCid'] == loc_id, rt_col].values.tolist()
    mean_rt.append(sum(block_rts)/len(block_rts))

In [34]:
np.array(rt_list) # locality Rt values, for visual comparison with mean_rt

array([1.00005381, 1.31706153, 1.45197602, 1.04208374, 1.55610715,
       1.48157127, 1.52465447, 1.29030878, 1.51353866, 1.50776765,
       1.46082761, 0.96294596, 1.40245065, 1.26039779, 1.45904109,
       1.31570456, 0.97931723, 1.30268714, 1.64291863])

In [35]:
np.array(mean_rt) # mean block-level Rt per locality

array([1.03776118, 1.39815139, 1.58746401, 1.18819917, 1.73992754,
       1.66028235, 1.77825854, 1.44297256, 1.56025244, 1.6251941 ,
       1.57392495, 1.01005248, 1.4150649 , 1.34534605, 1.55464515,
       1.36677312, 1.09205717, 1.46156482, 1.8350358 ])

In [36]:
# Mean Square Error function
def ECM(R_Bog, R_Al):
    """Return the mean squared difference between two equally long sequences.

    The squared differences ``(R_Al[i] - R_Bog[i]) ** 2`` are accumulated over
    the positions of ``R_Bog`` and divided by ``len(R_Bog)``.
    """
    squared_error_total = 0
    for position, reference in enumerate(R_Bog):
        deviation = R_Al[position] - reference
        squared_error_total = squared_error_total + deviation ** 2
    return squared_error_total / len(R_Bog)

ECM(mean_rt, rt_list) # MSE between the per-locality mean of the block-level Rt and the locality Rt

0.017158895812402753

In [37]:
# Generate the final normalized DF with an estimated Rt for each city block

# Bring the block-level Rt computed on the scaled copy into the output frame
rt_col = 'BlockRT_'+peak_dates2[k]
spatial_block_date[rt_col] = spatial_block_date_rt[rt_col]

# Divide every feature by its own maximum
for feature in ['ESTRATOPre', 'BlockPopDen', 'N_Hosp', 'N_IPS', 'N_ITur', 'N_SITP',
                'N_Ecomer', 'total_trips', 'block_cases_'+peak_dates2[k], rt_col]:
    spatial_block_date[feature] = spatial_block_date[feature]/(spatial_block_date[feature].max())


In [38]:
# Drop the locality-level Rt column (in place, so this cell cannot be run twice in a row)
spatial_block_date.drop(columns={'Rt_'+peak_dates2[k]}, inplace=True) #drop unwanted col

In [62]:
# Save the DataFrame of the selected date to test classification schemes.
# The file name embeds the date, e.g. 'date_01-05-2021.pkl' for k = 5.
# NOTE(review): absolute, machine-specific output path.
df_file_name = 'date_'+peak_dates2[k]+'.pkl'
spatial_block_date.to_pickle('/home/ubuntu/javeriana/MOTUS-PUJ/Step_2/2_ML_Preps/Test_Classification_DF/'+df_file_name, compression='gzip')