**In this notebook:**

- I aggregate the intermediate contact matrix so that it is stratified by age and SEP only, or by age and EDU only
- I compute the analytical constraints for the synthetic expansion

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.gridspec as gridspec
from matplotlib import cm
import math
import random

import geopandas as gp 
from shapely.geometry import Point, LineString

import datetime as dt

import time

import scipy.stats as st

import seaborn as sns

# read population data

In [2]:
# Population table; the cells below read its 'age_group', 'sep_level',
# 'edu_level' and 'population' columns.
data_pop_grouped = pd.read_csv(
    filepath_or_buffer="./../data/pop_size_by_age_SEP_edu.csv"
)

In [3]:
# Map each "age_group, sep_level, edu_level" label to its population.
# The label is built from the named columns with vectorized string
# concatenation: indexing a row with x[0], x[1], x[2] treats integer keys as
# positions on a label-indexed Series, which newer pandas versions deprecate.
strata_labels = (
    data_pop_grouped['age_group'] + ", "
    + data_pop_grouped['sep_level'] + ", "
    + data_pop_grouped['edu_level']
)
dict_pop_full = dict(zip(strata_labels.values, data_pop_grouped['population']))

dict_pop_full

{'0-14, high SEP, high edu': 0.0,
 '0-14, low SEP, high edu': 0.0,
 '0-14, high SEP, low edu': 910215.0,
 '0-14, low SEP, low edu': 413202.0,
 '15-24, high SEP, high edu': 51812.0,
 '15-24, low SEP, high edu': 24823.0,
 '15-24, high SEP, low edu': 552631.0,
 '15-24, low SEP, low edu': 264751.0,
 '25-64, high SEP, high edu': 1687317.3094791055,
 '25-64, low SEP, high edu': 536759.268093747,
 '25-64, high SEP, low edu': 1727216.6905208929,
 '25-64, low SEP, low edu': 938743.7319062528,
 '65+, high SEP, high edu': 349133.05628178443,
 '65+, low SEP, high edu': 111064.1150329906,
 '65+, high SEP, low edu': 799172.9437182155,
 '65+, low SEP, low edu': 427282.8849670094}

In [4]:
# Total population of each age group, summed over all of its SEP/edu strata.
age_totals = data_pop_grouped.groupby('age_group')['population'].sum()
pop_age = {age: total for age, total in zip(age_totals.index, age_totals.values)}
pop_age

{'0-14': 1323417.0,
 '15-24': 894017.0,
 '25-64': 4890036.999999998,
 '65+': 1686653.0}

In [5]:
# Fraction of the overall population that belongs to each age group.
population_by_age = data_pop_grouped.groupby('age_group')['population'].sum()
age_fraction = population_by_age / data_pop_grouped['population'].sum()
distrib_pop = {age: share for age, share in zip(age_fraction.index, age_fraction.values)}
distrib_pop

{'0-14': 0.1504887809178038,
 '15-24': 0.10166072254610012,
 '25-64': 0.5560573173632757,
 '65+': 0.19179317917282043}

# compute intermediate matrices

In [6]:
# Intermediate matrix; the first CSV column is used as the row index.
intermediate_matrix_rec = pd.read_csv(
    "./../output/matrices/intermediate_matrix_rec.csv",
    index_col=0,
)

In [7]:
# Show the loaded matrix: one row per "age, SEP, edu" stratum, one column per age group.
intermediate_matrix_rec

Unnamed: 0_level_0,0-14,15-24,25-64,65+
tag_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"0-14, low SEP, low edu",5.565127,0.518313,3.383215,0.485089
"0-14, low SEP, high edu",,,,
"0-14, high SEP, low edu",5.532796,0.512616,3.384115,0.424194
"0-14, high SEP, high edu",,,,
"15-24, low SEP, low edu",0.923322,3.462194,3.906265,0.967767
"15-24, low SEP, high edu",1.324694,4.332172,6.893188,2.946244
"15-24, high SEP, low edu",0.708418,2.784684,3.386128,0.703637
"15-24, high SEP, high edu",0.230279,2.296604,3.934065,1.18958
"25-64, low SEP, low edu",0.900639,0.639412,3.568599,0.853838
"25-64, low SEP, high edu",0.98043,0.608258,4.201352,1.051674


In [8]:
# Collapse the education dimension: one row per (age group, SEP level).

# The 0-14 age group has zero population in both of its 'high edu' strata, so
# its two 'low edu' rows are taken over unchanged as the SEP-only rows.
sep_only_base = intermediate_matrix_rec.loc[['0-14, low SEP, low edu',
                                             '0-14, high SEP, low edu']].copy()
sep_only_base.index = ['0-14, low SEP', '0-14, high SEP']

# For the other age groups, each (age, SEP) row is the mean of its two
# education strata, weighted by the population of each stratum.
sep_only_rows = [sep_only_base]
for age_group in ['15-24', '25-64', '65+']:
    for sep in ['low SEP', 'high SEP']:
        tags = ['{}, {}, {}'.format(age_group, sep, edu) for edu in ['low edu', 'high edu']]
        weights = pd.Series({t: dict_pop_full[t] for t in tags})

        # Scale every row by its own weight (aligned on the stratum label),
        # then add the rows up column by column.
        weighted_sum = intermediate_matrix_rec.loc[tags].mul(weights, axis=0).sum(axis=0)
        weighted_mean = weighted_sum / weights.sum()

        row_label = '{}, {}'.format(age_group, sep)
        sep_only_rows.append(weighted_mean.rename(row_label).to_frame().T)

# One concatenation at the end instead of growing the frame inside the loop.
intermediate_matrix_rec_SEP_only = pd.concat(sep_only_rows)

intermediate_matrix_rec_SEP_only

Unnamed: 0,0-14,15-24,25-64,65+
"0-14, low SEP",5.565127,0.518313,3.383215,0.485089
"0-14, high SEP",5.532796,0.512616,3.384115,0.424194
"15-24, low SEP",0.957729,3.536771,4.162311,1.137367
"15-24, high SEP",0.667433,2.742846,3.433096,0.745291
"25-64, low SEP",0.929665,0.628079,3.798782,0.925807
"25-64, high SEP",0.909787,0.689311,3.7132,0.770119
"65+, low SEP",0.280878,0.37593,2.364715,1.582647
"65+, high SEP",0.379113,0.502877,2.370962,1.412901


In [9]:
# Collapse the SEP dimension: one row per (age group, education level).
# Each row is the population-weighted mean of the 'low SEP' and 'high SEP'
# strata that share the same age group and education level.
edu_only_rows = []
for age_group in ['0-14', '15-24', '25-64', '65+']:
    for edu in ['low edu', 'high edu']:
        tags = ['{}, {}, {}'.format(age_group, sep, edu) for sep in ['low SEP', 'high SEP']]
        weights = pd.Series({t: dict_pop_full[t] for t in tags})
        total_weight = weights.sum()

        if total_weight == 0:
            # Empty stratum (in the population data this is '0-14, high edu'):
            # there is nothing to average, so the row is left undefined (NaN),
            # matching the all-NaN source rows of that stratum.
            weighted_mean = pd.Series(np.nan, index=intermediate_matrix_rec.columns)
        else:
            # Scale every row by its own weight (aligned on the stratum
            # label), then add the rows up column by column.
            weighted_sum = intermediate_matrix_rec.loc[tags].mul(weights, axis=0).sum(axis=0)
            weighted_mean = weighted_sum / total_weight

        edu_only_rows.append(weighted_mean.rename('{}, {}'.format(age_group, edu)))

# The rows are produced directly in (age group, low edu / high edu) order, so
# one DataFrame construction replaces the repeated concat and the final reorder.
intermediate_matrix_rec_EDU_only = pd.DataFrame(edu_only_rows)

intermediate_matrix_rec_EDU_only

Unnamed: 0,0-14,15-24,25-64,65+
"0-14, low edu",5.542891,0.514395,3.383834,0.443207
"0-14, high edu",,,,
"15-24, low edu",0.778026,3.00413,3.554601,0.789189
"15-24, high edu",0.584774,2.955949,4.892561,1.758585
"25-64, low edu",0.856804,0.675194,3.676609,0.844908
"25-64, high edu",0.986486,0.66561,3.813839,0.783757
"65+, low edu",0.214811,0.390491,2.208088,1.400622
"65+, high edu",0.702072,0.653888,2.797726,1.644199


# compute analytical constraints (SEP dimension)

In [10]:
age_groups = ['0-14', '15-24', '25-64', '65+']
sep_levels = ['low SEP', 'high SEP']
edu_levels = ['low edu', 'high edu']

# Age x education totals: add up the two SEP strata of every cell and store
# the result in dict_pop_full under the key "age, edu".
for age_group in age_groups:
    for edu in edu_levels:
        edu_key = '{}, {}'.format(age_group, edu)
        dict_pop_full[edu_key] = sum(
            dict_pop_full['{}, {}, {}'.format(age_group, sep_level, edu)]
            for sep_level in sep_levels
        )

# Age x SEP totals: add up the two education strata of every cell and store
# the result in dict_pop_full under the key "age, SEP".
for age_group in age_groups:
    for sep in sep_levels:
        sep_key = '{}, {}'.format(age_group, sep)
        dict_pop_full[sep_key] = sum(
            dict_pop_full['{}, {}, {}'.format(age_group, sep, edu_level)]
            for edu_level in edu_levels
        )

dict_pop_full

{'0-14, high SEP, high edu': 0.0,
 '0-14, low SEP, high edu': 0.0,
 '0-14, high SEP, low edu': 910215.0,
 '0-14, low SEP, low edu': 413202.0,
 '15-24, high SEP, high edu': 51812.0,
 '15-24, low SEP, high edu': 24823.0,
 '15-24, high SEP, low edu': 552631.0,
 '15-24, low SEP, low edu': 264751.0,
 '25-64, high SEP, high edu': 1687317.3094791055,
 '25-64, low SEP, high edu': 536759.268093747,
 '25-64, high SEP, low edu': 1727216.6905208929,
 '25-64, low SEP, low edu': 938743.7319062528,
 '65+, high SEP, high edu': 349133.05628178443,
 '65+, low SEP, high edu': 111064.1150329906,
 '65+, high SEP, low edu': 799172.9437182155,
 '65+, low SEP, low edu': 427282.8849670094,
 '0-14, low edu': 1323417.0,
 '0-14, high edu': 0.0,
 '15-24, low edu': 817382.0,
 '15-24, high edu': 76635.0,
 '25-64, low edu': 2665960.4224271458,
 '25-64, high edu': 2224076.5775728524,
 '65+, low edu': 1226455.828685225,
 '65+, high edu': 460197.171314775,
 '0-14, low SEP': 413202.0,
 '0-14, high SEP': 910215.0,
 '15-24

## q_v1^(i,j)

In [11]:
### analytical condition on q, for every pair of distinct age groups
##
## q_min = 1 - (p4*d)/(p1*a)
## q_max = 1 - (p4*d)/(p1*a) + (p2*b)/(p1*a)
##
## p1, p2: population of the low / high SEP stratum of age_1
## p4:     population of the high SEP stratum of age_2
## a, b:   entries of the low / high SEP rows of age_1, in column age_2
## d:      entry of the high SEP row of age_2, in column age_1

for age_1, age_2 in [('15-24', '25-64'),
                     ('15-24', '65+'),
                     ('25-64', '65+'),
                     ('0-14', '15-24'),
                     ('0-14', '25-64'),
                     ('0-14', '65+')]:

    p1 = dict_pop_full['{}, low SEP'.format(age_1)]
    p2 = dict_pop_full['{}, high SEP'.format(age_1)]
    p4 = dict_pop_full['{}, high SEP'.format(age_2)]

    # Single-step (row, column) lookups instead of chained indexing.
    a = intermediate_matrix_rec_SEP_only.loc['{}, low SEP'.format(age_1), age_2]
    b = intermediate_matrix_rec_SEP_only.loc['{}, high SEP'.format(age_1), age_2]
    d = intermediate_matrix_rec_SEP_only.loc['{}, high SEP'.format(age_2), age_1]

    q_min = 1-(p4*d/(p1*a))
    q_max = q_min+(p2*b/(p1*a))

    print(age_1, age_2)
    print(np.around(q_min,4), np.around(q_max,4))  ## q min and q max 
    print( )

15-24 25-64
-0.9528 0.7689

15-24 65+
-0.7533 0.6145

25-64 65+
-0.9931 0.9319

0-14 15-24
-0.8837 1.2949

0-14 25-64
-1.2222 0.9812

0-14 65+
-1.1719 0.7544



## q_v1^i

In [12]:
## condition on q (analytical derivation), one lower bound per age group:
## q > 1 - (b/a)*(pop_12/pop_11), where
##   a = entry of the 'low SEP' row (11) in the column of its own age group
##   b = entry of the 'high SEP' row (12) in that same column
##   pop_11, pop_12 = populations of the low / high SEP strata of the age group

for age in ['0-14', '15-24', '25-64', '65+']:

    low_key = '{}, low SEP'.format(age)    ## 11
    high_key = '{}, high SEP'.format(age)  ## 12

    # Single-step (row, column) lookups instead of chained indexing.
    entry_low = intermediate_matrix_rec_SEP_only.loc[low_key, age]
    entry_high = intermediate_matrix_rec_SEP_only.loc[high_key, age]

    q1_min = 1-(entry_high/entry_low)*(dict_pop_full[high_key]/dict_pop_full[low_key])
    
    print(age, np.around(q1_min,4))

0-14 -1.19
15-24 -0.6188
25-64 -1.262
65+ -0.9042


# compute analytical constraints (EDU dimension)

## q_d1^(i,j)

In [13]:
### analytical condition on q, for every pair of distinct age groups
##
## q_min = 1 - (p4*d)/(p1*a)
## q_max = 1 - (p4*d)/(p1*a) + (p2*b)/(p1*a)
##
## p1, p2: population of the low / high edu stratum of age_1
## p4:     population of the high edu stratum of age_2
## a, b:   entries of the low / high edu rows of age_1, in column age_2
## d:      entry of the high edu row of age_2, in column age_1
##
## NOTE: for the pairs with age_1 = '0-14' the upper bound prints as nan:
## the '0-14, high edu' stratum has zero population (p2 = 0) and an all-NaN
## matrix row (b = NaN), and 0 * NaN is NaN.

for age_1, age_2 in [('15-24', '25-64'),
                     ('15-24', '65+'),
                     ('25-64', '65+'),
                     ('0-14', '15-24'),
                     ('0-14', '25-64'),
                     ('0-14', '65+')]:

    p1 = dict_pop_full['{}, low edu'.format(age_1)]
    p2 = dict_pop_full['{}, high edu'.format(age_1)]
    p4 = dict_pop_full['{}, high edu'.format(age_2)]

    # Single-step (row, column) lookups instead of chained indexing.
    a = intermediate_matrix_rec_EDU_only.loc['{}, low edu'.format(age_1), age_2]
    b = intermediate_matrix_rec_EDU_only.loc['{}, high edu'.format(age_1), age_2]
    d = intermediate_matrix_rec_EDU_only.loc['{}, high edu'.format(age_2), age_1]

    q_min = 1-(p4*d/(p1*a))
    q_max = q_min+(p2*b/(p1*a))

    print(age_1, age_2)
    print(np.around(q_min,4), np.around(q_max,4))  ## q min and q max 
    print( )

15-24 25-64
0.4905 0.6195

15-24 65+
0.5335 0.7424

25-64 65+
0.4284 1.2023

0-14 15-24
0.9342 nan

0-14 25-64
0.5101 nan

0-14 65+
0.4492 nan



## q_d1^i

In [14]:
## condition on q (analytical derivation), one lower bound per age group:
## q > 1 - (b/a)*(pop_12/pop_11), where
##   a = entry of the 'low edu' row (11) in the column of its own age group
##   b = entry of the 'high edu' row (12) in that same column
##   pop_11, pop_12 = populations of the low / high edu strata of the age group
##
## NOTE: the bound prints as nan for '0-14', whose 'high edu' stratum has zero
## population and an all-NaN matrix row (NaN * 0 is NaN).

for age in ['0-14', '15-24', '25-64', '65+']:

    low_key = '{}, low edu'.format(age)    ## 11
    high_key = '{}, high edu'.format(age)  ## 12

    # Single-step (row, column) lookups instead of chained indexing.
    entry_low = intermediate_matrix_rec_EDU_only.loc[low_key, age]
    entry_high = intermediate_matrix_rec_EDU_only.loc[high_key, age]

    q1_min = 1-(entry_high/entry_low)*(dict_pop_full[high_key]/dict_pop_full[low_key])
    
    print(age, np.around(q1_min,4))

0-14 nan
15-24 0.9077
25-64 0.1346
65+ 0.5595
