In [1]:
# Process Vo data for a final size analysis by age

In [2]:
%matplotlib inline
import numpy as np
import scipy.stats as st
import scipy.optimize as op
import pandas as pd
from numpy import linalg as LA
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('./vo_data.csv')

In [4]:
posi = ((df['first_sampling'].values == 'Positive') | (df['second_sampling'].values == 'Positive'))

In [5]:
# indices taken from the vo_legend file
symptom_indices = range(3,13)
contact_indices = range(13,104)
testday_indices = range(104,123)

In [6]:
hcol = df.household_id.values
hhids = pd.unique(df.household_id)
len(hhids)

1299

In [7]:
hh_tests = []
ages = []
for hid in hhids:
    dfh = df[df.household_id == hid]
    tests = dfh.iloc[:,testday_indices].values
    aa = dfh.iloc[:,2].values
    tests[tests=='Neg'] = 0
    tests[tests=='Pos'] = 1
    hh_tests.append(tests)
    ages.append(aa)

In [8]:
age_gs = pd.unique(df.age_group)
age_gs.sort()
age_gs

array(['00-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70',
       '71-80', '81-90', '91+'], dtype=object)

In [9]:
nsamp = np.zeros(len(age_gs))
npos = np.zeros(len(age_gs))

In [10]:
for i, ag in enumerate(age_gs):
    dfa = df[df.age_group == ag]
    nsamp[i] = len(dfa)
    dfp = df[posi]
    dfa = dfp[dfp.age_group == ag]
    npos[i] = len(dfa)

In [11]:
# Dictionary that puts ages in categories
as2rg = {
    '00-10' : 1,
    '11-20' : 1,
    '21-30' : 0, # 0 is reference class for adults - then kids, old
    '31-40' : 0,
    '41-50' : 0,
    '51-60' : 0,
    '61-70' : 0,
    '71-80' : 2,
    '81-90' : 2,
    '91+'   : 2,
}

In [12]:
nages = max(as2rg.values())

In [13]:
Y = [] # To store outcomes
XX = [] # To store design matrices
for i in range(0,len(hhids)):
    mya = [as2rg[a] for a in ages[i]]
    m = len(mya)
    myx = np.zeros((m,nages))
    myy = np.zeros(m)
    for j, a in enumerate(mya):
        if (a>0):
            myx[j,a-1] = 1
        if (np.any(hh_tests[i][j,:]==1)):
            myy[j] = 1
    Y.append(myy)
    XX.append(myx)

In [14]:
# The above processes the data - now add final size analysis; first do a run through

In [15]:
def phi(s, theta=1.0):
    return ((1.0 + theta*s)**(-1.0/theta))

In [16]:
# Indicative parameters - to be included in a 
llaL = 0.1
llaG = -1.6
theta = 1.2
beta = np.array([1.1, 2.0])
gamma = np.array([1.3, 0.5])

In [17]:
nlv = np.zeros(len(hhids)) # Vector of negative log likelihoods
for i in range(0,len(hhids)):
    y = Y[i]
    X = XX[i]
    if np.all(y==0.0):
        nlv[i] = np.exp(llaG)*np.sum(np.exp(beta@(X.T)))
    else:
        # Sort to go zeros then ones WLOG (could do in pre-processing)
        ii = np.argsort(y)
        y = y[ii]
        X = X[ii,:]
        q = sum(y>0)
        r = 2**q
        m = len(y)
        
        # Quantities that don't vary through the sum
        Bk = np.exp(-np.exp(llaG)*np.exp(beta@(X.T)))
        laM = np.exp(llaL)*np.outer(np.exp(beta@(X.T)),np.exp(gamma@(X.T)))
        
        BB = np.zeros((r,r)) # To be the Ball matrix
        for jd in range(0,r):
            for omd in range(0,jd+1):
                jstr = format(jd,'0' + str(m) + 'b')
                omstr = format(omd,'0' + str(m) + 'b')
                j = np.array([int(jstr[x]) for x in range(0,len(jstr))])
                om = np.array([int(omstr[x]) for x in range(0,len(omstr))])
                BB[jd,omd] = 1.0/np.prod((phi((1-j)@laM,theta)**om)*(Bk**(1-j)))
        nlv[i] = -np.log(LA.solve(BB,np.ones(r))[-1])
nll = np.sum(nlv)
nll

1519.9500271330257

In [18]:
# Now try a loop over parameters

In [19]:
# Indicative parameters - to be included in a
x0 = np.array([
    0.1,
    -1.6,
    1.2,
    1.1, 2.0,
    1.3, 0.5,
])

In [20]:
def mynll(x):
    
    try: # Ideally catch the linear algebra fail directly
        llaL = x[0]
        llaG = x[1]
        theta = x[2]
        beta = x[3:(3+nages)]
        gamma = x[(3+nages):]

        nlv = np.zeros(len(hhids)) # Vector of negative log likelihoods
        for i in range(0,len(hhids)):
            y = Y[i]
            X = XX[i]
            if np.all(y==0.0):
                nlv[i] = np.exp(llaG)*np.sum(np.exp(beta@(X.T)))
            else:
                # Sort to go zeros then ones WLOG (could do in pre-processing)
                ii = np.argsort(y)
                y = y[ii]
                X = X[ii,:]
                q = sum(y>0)
                r = 2**q
                m = len(y)

                # Quantities that don't vary through the sum
                Bk = np.exp(-np.exp(llaG)*np.exp(beta@(X.T)))
                laM = np.exp(llaL)*np.outer(np.exp(beta@(X.T)),np.exp(gamma@(X.T)))

                BB = np.zeros((r,r)) # To be the Ball matrix
                for jd in range(0,r):
                    for omd in range(0,jd+1):
                        jstr = format(jd,'0' + str(m) + 'b')
                        omstr = format(omd,'0' + str(m) + 'b')
                        j = np.array([int(jstr[x]) for x in range(0,len(jstr))])
                        om = np.array([int(omstr[x]) for x in range(0,len(omstr))])
                        BB[jd,omd] = 1.0/np.prod((phi((1-j)@laM,theta)**om)*(Bk**(1-j)))
                nlv[i] = -np.log(LA.solve(BB,np.ones(r))[-1])
        nll = np.sum(nlv)
        return nll
    except:
        nll = np.inf
        return nll

In [21]:
fout = op.minimize(mynll,x0,method='Nelder-Mead')

In [22]:
fout.x

array([-3.26316820e+01, -3.97697859e+00,  1.19934276e+03, -1.10752330e+00,
        7.13956139e-01, -1.52450824e+02, -4.04363896e+02])

In [23]:
print('20 yo and under estimated {:.1f}% as susceptible and {:.3f}% as infectious as 21-70yo'.format(
100*np.exp(fout.x[3]),100*np.exp(fout.x[5])))
print('71 yo and over estimated {:.1f}% as susceptible and {:.3f}% as infectious as 21-70yo'.format(
100*np.exp(fout.x[4]),100*np.exp(fout.x[6])))

20 yo and under estimated 33.0% as susceptible and 0.000% as infectious as 21-70yo
71 yo and over estimated 204.2% as susceptible and 0.000% as infectious as 21-70yo


In [24]:
# TODO:
# - theta ?
# - Plots