# Imports

In [None]:
%load_ext autoreload
%autoreload 2

import os
import glob
import time
from collections import OrderedDict
from itertools import product
import joblib
import calendar
import numpy as np
import pandas as pd
from IPython.display import display

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("seaborn-whitegrid")
colors = [x['color'] for x in plt.style.library['seaborn']['axes.prop_cycle']]

import dsttools
import moments
import datafigs

In [None]:
# create output folders if necessary
# (exist_ok avoids the check-then-create race; 'figs' is written to by the figure cells below)
for _folder in ('moments','moments/samples','figs'):
    os.makedirs(_folder,exist_ok=True)

# Load

In [None]:
# Run configuration for the notebook, followed by loading the raw panel.
sample = 'all' # sample tag used in all input/output file names
LOAD = False # True: skip construction/figures and load data/moments_{sample}.parquet further down
DO_BOOTSTRAP = True # run and collect the bootstrap further down
DO_FIGS = True # produce the overview figures (only used when LOAD is False)
WINDSORIZE = 0.001 # tail quantile used to clip growth rates; values <= 0 skip the clipped columns
%time df = pd.read_parquet(f'data/final_{sample}.parquet')

In [None]:
#df = df[(df.age >= 40) & (df.age <= 55)]

**Settings:**

In [None]:
# a. moments
# `par` is used as a plain namespace: attributes are attached to the class object itself
class par: None
par.ks = [1,2,3,4,5,6] # length of growth rates (multiples of 12)
par.ls = [1,2,3,4,5,6] # difference in covariances (multiples of 12)

par.ks1 = [1] # for monthly
par.ls1 = [1]

par.ks_level = [1,2,3,4,5,6] # for levels

# bounds used when calculating shares
# etas is positive and decreasing; etas_leq mirrors it around zero, giving a
# strictly decreasing grid from 0.50 down to -0.50
etas = np.array([0.50,0.40,0.30,0.20,0.10,0.05,0.04,0.03,0.02,0.01,5*1e-3,1e-3,1e-4])
par.etas_leq = np.concatenate((etas,-np.flip(etas)))
par.etas_leq_midrange = par.etas_leq

par.etas_leq_d1ky = par.etas_leq
par.etas_leq_d1ky_midrange = par.etas_leq

# log-spaced grid from 1e-4 to 150, mirrored around zero, sorted in decreasing
# order and divided by 100 (i.e. from 1.5 down to -1.5)
par.etas_cdf = np.logspace(-4,np.log(150)/np.log(10),50)
par.etas_cdf = np.flip(np.concatenate((-np.flip(par.etas_cdf),par.etas_cdf)))/100
par.eta_cond_midrange = 0.01 # half-width of the band on lagged growth used for the "midrange" selections

par.noseason_months = [2,3,8,9,10,11]
# months whose preceding month is also a no-season month (here: 3, 9, 10, 11)
par.noseason_months_lag = [month for month in par.noseason_months if month-1 in par.noseason_months]
print('no season months with lag: [' + ''.join(f'{m} ' for m in par.noseason_months_lag) + ']')

par.ages = np.arange(df.age.min(),df.age.max()+1,1,dtype=np.int64) # ages for computing moments
# NOTE(review): par.ages[1] below requires at least two distinct ages in the data
print(f'ages = [{par.ages[0]},{par.ages[1]},...,{par.ages[-1]}]')

# b. bootstrap
min_boot = 0 # first bootstrap replication to run
max_boot = 200 # one past the last replication; also the number of index samples drawn

In [None]:
# every threshold grid must be strictly decreasing
for _grid_name in ('etas_leq','etas_leq_midrange','etas_cdf'):
    assert np.all(np.diff(getattr(par,_grid_name)) < 0)

# Full-size dataset

In [None]:
def create_full_size():
    """Build a panel with one row per (pnr, year, month).

    Side effects: sets par.N (number of individuals) and par.T (largest number
    of rows for any individual) and writes par.N to moments/N_{sample}.txt.

    Returns the loaded df unchanged when LOAD is True. Otherwise returns a new
    frame indexed by (pnr, year, month) with columns t, wage_narrow,
    unemployed, date, birthyear, age and out_of_sample, where missing
    unemployment is treated as False and unemployed rows get zero wage.
    """

    # a. individuals
    ids = df.index.levels[0]
    par.N = ids.size
    np.savetxt(f'moments/N_{sample}.txt',np.array([par.N]),delimiter=',',fmt='%d')

    # b. number of periods
    par.T = df.groupby('pnr').birthyear.count().max()

    # nothing to build when the finished data is loaded from disk later
    if LOAD:
        return df

    # c. calendar covered by the data
    year_index = df.index.get_level_values('year')
    first_year = year_index.min()
    last_year = year_index.max()
    years = np.arange(first_year,last_year+1,1)

    # d. full grid: every individual crossed with every (year, month)
    years_one_person = np.repeat(years,12)
    months_one_person = np.tile(np.arange(1,12+1),years.size)
    grid = {
        'pnr':np.repeat(ids,par.T),
        'year':np.tile(years_one_person,par.N),
        'month':np.tile(months_one_person,par.N),
    }
    full = pd.DataFrame(grid)
    full['t'] = 12*(full.year-first_year) + full.month # running month counter, 1 in the first month
    full = full.set_index(['pnr','year','month'])

    # e. attach the observed data; rows without a match count as not unemployed
    observed = df[['wage_narrow','unemployed','date']]
    full = full.join(observed,how='left')
    no_status = full.unemployed.isna()
    full.loc[no_status,'unemployed'] = False

    # f. birthyear, age and the out-of-sample flag
    cohort = df.groupby('pnr').birthyear.first().astype('int64')
    cohort.name = 'birthyear'
    full = full.join(cohort,how='left',on='pnr')
    full['age'] = full.index.get_level_values('year') - full.birthyear
    too_young = full.age < par.ages[0]
    too_old = full.age > par.ages[-1]
    full['out_of_sample'] = too_young | too_old

    # g. unemployed have zero income
    full.loc[full.unemployed,'wage_narrow'] = 0

    return full
    
# build the full-size panel (or keep the loaded df when LOAD is True) and report the time used
# note: df is rebound to the returned frame
t0 = time.time()
df = create_full_size()    
t1 = time.time()
print(f'full-size data set created in {t1-t0:.1f} secs')

# Growth rates

In [None]:
def take_logs():
    """Add log income and its 12k-month leads to the global df (in place).

    Creates y = log(wage_narrow) where wage_narrow > 0 (NaN otherwise) and,
    for each k in par.ks_level, y_lead{12k} = y shifted 12k rows ahead within
    each pnr. Prints the time used.
    """

    tic = time.time()

    # log of strictly positive income only; everything else becomes NaN
    # note: unemployed have forced zero income
    wage = df.wage_narrow
    df['y'] = np.log(wage.where(wage > 0))

    # leads within individual
    for lead in (12*k for k in par.ks_level):
        df[f'y_lead{lead}'] = df.groupby('pnr').y.shift(-lead)

    toc = time.time()
    print(f'logs taken in {toc-tic:.1f} secs')
    
def _add_growth_rate(months):
    """Add the `months`-row difference of y within pnr to the global df as d{months}y.

    When WINDSORIZE > 0 a clipped copy d{months}yw is added as well, with the
    bounds taken at the WINDSORIZE and 1-WINDSORIZE quantiles of the raw column.
    """

    name = f'd{months}y'
    df[name] = df.groupby('pnr').y.diff(months)

    if WINDSORIZE > 0:
        lower,upper = df[name].quantile([WINDSORIZE,1-WINDSORIZE]).values
        df[f'{name}w'] = df[name].clip(lower,upper)

def calc_growth_rate():
    """Add growth rates of y and their lags to the global df (in place).

    Creates d{12k}y for k in par.ks and d{k}y for k in par.ks1 (plus the
    clipped ...yw versions when WINDSORIZE > 0), and the lagged columns
    d12y_lag (d12y shifted 12 rows within pnr) and d1y_lag (d1y shifted one
    row within pnr). Requires 1 to be in both par.ks and par.ks1 so that d12y
    and d1y exist. Prints the time used for each step.
    """

    # a. main
    for k in par.ks:

        t0 = time.time()
        _add_growth_rate(12*k)
        t1 = time.time()
        print(f'12k-month growth rate calculated, k = {k}, in {t1-t0:.1f} secs')

    # b. monthly
    for k in par.ks1:

        t0 = time.time()
        _add_growth_rate(k)
        t1 = time.time()
        print(f'monthly growth rate calculated, k = {k}, in {t1-t0:.1f} secs')

    # c. additional
    t0 = time.time()

    by_pnr = df.groupby('pnr')
    df['d12y_lag'] = by_pnr['d12y'].shift(12)
    # fix: the lag of the monthly growth rate must come from d1y
    # (it was previously built from d12y, so the d1y "midrange" selection
    # conditioned on last month's 12-month growth instead)
    df['d1y_lag'] = by_pnr['d1y'].shift(1)

    t1 = time.time()

    print(f'lagged growth rates calculated, in {t1-t0:.1f} secs')
        
# only construct the variables when the finished data is not loaded from disk
if not LOAD:     
    
    take_logs()
    calc_growth_rate()

## Make selections

In [None]:
def selection():
    """Add the boolean selection columns used for the moments to the global df.

    everyone:      y is observed
    noseason:      everyone, and the month is in par.noseason_months
    midrange_d12y: everyone, and d12y_lag is within +/- par.eta_cond_midrange
    midrange_d1y:  everyone, the month is in par.noseason_months_lag, and
                   d1y_lag is within +/- par.eta_cond_midrange

    Prints the time used.
    """

    tic = time.time()

    months = df.index.get_level_values('month')
    band = par.eta_cond_midrange

    # a. everyone
    df['everyone'] = df.y.notna()

    # b. no-season
    in_noseason = months.isin(par.noseason_months)
    df['noseason'] = df.everyone & in_noseason

    # c. selected on lagged growth close to zero
    lag12_small = df.d12y_lag.between(-band,band)
    df['midrange_d12y'] = df.everyone & lag12_small

    in_noseason_lag = months.isin(par.noseason_months_lag)
    lag1_small = df.d1y_lag.between(-band,band)
    df['midrange_d1y'] = (df.everyone & in_noseason_lag) & lag1_small

    toc = time.time()

    print(f'selection made in {toc-tic:.1f} secs')
    
# make the selections and report the share of all rows in each of them
if not LOAD:
    
    selection()
    print(f'everyone, {df.everyone.mean()*100:.1f} percent')
    print(f'midrange_d12y, {df.midrange_d12y.mean()*100:.1f} percent')
    print(f'noseason, {df.noseason.mean()*100:.1f} percent')
    print(f'midrange_d1y, {df.midrange_d1y.mean()*100:.1f} percent')
    

# Overview

## Level

**Lifecycle**

In [None]:
if not LOAD and DO_FIGS:

    # a. average wage by (birthyear,age) among rows not flagged as unemployed
    employed = df[df['unemployed']==False]
    ys = employed.groupby(['birthyear','age']).wage_narrow.mean()

    # b. one age profile per cohort
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    for birthyear in df.birthyear.unique():
        if not np.any(df.birthyear == birthyear): continue
        profile = ys.xs(birthyear,level='birthyear')
        ax.plot(profile.index.get_level_values('age'),profile)

    # c. labels and save
    ax.set_xlabel('age')
    ax.set_ylabel('average, 1,000 DKK')
    fig.tight_layout()
    fig.savefig(f'figs/lifecyle_DKK.pdf')
    

In [None]:
if not LOAD and DO_FIGS:

    # a. average log income by (birthyear,age) among rows not flagged as unemployed
    employed = df[df['unemployed']==False]
    ys = employed.groupby(['birthyear','age']).y.mean()

    # b. one age profile per cohort
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    for birthyear in df.birthyear.unique():
        if not np.any(df.birthyear == birthyear): continue
        profile = ys.xs(birthyear,level='birthyear')
        ax.plot(profile.index.get_level_values('age'),profile)

    # c. labels and save
    ax.set_xlabel('age')
    ax.set_ylabel('average log(income), ')
    fig.tight_layout()
    fig.savefig(f'figs/lifecyle_y.pdf')
    

In [None]:
if not LOAD and DO_FIGS:

    # a. variance of log income by (birthyear,age) among rows not flagged as unemployed
    employed = df[ (df['unemployed'] == False)]
    ys = employed.groupby(['birthyear','age']).y.var()

    # b. one age profile per cohort
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    for birthyear in df.birthyear.unique():
        if not np.any(df.birthyear == birthyear): continue
        profile = ys.xs(birthyear,level='birthyear')
        ax.plot(profile.index.get_level_values('age'),profile)

    # c. labels and save
    ax.set_xlabel('age')
    ax.set_ylabel('variance log(income), ')
    fig.tight_layout()
    fig.savefig(f'figs/lifecyle_y_var.pdf')
    

**Distribution, DKK**

In [None]:
if not LOAD and DO_FIGS:

    # a. empirical cdf of all non-missing wages
    wages = df.wage_narrow
    wages = wages[~np.isnan(wages)].sort_values()
    shares = wages.rank(method='average',pct=True)

    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.plot(wages,shares,lw=1)

    # b. details
    ax.set_xscale('log')
    ax.set_xlim([10**0,10**3])
    ax.set_ylim([0,1])
    ax.set_xlabel('1,000 DKK')
    ax.set_ylabel('cdf')

    # c. save
    fig.tight_layout()
    fig.savefig(f'figs/wage_narrow_cdf.pdf')

**Time, log**

In [None]:
if not LOAD and DO_FIGS:

    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)

    # a. average log income by date
    by_date = df.groupby('date').y.mean()
    dates = by_date.index.get_level_values('date')
    ax.plot(dates,by_date)

    # b. within-year average of the by-date averages, drawn as one segment per year
    by_year = by_date.groupby(by_date.index.year).transform('mean')
    for year in by_year.index.year.unique():
        this_year = by_year.index.year == year
        ax.plot(dates[this_year],by_year[this_year],color=colors[1])

    # c. save
    ax.set_ylabel('average of $y_t$')
    fig.tight_layout()
    fig.savefig('figs/y_time.pdf')

## 12-month growth

**Distribution:**

In [None]:
if not LOAD and DO_FIGS:
    
    # cdf of the 12k-month growth rate, one curve per k in par.ks
    # (raw strings keep the LaTeX backslashes literal; '\D' and '\c' are invalid escapes otherwise)
    varlist = [df[f'd{k*12}y'] for k in par.ks]
    varnamelist = [rf'$\Delta_{{{k*12}}}y_t$' for k in par.ks]

    datafigs.cdf(varlist,varnamelist,name='d12ky_cdf',xlabel=r'$100 \cdot \Delta_k y_t$')

**Distribution by month:**

In [None]:
if not LOAD and DO_FIGS:
    
    # cdf of the 12-month growth rate, one curve per calendar month
    months = df.index.unique('month')
    varlist = [df.d12y.xs(month,level='month') for month in months]
    varnamelist = [calendar.month_abbr[month] for month in months]

    # raw string: '\c' and '\D' are not valid escape sequences
    datafigs.cdf(varlist,varnamelist,name = 'd12y_cdf_by_month',xlabel=r'$100 \cdot \Delta_{12} y_t$')

**Distribution by age-groups**

In [None]:
if not LOAD and DO_FIGS:
    
    # cdf of the 12-month growth rate by age group
    # note: between() is inclusive, so ages 40 and 50 enter two groups each
    age_groups = [(30,40),(40,50),(50,60)]
    varlist = [df.loc[df.age.between(age_group[0],age_group[1]),'d12y'] for age_group in age_groups]
    varnamelist = [f'age {age_group[0]}-{age_group[1]}' for age_group in age_groups]

    # raw string: '\c' and '\D' are not valid escape sequences
    datafigs.cdf(varlist,varnamelist,name='d12y_cdf_by_age',xlabel=r'$100 \cdot \Delta_{12} y_t$')

**Distribution conditional on lagged growth**

In [None]:
if not LOAD and DO_FIGS:
    
    # cdf of the 12-month growth rate: all rows vs. rows selected by midrange_d12y
    # (raw f-strings: '\D', '\i' and '\c' are not valid escape sequences)
    varlist = [df.d12y,df.loc[df.midrange_d12y,'d12y']]
    varnamelist = ['All',
                   rf'$\Delta_{{12}} y_{{t-12}} \in [{-par.eta_cond_midrange},{par.eta_cond_midrange}]$']

    datafigs.cdf(varlist,varnamelist,name='d12y_cdf_by_midrange',xlabel=r'$100 \cdot \Delta_{12} y_t$')

**Distribution over life-cycle:**

In [None]:
if not LOAD and DO_FIGS:
    
    # 12-month growth rate over the life-cycle
    k = 1
    ylim = [-0.5,0.5]
    # raw f-string: '\D' is not a valid escape sequence
    datafigs.lifecycle_dist(df,f'd{12*k}y',name=f'lifecycle_d{12*k}y',ylabel=rf'$\Delta_{{{12*k}}} y_t$',ylim=ylim)

In [None]:
if not LOAD and DO_FIGS:
    
    # 60-month growth rate over the life-cycle
    k = 5
    ylim = [-1,1]
    # raw f-string: '\D' is not a valid escape sequence
    datafigs.lifecycle_dist(df,f'd{12*k}y',name=f'lifecycle_d{12*k}y',ylabel=rf'$\Delta_{{{12*k}}} y_t$',ylim=ylim)

**Time profile:**

In [None]:
if not LOAD and DO_FIGS:
    # raw string: '\D' is not a valid escape sequence (label text is unchanged,
    # including the doubled braces, which are literal in a non-f-string)
    datafigs.quantiles_time(df,'d12y','date',name = 'd12y_time',ylabel=r'$\Delta_{{12}} y_t$')

**Month profile**

In [None]:
if not LOAD and DO_FIGS:
    # raw string: '\D' is not a valid escape sequence (label text is unchanged,
    # including the doubled braces, which are literal in a non-f-string)
    datafigs.quantiles_time(df,'d12y','month',name='d12y_month',ylabel=r'$\Delta_{{12}} y_t$')

## 1-month growth rate

**Distribution by month:**

In [None]:
if not LOAD and DO_FIGS:
    
    # raw string: '\c' and '\D' are not valid escape sequences
    xlabel = r'$100 \cdot \Delta y_t$'
    months = df.index.unique('month')

    # all months
    varlist = [df.d1y.xs(month,level='month') for month in months]
    varnamelist = [calendar.month_abbr[month] for month in months]

    datafigs.cdf(varlist,varnamelist,name='d1y_cdf',xlabel=xlabel)
    
    # selected
    noseason = [month for month in months if month in par.noseason_months]
    varlist = [df.d1y.xs(month,level='month') for month in noseason]
    varnamelist = [calendar.month_abbr[month] for month in noseason]

    datafigs.cdf(varlist,varnamelist,name='d1y_cdf_noseason',xlabel=xlabel)

**Distribution by age-groups**

In [None]:
if not LOAD and DO_FIGS:
    
    # cdf of the monthly growth rate by age group, no-season rows only
    # note: between() is inclusive, so ages 40 and 50 enter two groups each
    age_groups = [(30,40),(40,50),(50,60)]
    varlist = [df.loc[df.age.between(age_group[0],age_group[1]) & df.noseason,'d1y'] for age_group in age_groups]
    varnamelist = [f'age {age_group[0]}-{age_group[1]}' for age_group in age_groups]

    # raw string: '\c' and '\D' are not valid escape sequences
    datafigs.cdf(varlist,varnamelist,name='d1y_cdf_by_age',xlabel=r'$100 \cdot \Delta y_t$')

**Distribution conditional on lagged growth**

In [None]:
if not LOAD and DO_FIGS:
    
    # cdf of the monthly growth rate: all rows vs. rows selected by midrange_d1y
    # (midrange_d1y is restricted to par.noseason_months_lag by construction)
    # raw f-strings: '\D', '\i' and '\c' are not valid escape sequences
    varlist = [df.d1y,df.loc[df.midrange_d1y,'d1y']] # automatically ensures noseason
    varnamelist = ['All',
                   rf'$\Delta y_{{t-1}} \in [{-par.eta_cond_midrange},{par.eta_cond_midrange}]$']

    datafigs.cdf(varlist,varnamelist,name='d1y_cdf_by_midrange',xlabel=r'$100 \cdot \Delta y_t$')

**Time profile:**

In [None]:
if not LOAD and DO_FIGS:
    # raw string: '\D' is not a valid escape sequence
    datafigs.quantiles_time(df,'d1y','date',name='d1y_time',ylabel=r'$\Delta y_t$')

**Month profile**

In [None]:
if not LOAD and DO_FIGS:
    # raw string: '\D' is not a valid escape sequence
    datafigs.quantiles_time(df,'d1y','month',name='d1y_month',ylabel=r'$\Delta y_t$')

**Absolute less than**

In [None]:
# figure from datafigs.abs_leq_month for the months and thresholds below
if not LOAD and DO_FIGS:
    # NOTE(review): same list as par.noseason_months, but hard-coded here - confirm they should stay in sync
    use_months = [2,3,8,9,10,11]
    # NOTE(review): presumably thresholds in percent - confirm against datafigs.abs_leq_month
    season_etas = [1,3,5] 
    datafigs.abs_leq_month(df,season_etas,use_months,par.ages)

# Moment functions

## Mean, variance, skewness and kurtosis

In [None]:
def _mean_var_skew_kurt(x,age,cond,par):
    """Wrap moments.mean_var_skew_kurt_ages for one group of flat arrays.

    x, age and cond are flat arrays of equal length; each is reshaped to
    (N,T) with T = par.Tnow and N = x.size//T before being passed on together
    with par.ages and par.periods. The result is returned as a pd.Series
    (the caller reads it as four consecutive chunks of length par.ages.size).
    """

    n_periods = par.Tnow
    shape = (x.size//n_periods,n_periods)

    result = moments.mean_var_skew_kurt_ages(x.reshape(shape),
                                             age.reshape(shape),
                                             cond.reshape(shape),
                                             par.ages,par.periods)

    return pd.Series(result)

def mean_var_skew_kurt(moms,df,cond,par,postfix=''):
    """Add mean, variance, skewness and kurtosis of growth rates to moms.

    For each k in par.ks the column d{k*par.step}{par.yname} is processed by
    birthyear with _mean_var_skew_kurt. Each of the four moments is averaged
    first over birthyears (per age) and then over ages, ignoring NaN.

    Args:
        moms: dict-like, updated in place with keys
            ('{mom}_d{step}k{yname}{postfix}', k).
        df: data with columns birthyear, age, `cond` and the growth rates.
        cond: name of the boolean column selecting the observations to use.
        par: settings (step, ks, yname, ages and what _mean_var_skew_kurt reads).
        postfix: appended to the moment names.
    """

    step = par.step
    n_ages = par.ages.size

    for k in par.ks:

        # i. calculate
        # (columns are selected with a list: tuple selection on a GroupBy is
        # not supported by pandas >= 2.0)
        col = f'd{k*step}{par.yname}'
        ysbase = df.groupby('birthyear')[['age',cond,col]]
        ys = ysbase.apply(lambda x: _mean_var_skew_kurt(x[col].values,x['age'].values,x[cond].values,par))

        # ii. save (the output holds one chunk of n_ages columns per moment)
        for i,mom in enumerate(['mean','var','skew','kurt']):

            values = ys.iloc[:,i*n_ages:(i+1)*n_ages].mean().values
            moms[(f'{mom}_d{step}k{par.yname}{postfix}',k)] = np.nanmean(values)
            

## Autocovariances

In [None]:
def _auto_cov(a,b,offset,age,cond,par):
    """Wrap moments.cov_ages for one group of flat arrays.

    a, b, age and cond are flat arrays of equal length; each is reshaped to
    (N,T) with T = par.Tnow and N = a.size//T before being passed on together
    with offset, par.ages and par.periods. The result is returned as a
    pd.Series.
    """

    n_periods = par.Tnow
    shape = (a.size//n_periods,n_periods)

    result = moments.cov_ages(a.reshape(shape),
                              b.reshape(shape),
                              offset,
                              age.reshape(shape),
                              cond.reshape(shape),
                              par.ages,par.periods)

    return pd.Series(result)

def auto_cov(moms,df,cond,par,postfix=''):
    """Add auto-covariances of the par.step-period growth rate to moms.

    For each l in par.ls, _auto_cov is applied by birthyear to the column
    d{par.step}{par.yname} against itself with offset l*par.step. The result
    is averaged over birthyears and then over the remaining entries, ignoring
    NaN.

    Args:
        moms: dict-like, updated in place with keys
            ('auto_cov_d{step}{yname}{step}l{postfix}', l).
        df: data with columns birthyear, age, `cond` and the growth rate.
        cond: name of the boolean column selecting the observations to use.
        par: settings (step, ls, yname and what _auto_cov reads).
        postfix: appended to the moment names.
    """

    step = par.step
    col = f'd{step}{par.yname}'

    # the grouping does not depend on l
    # (columns are selected with a list: tuple selection on a GroupBy is not
    # supported by pandas >= 2.0)
    ybase = df.groupby('birthyear')[['age',cond,col]]

    for l in par.ls:

        # i. calculate
        ys = ybase.apply(lambda x: _auto_cov(x[col].values,x[col].values,l*step,x['age'].values,x[cond].values,par))

        # ii. save
        values = ys.mean().values
        moms[(f'auto_cov_d{step}{par.yname}{step}l{postfix}',l)] = np.nanmean(values)
            
def frac_auto_cov(moms,df,cond,par,postfix=''):
    """Add auto-covariances at offsets shorter than the growth-rate length to moms.

    Same as auto_cov, but with offsets l = 1,...,12*(par.step//12)-1 measured
    in single periods rather than in multiples of par.step.

    Args:
        moms: dict-like, updated in place with keys
            ('frac_auto_cov_d{step}{yname}1l{postfix}', l).
        df: data with columns birthyear, age, `cond` and the growth rate.
        cond: name of the boolean column selecting the observations to use.
        par: settings (step, yname and what _auto_cov reads).
        postfix: appended to the moment names.
    """

    step = par.step
    k = step//12
    col = f'd{step}{par.yname}'

    # the grouping does not depend on l
    # (columns are selected with a list: tuple selection on a GroupBy is not
    # supported by pandas >= 2.0)
    ybase = df.groupby('birthyear')[['age',cond,col]]

    for l in range(1,12*k):

        # i. calculate
        ys = ybase.apply(lambda x: _auto_cov(x[col].values,x[col].values,l,x['age'].values,x[cond].values,par))

        # ii. save
        values = ys.mean().values
        moms[(f'frac_auto_cov_d{step}{par.yname}1l{postfix}',l)] = np.nanmean(values)

## Shares

In [None]:
def _share_in_range(x,etas_low,etas_high,age,cond,par):
    """Wrap moments.share_in_range for one group of flat arrays.

    x, age and cond are flat arrays of equal length; each is reshaped to
    (N,T) with T = par.Tnow and N = x.size//T before being passed on together
    with the bounds etas_low and etas_high, par.ages and par.periods. The
    result is returned as a pd.Series (the callers read it as one chunk of
    length par.ages.size per bound).
    """

    n_periods = par.Tnow
    shape = (x.size//n_periods,n_periods)

    result = moments.share_in_range(x.reshape(shape),
                                    etas_low,etas_high,
                                    age.reshape(shape),
                                    cond.reshape(shape),
                                    par.ages,par.periods)

    return pd.Series(result)

def share_leq(moms,df,cond,par,postfix=''):
    """Add shares of growth rates in (-inf, eta] for eta in par.etas_leq to moms.

    For each k in par.ks the column d{par.step*k}{par.yname} is processed by
    birthyear with _share_in_range, using -inf as lower bound and
    par.etas_leq as upper bounds. For each eta the result is averaged over
    birthyears (per age) and then over ages, ignoring NaN.

    Args:
        moms: dict-like, updated in place with keys
            ('leq_d{step}k{yname}{postfix}', (k, eta)).
        df: data with columns birthyear, age, `cond` and the growth rates.
        cond: name of the boolean column selecting the observations to use.
        par: settings (step, ks, yname, etas_leq, ages and what
            _share_in_range reads).
        postfix: appended to the moment names.
    """

    step = par.step
    n_ages = par.ages.size

    # bounds do not depend on k
    eta_high = np.array(par.etas_leq)
    eta_low = -np.inf*np.ones(eta_high.size)

    for k in par.ks:

        # i. calculate
        # (columns are selected with a list: tuple selection on a GroupBy is
        # not supported by pandas >= 2.0)
        col = f'd{step*k}{par.yname}'
        ybase = df.groupby('birthyear')[['age',cond,col]]
        ys = ybase.apply(lambda x: _share_in_range(x[col].values,eta_low,eta_high,x['age'].values,x[cond].values,par))

        # ii. save (the output holds one chunk of n_ages columns per eta)
        for i,eta in enumerate(par.etas_leq):

            values = ys.iloc[:,i*n_ages:(i+1)*n_ages].mean().values
            moms[(f'leq_d{step}k{par.yname}{postfix}',(k,eta))] = np.nanmean(values)
            
def share_cdf(moms,df,cond,par,postfix=''):
    """Add shares of growth rates in (-inf, eta] for eta in par.etas_cdf to moms.

    Same computation as share_leq but on the grid par.etas_cdf, and with the
    position i of eta in the grid (not its value) in the key.

    Args:
        moms: dict-like, updated in place with keys
            ('cdf_d{step}k{yname}{postfix}', (k, i)).
        df: data with columns birthyear, age, `cond` and the growth rates.
        cond: name of the boolean column selecting the observations to use.
        par: settings (step, ks, yname, etas_cdf, ages and what
            _share_in_range reads).
        postfix: appended to the moment names.
    """

    step = par.step
    n_ages = par.ages.size

    # bounds do not depend on k
    eta_high = np.array(par.etas_cdf)
    eta_low = -np.inf*np.ones(eta_high.size)

    for k in par.ks:

        # i. calculate
        # (columns are selected with a list: tuple selection on a GroupBy is
        # not supported by pandas >= 2.0)
        col = f'd{step*k}{par.yname}'
        ybase = df.groupby('birthyear')[['age',cond,col]]
        ys = ybase.apply(lambda x: _share_in_range(x[col].values,eta_low,eta_high,x['age'].values,x[cond].values,par))

        # ii. save (the output holds one chunk of n_ages columns per eta)
        for i in range(eta_high.size):

            values = ys.iloc[:,i*n_ages:(i+1)*n_ages].mean().values
            moms[(f'cdf_d{step}k{par.yname}{postfix}',(k,i))] = np.nanmean(values)
            

## Calculate all

In [None]:
def _moms_func(moms,df,par,do_cdfs=False):
    """Fill moms with all moments, in a fixed order.

    a. 12-month growth rates: mean/var/skew/kurt, auto-covariances and shares
       for 'everyone', shares for 'midrange_d12y', and the first three again
       for the winsorized columns (yname 'yw').
    b. 1-month growth rates: the same on 'noseason' / 'midrange_d1y', with
       par.ks1 and par.ls1 temporarily taking the place of par.ks and par.ls
       (and without the fractional auto-covariances).
    c. levels: differences in cross-sectional variances and covariances of y,
       computed within (birthyear, t) cells and averaged over cells.

    Args:
        moms: dict-like, updated in place; the insertion order is what the
            bootstrap files rely on, so it must not change.
        df: data with the columns created by take_logs, calc_growth_rate and
            selection, plus birthyear, age and t.
        par: settings; step, periods, yname and Tnow are overwritten here.
        do_cdfs: also compute the share_cdf moments.
    """

    ###############
    # a. 12-month #
    ###############

    par.step = 12
    par.periods = 12
    par.yname = 'y'
    par.Tnow = par.T

    # moments
    mean_var_skew_kurt(moms,df,'everyone',par)
    auto_cov(moms,df,'everyone',par)
    frac_auto_cov(moms,df,'everyone',par)

    # leq
    share_leq(moms,df,'everyone',par)
    if do_cdfs: share_cdf(moms,df,'everyone',par)

    # from midrange
    postfix = '_midrange'
    share_leq(moms,df,'midrange_d12y',par,postfix=postfix)
    if do_cdfs: share_cdf(moms,df,'midrange_d12y',par,postfix=postfix)

    # windsorized
    par.yname = 'yw'
    mean_var_skew_kurt(moms,df,'everyone',par)
    auto_cov(moms,df,'everyone',par)
    frac_auto_cov(moms,df,'everyone',par)

    ##############
    # b. 1-month #
    ##############

    par.step = 1
    par.periods = 12
    par.yname = 'y'
    par.Tnow = par.T
    _ks = par.ks
    _ls = par.ls
    par.ks = par.ks1
    par.ls = par.ls1

    try:

        # moments
        mean_var_skew_kurt(moms,df,'noseason',par)
        auto_cov(moms,df,'noseason',par)

        # leq
        share_leq(moms,df,'noseason',par,postfix='')
        if do_cdfs: share_cdf(moms,df,'noseason',par,postfix='')

        # from midrange
        postfix = '_midrange'
        share_leq(moms,df,'midrange_d1y',par,postfix=postfix)
        if do_cdfs: share_cdf(moms,df,'midrange_d1y',par,postfix=postfix)

        # windsorized
        par.yname = 'yw'
        mean_var_skew_kurt(moms,df,'noseason',par)
        auto_cov(moms,df,'noseason',par)

    finally:

        # par is shared with the rest of the notebook: always put the
        # 12-month settings back, also if a moment function fails
        par.ks = _ks
        par.ls = _ls

    ############
    # c. level #
    ############

    # variances
    for k in par.ks_level:

        lead = f'y_lead{12*k}'

        # a. condition on no-season and on y_t and y_t+k non-missing
        # (the mask is built on df itself, so no index alignment is needed)
        I = df.noseason & df.y.notna() & df[lead].notna()
        ysbase = df.loc[I,['birthyear','t','y',lead]]

        # b. variance within each cohort and t
        # (list selection: tuple selection on a GroupBy is not supported by pandas >= 2.0)
        ys = ysbase.groupby(['birthyear','t'])[['y',lead]].var()

        moms[('var_y_d12_diff',k)] = np.nanmean(ys[lead] - ys['y'])

    # covariances
    def cov(a,b):
        """Covariance of a and b over entries where both are non-missing (NaN if none)."""

        I = (~np.isnan(a)) & (~np.isnan(b))
        if I.any():
            covmat = np.cov(a[I],b[I])
            return covmat[0,1]
        else:
            return np.nan

    k = 12
    for l in par.ks_level[:-1]:

        near = f'y_lead{k}'
        far = f'y_lead{k+12*l}'

        # a. condition on no-season and on y_t, y_t+k, and y_t+k+l non-missing
        I = df.noseason & df.y.notna() & df[near].notna() & df[far].notna()
        ysbase = df.loc[I,['birthyear','t','y',near,far]]

        # b. covariance between y_t and y_t+k(+l) within each cohort and t
        ys_ = ysbase.groupby(['birthyear','t'])[['y',near,far]]
        cov_near = ys_.apply(lambda x: cov(x['y'].values,x[near].values))
        cov_far = ys_.apply(lambda x: cov(x['y'].values,x[far].values))

        moms[('cov_y_y_d12_diff',l)] = np.nanmean(cov_far - cov_near)

In [None]:
def moms_func(df,par,do_cdfs=False):
    """Calculate all moments and return them as a dict and as a table.

    Args:
        df: data passed on to _moms_func.
        par: settings passed on to _moms_func.
        do_cdfs: passed on to _moms_func.

    Returns:
        moms: OrderedDict with keys (name, args) in the order they were computed.
        moms_df: DataFrame with one row per moment, in the same order, and
            columns momname, args, arg1, arg2, value. For a tuple args,
            arg1 and arg2 are its two elements; otherwise arg1 = args and
            arg2 = ''. The index is 0,1,2,... (the row-by-row append used
            previously left every row with index 0).
    """

    moms = OrderedDict()
    _moms_func(moms,df,par,do_cdfs=do_cdfs)

    # output moments as table, built in one go
    # (DataFrame.append is quadratic in the number of rows and no longer
    # exists in pandas >= 2.0)
    cols = ['momname','args','arg1','arg2','value']
    rows = []
    for (name,args),value in moms.items():

        if isinstance(args,tuple):
            arg1,arg2 = args
        else:
            arg1,arg2 = args,''

        rows.append([name,args,arg1,arg2,value])

    moms_df = pd.DataFrame(rows,columns=cols)

    return moms,moms_df

# Calculate moments

## Load/save

In [None]:
# store the constructed data (it is re-read by each bootstrap task),
# or load a previously stored version when LOAD is True
if not LOAD:
    %time df.to_parquet(f'data/moments_{sample}.parquet')
else:
    %time df = pd.read_parquet(f'data/moments_{sample}.parquet')

## Calculate

In [None]:
# a. calculate moments
%time moms,moms_df = moms_func(df,par,do_cdfs=True)

# b. save
# NOTE(review): writing the legacy .xls format needs an engine that recent pandas
# versions no longer ship - confirm the pandas version before relying on this
moms_df.loc[:,['momname','args','value']].to_excel(f'moments/moments_{sample}.xls',index=False)

# Bootstrap

**Run bootstraps:**

In [None]:
# a. task
def bootstrap(b,sample_pnr,sample,par):

    # i. load data
    df = pd.read_parquet(f'data/moments_{sample}.parquet')
    
    # ii. sample
    _ids = list(product(sample_pnr,df.index.levels[1],df.index.levels[2]))
    _df = df.loc[_ids,:]
    
    # iii. make index unique
    _df = _df.reset_index()
    _df['pnr'] = np.repeat(np.arange(sample_pnr.size),df.index.levels[1].size*df.index.levels[2].size)
    _df = _df.set_index('pnr','year','month')

    # iv. calculate moments
    _moms,moms_df = moms_func(_df,par,do_cdfs=False)
    np.savetxt(f'moments/samples/{b}_{sample}.txt',moms_df.value.values,delimiter=',') 
    

In [None]:
if DO_BOOTSTRAP:
        
    # a. sample pnrs
    # fixed seed, and all max_boot samples are drawn up front: row b is then the same
    # no matter which range of bootstraps [min_boot,max_boot) is actually run
    np.random.seed(2019)
    pnrs = df['age'].groupby(level='pnr').sum().index.values # hack to gets pnr's
    sample_pnrs = np.random.choice(pnrs,size=(max_boot,pnrs.size),replace=True)

    # b. run bootstraps
    # each task re-reads the data from disk and writes its own result file
    # NOTE(review): n_jobs=60 is hard-coded - confirm it fits the machine
    tasks = (joblib.delayed(bootstrap)(b,sample_pnrs[b,:],sample,par) for b in range(min_boot,max_boot))
    %time joblib.Parallel(n_jobs=60)(tasks)
        

**Save bootstraps:**

In [None]:
if DO_BOOTSTRAP:

    # forward slashes are used in all paths, as in the rest of the notebook:
    # they work on all platforms, whereas the backslash patterns used before
    # only matched on Windows (and '\m' is an invalid escape sequence)

    # a. number of bootstraps and moments
    num_boot_tot = len(glob.glob(f'moments/samples/*_{sample}.txt'))
    Nmoms = np.loadtxt(f'moments/samples/0_{sample}.txt').size

    # b. save vectors of moments in chunks (one row per bootstrap)
    chunksize = 50
    for b in range(0,num_boot_tot,chunksize):

        last = min(b+chunksize,num_boot_tot)

        mom_mat = np.empty((last-b,Nmoms))
        for i in range(last-b):
            mom_mat[i,:] = np.loadtxt(f'moments/samples/{b+i}_{sample}.txt',delimiter=',')

        np.savetxt(f'moments/moments_bootstrap_{b}_{last-1}_{sample}.txt',mom_mat,delimiter=',')
        

**Peek at bootstrap:**

In [None]:
# a. load
# forward slash: the backslash pattern used before only matched on Windows
# (and '\m' is an invalid escape sequence)
files = glob.glob(f'moments/moments_bootstrap*_{sample}.txt')
# glob returns files in arbitrary order: sort the chunks by their first
# bootstrap number so that column b below really is bootstrap b
files.sort(key=lambda file: int(os.path.basename(file).split('_')[2]))
moms_boots = np.concatenate([np.genfromtxt(file,delimiter=',') for file in files],axis=0).T

# b. individual
# the bootstrap files were written without the cdf moments
I = ~moms_df.momname.str.contains('cdf')
for b in range(5):
    moms_df.loc[I,f'b{b}'] = moms_boots[:,b]

# c. variance across bootstraps
moms_df.loc[I,'var'] = np.var(moms_boots,axis=1)
    
# d. show
moms_df.loc[I,['momname','value','b0','b1','b2','b3','b4','var']]