# Composite Indicators: Minimal Pipeline

## 1 Basic settings

### 1.1 Load libraries

In [None]:
import os
from cif import cif
import pandas as pd
import re
import datetime
import warnings
from IPython.display import Image

### 1.2 Check availability of X-13ARIMA-SEATS model

The model can be downloaded from https://www.census.gov/srd/www/x13as/. The directory containing it must be stored in the `X13PATH` environment variable, which the cell below prints.

In [None]:
# Print the X-13ARIMA-SEATS location; raises KeyError if the X13PATH
# environment variable is not set.
print(os.environ['X13PATH'])

### 1.3 Settings

Change the country of interest and other default settings here. For the complete list of available country codes, run:

```python
cif.getOECDJSONStructure(dsname = 'MEI', showValues = [0])
```

In [None]:
# Target country code; used for the OECD downloads and in output file names
country = 'CZE'

# Optional: set the working directory to the folder where the plots and logs should be saved
#os.chdir('C:/path/')

# Set to True for black and white visualisations
bw = False

# Set to True to save the original (downloaded) data sets as CSV files
saveData = True

### 1.4 Output directory

In [None]:
# Time stamp with minute resolution, so separate runs get separate folders
strDate = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')

# Output folder (relative path); an already existing folder is reused
outputDir = '_'.join(['plots', country, strDate])
os.makedirs(outputDir, exist_ok=True)

## 2 Data Load

Loading data from OECD API.

In [None]:
# Monthly series of the MEI data set for the selected country
data_all, subjects_all, measures_all = cif.createDataFrameFromOECD(
    countries=[country],
    dsname='MEI',
    frequency='M',
)

# Quarterly B1_GE series of the QNA data set, used as the reference series
data_rs, subjects_rs, measures_rs = cif.createDataFrameFromOECD(
    countries=[country],
    dsname='QNA',
    subject=['B1_GE'],
    frequency='Q',
)

# DataFrame.shape is a (rows, columns) pair, so it feeds both placeholders
print('Downloaded MEI data set size: %d x %d' % data_all.shape)
print('Downloaded reference data set size: %d x %d' % data_rs.shape)

In [None]:
# Optionally store every downloaded table as a CSV file in the output directory

if saveData:

    for csvName, table in (
        ('data_all.csv', data_all),
        ('subjects_all.csv', subjects_all),
        ('measures_all.csv', measures_all),
        ('data_rs.csv', data_rs),
        ('subjects_rs.csv', subjects_rs),
        ('measures_rs.csv', measures_rs),
    ):
        table.to_csv(os.path.join(outputDir, csvName))

In [None]:
# Last 12 rows of the MEI data (the data were requested at monthly frequency,
# so this is presumably the most recent year)
data_all.tail(12)

In [None]:
# Split the MEI subjects into the OECD leading-indicator groups (ids that
# contain a word starting with LOCO / LORS / LOLI) and the remaining
# candidate series.

# Position of the 'subject' level within the column index of data_all
colMultiInd = data_all.columns.names.index('subject')


# Leading indicators: Component series

ind_LOCO = subjects_all['id'].apply(lambda x: re.search(r'\bLOCO', x) is not None)
subjects_LOCO = subjects_all[ind_LOCO]


# Leading indicators: Reference series

ind_LORS = subjects_all['id'].apply(lambda x: re.search(r'\bLORS', x) is not None)
subjects_LORS = subjects_all[ind_LORS]


# Leading indicators: CLI

ind_LOLI = subjects_all['id'].apply(lambda x: re.search(r'\bLOLI', x) is not None)
subjects_LOLI = subjects_all[ind_LOLI]


# Candidate time series: every subject that belongs to none of the groups above.
# `~` is the boolean complement of a mask; unary minus on boolean data is
# rejected by NumPy and should not be relied upon.

subjects_adj = subjects_all[~(ind_LOCO | ind_LORS | ind_LOLI)]

# Build the id lookup once instead of re-creating a list for every column
candidateIds = set(subjects_adj['id'])
data_adj = data_all.loc[:, [x for x in data_all.columns if x[colMultiInd] in candidateIds]].copy()

## 3 Data Transformations

### 3.1 Reference Series

#### 3.1.1 Priority list of reference series (GDP) and frequency conversion

In [None]:
# Measures of the reference series, listed in order of priority
rsPriorityList = [
    'LNBQRSA',  # Best fit with OECD reference series
    'CQR',
    'LNBQR',
    'DNBSA',
    'DOBSA',
    'CQRSA',
    'CARSA',
    'GPSA',
    'GYSA',
    'CPCARSA',
    'VIXOBSA',
    'VOBARSA',
    'VPVOBARSA',
    'HCPCARSA',
    'HVPVOBARSA',
]

# Only proceed when the download returned at least one row
if len(data_rs) > 0:

    # Pick one measure according to the priority list, then pass the result
    # through the index-conversion helpers one after another
    rsq = cif.getOnlyBestMeasure(df=data_rs, priorityList=rsPriorityList)
    for convert in (cif.getRidOfMultiindex, cif.renameQuarterlyIndex, cif.getIndexAsDate):
        rsq = convert(df=rsq)

    # Monthly version of the quarterly series, without rows containing missing values
    rs = cif.createMonthlySeries(df=rsq)
    rs.dropna(inplace=True)

In [None]:
# All available measures of the reference series: last 4 rows of the
# quarterly data (presumably the most recent year)
data_rs.tail(4)

In [None]:
# Selected measure of the reference series: last 12 rows of the monthly series.
# NOTE(review): `rs` is only defined when the reference download was non-empty.
rs.tail(12)

#### 3.1.2 Seasonal adjustment, outlier filtering and short-term prediction & Cycle identification (Hodrick-Prescott filter) & Normalisation

In [None]:
# Run the transformation pipeline on the reference series. Plots are not shown
# (showPlots=False); outputDir is passed as savePlots and the open log file as saveLogs.
# The context manager closes the log file even if the pipeline raises.
with open(os.path.join(outputDir, country + '_fileLogs_rsTransformation.txt'), 'w') as fileLogs:
    rs_SA_HP_norm = cif.pipelineTransformations(
        rs,
        showPlots=False,
        savePlots=outputDir,
        saveLogs=fileLogs,
    )

### 3.2 Individual indicators

#### 3.2.1 Priority list of OECD available measures

In [None]:
# Measures of the individual indicators, listed in order of priority
priorityList = [
    'NCML',
    'ML',
    'CXML',
    'ST',
    'NCCU',
    'CXCU',
    'IXOB',
    'NCMLSA',
    'MLSA',
    'CXMLSA',
    'STSA',
    'NCCUSA',
    'CXCUSA',
    'IXOBSA',
    'IXNSA',
    'GP',
    'GY',
]

# Only proceed when there is at least one row of candidate data
if len(data_adj) > 0:

    # Innermost call first: pick one measure per the priority list, then
    # apply the two index-conversion helpers
    data = cif.getIndexAsDate(
        cif.getRidOfMultiindex(
            df=cif.getOnlyBestMeasure(df=data_adj, priorityList=priorityList)
        )
    )

#### 3.2.2 Seasonal adjustment, outlier filtering and short-term prediction & Cycle identification (Hodrick-Prescott filter) & Normalisation

In [None]:
# Run the transformation pipeline on all candidate indicators.
# Warnings are suppressed only inside this block; the previous warning
# filters are restored when catch_warnings() exits.
with warnings.catch_warnings():

    warnings.simplefilter("ignore")

    # The context manager closes the log file even if the pipeline raises
    with open(os.path.join(outputDir, 'fileLogs_dataTransformation.txt'), 'w') as fileLogs:
        data_SA_HP_norm = cif.pipelineTransformations(
            df=data,
            showPlots=False,
            savePlots=outputDir,
            saveLogs=fileLogs,
            createInverse=True,
        )

## 4 Turning-point detection (Bry-Boschan algorithm)

### 4.1 Reference series

In [None]:
# Detect turning points of the transformed reference series.
# The context manager closes the log file even if the detection raises.
with open(os.path.join(outputDir, country + '_fileLogs_rsEvaluation.txt'), 'w') as fileLogs:
    rs_ind_turningPoints = cif.pipelineTPDetection(
        df=rs_SA_HP_norm,
        printDetails=False,
        showPlots=False,
        savePlots=outputDir,
        saveLogs=fileLogs,
    )

In [None]:
# Display a plot saved in the output directory.
# Change the name of the series (middle part of the file name) here.
Image(
    os.path.join(outputDir, ''.join([country, '_B1_GE_LNBQRSA', '_05_ext.png'])),
    width=600,
)

### 4.2 Individual indicators

In [None]:
# Detect turning points of the transformed individual indicators; the column
# names of the untransformed data are passed as origColumns.
# The context manager closes the log file even if the detection raises.
with open(os.path.join(outputDir, 'fileLogs_dataEvaluation.txt'), 'w') as fileLogs:
    data_ind_turningPoints = cif.pipelineTPDetection(
        df=data_SA_HP_norm,
        origColumns=list(data.columns),
        printDetails=False,
        showPlots=False,
        savePlots=outputDir,
        saveLogs=fileLogs,
    )

In [None]:
# Display a plot saved in the output directory.
# Change the name of the series (middle part of the file name) here.
Image(
    os.path.join(outputDir, ''.join([country, '_XTEXVA01_NCML', '_05_ext.png'])),
    width=600,
)

## 5 Turning-points matching

In [None]:
# Match the turning points of every individual indicator (df2 / ind2) against
# those of the reference series (df1 / ind1).
# The context manager closes the log file even if the matching raises.
with open(os.path.join(outputDir, country + '_fileLogs_tpMatching.txt'), 'w') as fileLogs:
    (
        data_ind_extOrd,
        data_ind_time,
        data_ind_missing,
        data_ind_missingEarly,
        data_ind_extra,
    ) = cif.pipelineTPMatching(
        df1=rs_SA_HP_norm,
        df2=data_SA_HP_norm,
        ind1=rs_ind_turningPoints,
        ind2=data_ind_turningPoints,
        printDetails=False,
        showPlots=False,
        savePlots=outputDir,
        saveLogs=fileLogs,
        nameSuffix='_06_matching' + '_rs' + country,
    )

In [None]:
# Display a plot saved in the output directory.
# Change the name of the series (second part of the file name) here.
Image(
    os.path.join(
        outputDir,
        ''.join([country, '_XTEXVA01_NCML', '_06_matching_rs', country, '.png']),
    ),
    width=600,
)

## 6 Evaluation

In [None]:
# Evaluate the individual indicators against the reference series using the
# outputs of the turning-point matching; maxInd is set to 15
data_totalEval, data_selectedEval, data_selectedCol = cif.pipelineEvaluation(
    df1=rs_SA_HP_norm,
    df2=data_SA_HP_norm,
    missing=data_ind_missing,
    missingEarly=data_ind_missingEarly,
    extra=data_ind_extra,
    time=data_ind_time,
    maxInd=15,
)

In [None]:
# Display the second result returned by cif.pipelineEvaluation
# (presumably the evaluation of the selected indicators — see the cif documentation)
data_selectedEval

## 7 Aggregation & final evaluation 

### 7.1 CLI construction

In [None]:
# Values of the de-trended, smoothed and normalised components (selected columns only)
agg_cMat = data_SA_HP_norm.loc[:, data_selectedCol]

# Build the composite indicator and label its column with the country code
CLI = cif.pipelineCreateCLI(agg_cMat)
CLI = CLI.rename(columns={'CLI': country + '_CLI'})

In [None]:
# Compare the constructed CLI with the transformed reference series
# (presumably a plot of both series — see the cif documentation)
cif.compareTwoSeries(CLI, rs_SA_HP_norm)

### 7.2 CLI turning points

In [None]:
# Detect turning points of the constructed CLI.
# The context manager closes the log file even if the detection raises.
with open(os.path.join(outputDir, country + '_fileLogs_CLIEvaluation.txt'), 'w') as fileLogs:
    CLI_ind_turningPoints = cif.pipelineTPDetection(
        CLI,
        printDetails=False,
        showPlots=False,
        savePlots=outputDir,
        saveLogs=fileLogs,
    )

### 7.3 Match turning points

In [None]:
# Match the turning points of the CLI (df2 / ind2) against those of the
# reference series (df1 / ind1); no log file is passed here
(
    CLI_ind_extOrd,
    CLI_ind_time,
    CLI_ind_missing,
    CLI_ind_missingEarly,
    CLI_ind_extra,
) = cif.pipelineTPMatching(
    df1=rs_SA_HP_norm,
    df2=CLI,
    ind1=rs_ind_turningPoints,
    ind2=CLI_ind_turningPoints,
    showPlots=False,
    savePlots=outputDir,
    nameSuffix='_06_matching' + '_rs' + country,
    bw=bw,
)

In [None]:
# Display a plot saved in the output directory.
# Change the name of the series (second part of the file name) here.
Image(
    os.path.join(
        outputDir,
        ''.join([country, '_CLI', '_06_matching_rs', country, '.png']),
    ),
    width=600,
)

### 7.4 Basic characteristics of created CLI

In [None]:
# Evaluate the CLI against the reference series using the outputs of the
# turning-point matching; called with evalOnly=True
CLI_eval = cif.pipelineEvaluation(
    df1=rs_SA_HP_norm,
    df2=CLI,
    missing=CLI_ind_missing,
    missingEarly=CLI_ind_missingEarly,
    extra=CLI_ind_extra,
    time=CLI_ind_time,
    evalOnly=True,
)

In [None]:
# Display the result of the CLI evaluation computed in the previous cell
CLI_eval