# Import

In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
import matplotlib.pyplot as plt

# Transformation

## Use `xml.etree.ElementTree` module parse data

In [3]:
def convert_xml_to_dataframe(file_path):
    '''
    convert xml file into a dataframe format
    '''
    tree = ET.parse(file_path)
    root = tree.getroot()
    # parse scenatio dates
    date_list = []
    for date in root.iter('scenarioDates'):
        date_list.append(date.text.split())
    # dimensionality
    date_list = date_list[0]
    # parse desk names
    desk_list = []
    for desk in root.iter('value'):
        if 'DESK' in desk.text:
            desk_list.append(desk.text.split())
    # dimensionality
    desk_list = [desk_list[i][0] for i in range(len(desk_list))]
    # parst risk class names
    risk_class_list = []
    for risk_class in root.iter('pvRiskClassList'):
        risk_class_list.append(risk_class.text.split())
    # dimensionality
    risk_class_list = risk_class_list[0]
    # combine desk names with risk class names
    desk_class_list = []
    for desk in desk_list:
        for risk_class in risk_class_list:
            desk_class_list.append(str(desk) + '_' + str(risk_class))
    # parst pv value
    pv_list = []
    for pv in root.iter('pvList'):
        pv_list.append(pv.text.split())
    # create a dataframe and change data type to float
    PV = pd.DataFrame(pv_list, dtype=float).T
    PV.columns = desk_class_list
    PV = PV.sort_index(axis=1)
    PV.insert(0, 'DATE', date_list)
    return PV

# Read data

In [4]:
sp_pv = convert_xml_to_dataframe('10Y_P&L.xml')

In [5]:
sp_pv.head()

Unnamed: 0,DATE,DESK4_A_TOTAL,DESK4_B_TOTAL,DESK4_C_TOTAL,DESK4_D_TOTAL,DESK4_E_TOTAL,DESK4_F_TOTAL,DESK4_G_TOTAL,DESK4_G_TOTAL.1,DESK4_H_TOTAL,DESK4_I_TOTAL,DESK4_J_TOTAL,DESK4_K_TOTAL
0,2006-09-01,-36413.48855,39195.963364,-45139.233907,-1070008.0,-1069001.0,185054.848215,-1724339.0,808192.664841,-9308.402859,147896.872462,-7624.302597,10992.502642
1,2006-09-04,-1178.214171,4216.087228,-10280.017152,-1068437.0,-1067430.0,162375.528238,-1721056.0,830023.179129,-9555.320378,108767.072623,-7871.212,11239.428278
2,2006-09-05,19912.455767,-16719.338608,10592.403697,-1069343.0,-1068336.0,205649.216115,-1722798.0,786528.653362,-9610.516448,85441.295376,-7926.396303,11294.636115
3,2006-09-06,20027.938821,-17311.79621,10916.628181,-1069812.0,-1068805.0,181776.753119,-1723964.0,807871.263783,-9458.532889,85247.637803,-7774.438839,11142.62646
4,2006-09-07,26817.860586,-23885.911339,17681.553907,-1069897.0,-1068889.0,167246.304529,-1724250.0,819146.481139,-9503.316217,77658.255402,-7819.207852,11187.424105


# Get Stress Period

In [6]:
# get profit and lost by subtracing the next day's present value
sp_pl = sp_pv.iloc[:, 1:].diff(periods=-1)
sp_pl.insert(0, "DATE", sp_pv.DATE)
sp_pl = sp_pl[:-1]

In [7]:
sp_pl.head()

Unnamed: 0,DATE,DESK4_A_TOTAL,DESK4_B_TOTAL,DESK4_C_TOTAL,DESK4_D_TOTAL,DESK4_E_TOTAL,DESK4_F_TOTAL,DESK4_G_TOTAL,DESK4_G_TOTAL.1,DESK4_H_TOTAL,DESK4_I_TOTAL,DESK4_J_TOTAL,DESK4_K_TOTAL
0,2006-09-01,-35235.274378,34979.876136,-34859.216756,-1571.459084,-1570.45817,22679.319978,-3282.918731,-21830.514288,246.91752,39129.799838,246.909404,-246.925636
1,2006-09-04,-21090.669938,20935.425836,-20872.420849,906.724476,906.198418,-43273.687877,1742.068718,43494.525767,55.19607,23325.777248,55.184303,-55.207837
2,2006-09-05,-115.483054,592.457602,-324.224484,468.70588,468.331781,23872.462996,1165.960571,-21342.610421,-151.983559,193.657573,-151.957464,152.009654
3,2006-09-06,-6789.921765,6574.115129,-6764.925725,84.417662,84.422745,14530.44859,285.938608,-11275.217356,44.783329,7589.382401,44.769013,-44.797644
4,2006-09-07,-48141.100814,47992.28529,-48022.796045,-843.668979,-843.089423,-9521.832758,-1855.504737,-314.697334,-3.54364,53524.027676,-3.537871,3.54941


In [8]:
# sum each 250 days period profit and lost
sp_pl_sum = sp_pl.iloc[:, 1:].rolling(250).sum()
sp_pl_sum.insert(0, "DATE", sp_pl.DATE)
sp_pl_sum = sp_pl_sum[250:]
sp_pl_sum.head()

Unnamed: 0,DATE,DESK4_A_TOTAL,DESK4_B_TOTAL,DESK4_C_TOTAL,DESK4_D_TOTAL,DESK4_E_TOTAL,DESK4_F_TOTAL,DESK4_G_TOTAL,DESK4_G_TOTAL.1,DESK4_H_TOTAL,DESK4_I_TOTAL,DESK4_J_TOTAL,DESK4_K_TOTAL
250,2007-08-21,45829.832661,-43207.790284,46640.246325,192.371824,192.464354,-21114.864379,768.089721,45426.615471,308.391575,-51644.55324,308.364775,-308.418376
251,2007-08-22,41816.04706,-44067.081671,42585.047983,1173.22124,1172.604926,34203.043953,3103.848553,-20649.430778,95.796773,-47360.374952,95.787139,-95.806407
252,2007-08-23,96282.748588,-95890.862549,96172.87286,1917.903566,1916.437672,48030.35744,3735.82463,-48330.653424,-252.376179,-107476.37592,-252.344729,252.407629
253,2007-08-24,28375.535891,-28824.656148,29906.776749,2604.125362,2602.067771,-575.358281,5197.972993,-21641.521244,1010.207245,-32218.646688,1010.056111,-1010.358379
254,2007-08-27,58831.778516,-58508.982774,58318.464043,-2245.469586,-2243.864824,14286.073999,-4643.989265,-32594.96029,21.488361,-64977.554931,21.518822,-21.4579


In [9]:
# combine duplicated columns
sp_pl_sum = sp_pl_sum.groupby(sp_pl_sum.columns, axis=1).sum()
# get the minimum value list
sp_pl_sum_min = sp_pl_sum.min()[1:]
sp_pl_sum_min

DESK4_A_TOTAL    -664750
DESK4_B_TOTAL    -725398
DESK4_C_TOTAL    -660331
DESK4_D_TOTAL   -12317.2
DESK4_E_TOTAL   -12307.6
DESK4_F_TOTAL    -239251
DESK4_G_TOTAL    -294454
DESK4_H_TOTAL   -12971.9
DESK4_I_TOTAL    -809562
DESK4_J_TOTAL   -12971.3
DESK4_K_TOTAL   -10826.2
dtype: object

In [10]:
# find stress period end date
count_desk = 0
end_date_index = []
sp_end_date = []
for count_desk in range(sp_pl_sum_min.size):
    index = sp_pl_sum[sp_pl_sum[sp_pl_sum_min.index[count_desk]]== sp_pl_sum_min.values[count_desk]].index[0]
    end_date_index.append(index)
    sp_end_date.append(sp_pl_sum.iloc[index].DATE)
sp_end_date

['2010-11-22',
 '2010-03-05',
 '2010-11-22',
 '2010-08-18',
 '2010-08-18',
 '2010-11-22',
 '2009-11-11',
 '2009-01-08',
 '2010-03-05',
 '2009-01-08',
 '2009-12-24']

In [11]:
# find stress period start date
sp_start_date = []
for i in range(len(end_date_index)):
    sp_start_date.append(sp_pl_sum.iloc[end_date_index[i]-250].DATE)
sp_start_date

['2009-12-03',
 '2009-03-18',
 '2009-12-03',
 '2009-08-31',
 '2009-08-31',
 '2009-12-03',
 '2008-11-24',
 '2008-01-22',
 '2009-03-18',
 '2008-01-22',
 '2009-01-08']

In [12]:
# store into dataframe
stress_period = pd.DataFrame({'Desk' : sp_pl_sum_min.index, 'start_date' : sp_start_date, 'end_date' : sp_end_date})

In [13]:
stress_period

Unnamed: 0,Desk,start_date,end_date
0,DESK4_A_TOTAL,2009-12-03,2010-11-22
1,DESK4_B_TOTAL,2009-03-18,2010-03-05
2,DESK4_C_TOTAL,2009-12-03,2010-11-22
3,DESK4_D_TOTAL,2009-08-31,2010-08-18
4,DESK4_E_TOTAL,2009-08-31,2010-08-18
5,DESK4_F_TOTAL,2009-12-03,2010-11-22
6,DESK4_G_TOTAL,2008-11-24,2009-11-11
7,DESK4_H_TOTAL,2008-01-22,2009-01-08
8,DESK4_I_TOTAL,2009-03-18,2010-03-05
9,DESK4_J_TOTAL,2008-01-22,2009-01-08
