-
Notifications
You must be signed in to change notification settings - Fork 0
/
finance_a.py
127 lines (116 loc) · 5.01 KB
/
finance_a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# -*- coding: utf-8 -*-
"""finance_a
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1AQ4WwZ4VbB8ujS2JFmK4EOg0foZP7JRx
"""
import pandas as pd
import numpy as np
import sys
# Direct-download link to the dataset (Google Drive).
link_dataset = 'https://drive.google.com/uc?export=download&id=1ogGzNovZw1l9Q4PBD60TbqbRykiyC5Ek'

# Reporting window: observations are later filtered to CutOffDate in
# [low_date, high_date].
low_date = '2017-04-01'
high_date = '2018-09-30'

# Abort early if either window boundary is not a parseable date.
for boundary in (low_date, high_date):
    try:
        pd.to_datetime(boundary)
    except ValueError:
        print('Date isn\'t valid, please check it, exit')
        sys.exit()
# Consistent float display for the printed summary tables.
pd.options.display.float_format = '{:,.3f}'.format

# Load the loan-level dataset; the three date columns are parsed up front so
# they can be compared against the reporting window as datetimes.
data = pd.read_csv(link_dataset,
                   parse_dates=['FirstDisbursementDate', 'CutOffDate', 'ODdate'],
                   infer_datetime_format=True)

# Keep only observations whose cut-off date falls inside the reporting window.
data = data.query('(CutOffDate >= @low_date) & (CutOffDate <= @high_date)')

# BUG FIX: the emptiness check previously ran BEFORE the date filter, so a
# date range matching no rows was never detected (the message already said
# "after filtering"). Check after filtering, as intended.
if data.empty:
    print('Dataframe is empty after filtering, exit')
    sys.exit()
# Assign IFRS9-style stages from days in arrears:
#   Stage 1: <= 30 days, Stage 2: 31-90 days, Stage 3 (default): > 90 days.
# Rows with NoOfArrearDays = NaN match none of the conditions and keep
# Stage = NaN — TODO confirm the source data has no missing arrears.
data.loc[data["NoOfArrearDays"] <= 30,'Stage'] = 1
data.loc[(data["NoOfArrearDays"] > 30) & (data["NoOfArrearDays"] <= 90),'Stage'] = 2
data.loc[data["NoOfArrearDays"] > 90,'Stage'] = 3
# Sort so that each account's observations are consecutive and in date order;
# the shift(-1) below then gives "this account's next observation".
# NOTE(review): shift(-1) is one ROW ahead — it equals "3 months later" only
# if observations are quarterly with no gaps; verify against the data.
data.sort_values(by=['AccountID','CutOffDate'], inplace=True)
# Next-period stage and arrears (fill_value=1 is a placeholder for the very
# last row; it is overwritten by the masks below).
data[['Next_stage','Next_arrears']] = data[['Stage','NoOfArrearDays']].shift(periods=-1, fill_value=1 )
# True on each account's LAST observation (next row belongs to another account,
# or is past the end of the frame).
last_date = data.AccountID.ne(data.AccountID.shift(periods=-1, fill_value=1))
# Accounts whose history ends before the overall last cut-off date are treated
# as paid off (they simply stop appearing in later snapshots).
paid = (data['CutOffDate'] < data['CutOffDate'].max()) & last_date
# For last observations there is no real "next" row: carry the current values
# forward; for paid-off accounts, force next stage/arrears to 0.
# Order matters: the 'paid' masks must run after the 'last_date' masks,
# since paid rows are a subset of last_date rows.
data['Next_stage'].mask(last_date, data['Stage'], inplace=True)
data['Next_arrears'].mask(last_date, data['NoOfArrearDays'], inplace=True)
data['Next_stage'].mask(paid, 0, inplace=True)
data['Next_arrears'].mask(paid, 0, inplace=True)
# Transition indicator columns (0/1 integers):
#   PDS1_in_3M: stage-1 loan that is in default (stage 3) next period.
#   PDS2_in_3M: stage-2 loan that is in default (stage 3) next period.
#   LGD_in_3M : defaulted loan that has left stage 3 next period (recovery).
s1_to_default = (data['Stage'] == 1) & (data['Next_stage'] == 3)
s2_to_default = (data['Stage'] == 2) & (data['Next_stage'] == 3)
left_default = (data['Stage'] == 3) & (data['Next_stage'] != 3)
data['PDS1_in_3M'] = s1_to_default.astype(int)
data['PDS2_in_3M'] = s2_to_default.astype(int)
data['LGD_in_3M'] = left_default.astype(int)
# Outstanding principal two rows ahead (presumably "6 months later" given
# quarterly snapshots — TODO confirm cadence). fill_value=1 is a placeholder
# for the final rows; it is zeroed/overwritten by the masks below.
data['OutstandingPrincipal_in6M'] = data['OutstandingPrincipal'].shift(periods=-2, fill_value=1 )
# Only stage-3 (defaulted) exposures are relevant for the LGD recovery rate.
data['OutstandingPrincipal_in6M'].mask(data['Stage']!=3, 0,inplace=True)
# True where the row two positions ahead belongs to a different account
# (or is past the end), i.e. no genuine 6-month look-ahead exists.
another_acc = data.AccountID.ne(data.AccountID.shift(periods=-2, fill_value=1))
data['OutstandingPrincipal_in6M'].mask(another_acc, 0, inplace=True)
# Per (cut-off date, stage) counts — used later for the stage bucket sizes.
counts = data.groupby(['CutOffDate','Stage']).count()
# Second-to-last cut-off date in the window.
# NOTE(review): whole-frame .sum() is used only for its index here; pandas 2.x
# changed numeric_only defaults — .groupby('CutOffDate').size().index[-2]
# would be equivalent and version-safe; verify before upgrading pandas.
last_2_date = data.groupby(['CutOffDate']).sum().index[-2]
# For the last two snapshots no 6-month-ahead row can exist; for stage-3
# exposures there, assume the principal is still fully outstanding.
# Order matters: this must run after the another_acc zeroing above.
data['OutstandingPrincipal_in6M'].mask(another_acc & (data['CutOffDate']>=last_2_date)
& (data['Stage']==3), data['OutstandingPrincipal'], inplace=True)
#still3 = data.query('(Stage == 3) & (Next_stage == 3)')
#counts_still3 = still3.groupby(['CutOffDate']).count()
# Per cut-off date sums of the indicator and principal columns.
sums = data.groupby(['CutOffDate']).sum()
# Flatten (CutOffDate, Stage) back into columns for positional .loc lookups.
counts.reset_index(inplace=True)
# PD result table: per cut-off date, the number of defaults out of each
# stage bucket and the resulting quarterly default rates.
results_pd = pd.DataFrame({
    'CutOffDate': sums.index.date,
    'PDS_1_in_3M': sums['PDS1_in_3M'],
    'St_1_count': counts.loc[counts['Stage'] == 1, 'AccountID'].values,
    'PDS_2_in_3M': sums['PDS2_in_3M'],
    'St_2_count': counts.loc[counts['Stage'] == 2, 'AccountID'].values})
# Quarterly default rates: defaults observed / bucket size.
results_pd.eval(''' PDS_1_rate = PDS_1_in_3M / St_1_count
PDS_2_rate = PDS_2_in_3M / St_2_count''', inplace=True)
# Final rates: mean over all but the last period (the last period has no
# look-ahead), appended as a summary row.
# NOTE: pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd1 = results_pd[:-1]['PDS_1_rate'].mean()
pd2 = results_pd[:-1]['PDS_2_rate'].mean()
results_pd = pd.concat(
    [results_pd, pd.DataFrame([{'PDS_1_rate': pd1, 'PDS_2_rate': pd2}])],
    ignore_index=True)
# Reorder columns for the report.
# BUG FIX: the stage-2 columns were listed as a duplicate of the stage-1
# columns ('PDS_1_in_3M','St_1_count' appeared twice), so PDS_2_in_3M and
# St_2_count were silently dropped from the output.
results_pd = results_pd[['CutOffDate', 'PDS_1_in_3M', 'St_1_count',
                         'PDS_1_rate', 'PDS_2_in_3M', 'St_2_count', 'PDS_2_rate']]
# Blank out NaNs (the summary row's non-rate cells) for readability.
results_pd.fillna('', inplace=True)
# Persist the PD table.
results_pd.to_csv('result_A_pd.csv')
# LGD result table: stage-3 (PAR90) outstanding principal today vs. what is
# still outstanding six months later, per cut-off date.
st3_princ = data.loc[data['Stage'] == 3].groupby(['CutOffDate']).sum()
st3_princ = st3_princ['OutstandingPrincipal'].values
results_lgd = pd.DataFrame({
    'CutOffDate': sums.index.date,
    'PAR90OutstIn6M': sums['OutstandingPrincipal_in6M'],
    'PAR90Outstoday': st3_princ}, index=None)
# Quarterly recovery rate on PAR90 exposure.
results_lgd.eval('QTRLGDPAR90Rec3M = PAR90OutstIn6M / PAR90Outstoday', inplace=True)
# Summary row: mean rate over all but the last two periods (those have no
# genuine 6-month look-ahead).
# NOTE: pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results_lgd = pd.concat(
    [results_lgd,
     pd.DataFrame([{'QTRLGDPAR90Rec3M': results_lgd[:-2]['QTRLGDPAR90Rec3M'].mean()}])],
    ignore_index=True)
# Blank out NaNs (the summary row's non-rate cells) for readability.
results_lgd.fillna('', inplace=True)
# Persist the LGD table.
results_lgd.to_csv('result_A_lgd.csv')
# Show both result tables.
print(results_pd,'\n\n',results_lgd)