In [1]:
import numpy as np
import pandas as pd
import saspy 
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer

from lifelines import KaplanMeierFitter
from lifelines import NelsonAalenFitter

# Import Data

In [2]:
# Read the mortgage workbook from the current working directory into `data`.
data = pd.read_excel(io=r'Mortgage.xlsx')

# Data Description

In [3]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,event,ID,default,Prepayment,vintage,cred_score,DBT_RATIO,start_date,end_date,event_time
0,2,1201510415699000,0,1,2005,757.0,0.43,2005-12-01,2006-01-01,31
1,2,1201510419849000,0,1,2005,757.0,0.44,2005-12-01,2006-01-01,31
2,2,1201517819379000,0,1,2005,784.0,0.36,2005-12-01,2006-01-01,31
3,2,1201518749199000,0,1,2005,618.0,0.3,2005-12-01,2006-01-01,31
4,2,1201519183129000,0,1,2005,730.0,0.25,2005-12-01,2006-01-01,31


In [4]:
# Show the column labels of the raw frame.
data.columns

Index(['event', 'ID', 'default', 'Prepayment', 'vintage', 'cred_score',
       'DBT_RATIO', 'start_date', 'end_date', 'event_time'],
      dtype='object')

In [5]:
# Print the frame summary: row count, per-column non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1023 entries, 0 to 1022
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   event       1023 non-null   int64         
 1   ID          1023 non-null   int64         
 2   default     1023 non-null   int64         
 3   Prepayment  1023 non-null   int64         
 4   vintage     1023 non-null   int64         
 5   cred_score  1007 non-null   float64       
 6   DBT_RATIO   1020 non-null   float64       
 7   start_date  1023 non-null   datetime64[ns]
 8   end_date    1023 non-null   datetime64[ns]
 9   event_time  1023 non-null   int64         
dtypes: datetime64[ns](2), float64(2), int64(6)
memory usage: 80.0 KB


In [6]:
# Remove the columns this model does not use; `data` itself is left untouched
# because drop() returns a new frame.
df1 = data.drop(columns=['default', 'Prepayment', "ID"])

In [7]:
# Preview the first five rows after dropping the unused columns.
df1.head(n=5)

Unnamed: 0,event,vintage,cred_score,DBT_RATIO,start_date,end_date,event_time
0,2,2005,757.0,0.43,2005-12-01,2006-01-01,31
1,2,2005,757.0,0.44,2005-12-01,2006-01-01,31
2,2,2005,784.0,0.36,2005-12-01,2006-01-01,31
3,2,2005,618.0,0.3,2005-12-01,2006-01-01,31
4,2,2005,730.0,0.25,2005-12-01,2006-01-01,31


In [8]:
# Print the summary of the reduced frame: non-null counts, dtypes and memory usage.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1023 entries, 0 to 1022
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   event       1023 non-null   int64         
 1   vintage     1023 non-null   int64         
 2   cred_score  1007 non-null   float64       
 3   DBT_RATIO   1020 non-null   float64       
 4   start_date  1023 non-null   datetime64[ns]
 5   end_date    1023 non-null   datetime64[ns]
 6   event_time  1023 non-null   int64         
dtypes: datetime64[ns](2), float64(2), int64(3)
memory usage: 56.1 KB


# Data Preprocessing

In [9]:
# Count the missing values in each column.
df1.isna().sum()

event          0
vintage        0
cred_score    16
DBT_RATIO      3
start_date     0
end_date       0
event_time     0
dtype: int64

In [10]:
# Impute missing values in df1:
#   * float64 columns   -> column mean
#   * every other dtype -> most frequent value (first mode)
for col in df1.columns:
    if df1[col].dtype == "float64":
        fill_value = df1[col].mean()
    else:
        # Series.mode() returns a Series (ties are possible). Passing that
        # Series to fillna() would align it on the index and fill at most the
        # first row(s), so take the first mode as a scalar instead.
        modes = df1[col].mode()
        if modes.empty:
            # Column is entirely missing: there is nothing to impute from.
            continue
        fill_value = modes.iloc[0]
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: chained in-place mutation is deprecated and does not
    # update df1 when pandas copy-on-write is enabled.
    df1[col] = df1[col].fillna(fill_value)

In [11]:
# Confirm that no missing values remain after imputation.
df1.isna().sum()

event         0
vintage       0
cred_score    0
DBT_RATIO     0
start_date    0
end_date      0
event_time    0
dtype: int64

In [12]:
# Decode byte strings in object columns to text.
# Only values that are actually `bytes` are decoded; everything else (str,
# NaN, numbers) is passed through unchanged. Calling `.str.decode` on the whole
# column would turn every non-bytes value into NaN and raises on object
# columns that hold non-string data.
for col in df1.columns:
    if df1[col].dtype == 'object':
        df1[col] = df1[col].map(
            lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
        )

In [13]:
# Print the frequency table of every column except the last one.
for column_name in df1.columns[:-1]:
    print(column_name)
    print(df1[column_name].value_counts())
    print("\n")

event
2    605
0    246
1    172
Name: event, dtype: int64


vintage
2005    196
2006    180
2004    153
2007    149
2003     91
2008     55
2002     44
2001     35
2000     24
2009     23
1998     15
1999     13
1996     10
2010      9
2011      8
2012      7
1997      4
1994      4
1991      2
1995      1
Name: vintage, dtype: int64


cred_score
723.878848    16
718.000000    14
801.000000    13
775.000000    12
802.000000    12
              ..
624.000000     1
580.000000     1
583.000000     1
633.000000     1
563.000000     1
Name: cred_score, Length: 217, dtype: int64


DBT_RATIO
0.24    42
0.25    36
0.30    36
0.29    36
0.37    33
        ..
0.59     1
0.70     1
0.64     1
0.58     1
1.08     1
Name: DBT_RATIO, Length: 71, dtype: int64


start_date
2005-12-01    586
2007-05-01     25
2006-08-01     20
2006-06-01     20
2007-08-01     20
             ... 
2009-02-01      1
2010-06-01      1
2012-09-01      1
2009-08-01      1
2009-10-01      1
Name: start_date, Length: 62, dty

In [14]:
#This is where I would normally create dummy (one-hot) variables for character/categorical columns, but there are none in this dataset.

In [15]:
# Preview the first five rows of the preprocessed frame.
df1.head(n=5)

Unnamed: 0,event,vintage,cred_score,DBT_RATIO,start_date,end_date,event_time
0,2,2005,757.0,0.43,2005-12-01,2006-01-01,31
1,2,2005,757.0,0.44,2005-12-01,2006-01-01,31
2,2,2005,784.0,0.36,2005-12-01,2006-01-01,31
3,2,2005,618.0,0.3,2005-12-01,2006-01-01,31
4,2,2005,730.0,0.25,2005-12-01,2006-01-01,31


# Survival Modeling

In [16]:
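###### This cell would not fully run on my kernel, so the fit is left commented out.
# NOTE(review): `event` takes the values 0, 1 and 2 (see the value counts above), while the
# second argument of fit() is presumably expected to be a binary event indicator; recoding
# it (e.g. df1.event == 1) may be required. TODO confirm which code marks censoring.
#km = KaplanMeierFitter()
#km.fit(df1.event_time, df1.event, label = "Kaplan-Meier Estimate")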

# Hazard Modeling

In [1]:
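#### This cell would not fully run on my kernel, so the fit is left commented out.
# NOTE(review): `event` holds three codes (0, 1, 2); `event_observed` presumably needs a
# binary indicator. TODO confirm the intended event code before enabling this cell.
#hf = NelsonAalenFitter()

#hf.fit(df1.event_time, event_observed = df1.event)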