# Import Libraries and Data

In [172]:
import pandas as pd
import matplotlib.pyplot as plt
import re

import numpy as np
from sklearn import linear_model
from functools import reduce
from sklearn.metrics import confusion_matrix

In [173]:
# Root data directory (relative to the notebook); the raw parquet inputs are read from its `raw/` subfolder.
DATA_PATH = '../data'

In [174]:
# Load the four raw tables in one pass: consumers, accounts, inflow and outflow transactions.
cons, acct, inflows, outflows = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        f'{DATA_PATH}/raw/q2_consDF_final.pqt',
        f'{DATA_PATH}/raw/q2_acctDF_final.pqt',
        f'{DATA_PATH}/raw/q2_inflows_final.pqt',
        f'{DATA_PATH}/raw/q2_outflows_final.pqt',
    )
)

# Create Features

### Balance Statistics (Min / Max / Std) at Time of Evaluation

In [175]:
# Per-consumer balance statistics computed over every `acct` row for that consumer.
# Named aggregation produces the flat column names directly
# (minbalance, maxbalance, stdbalance, maxbalance_date), so no MultiIndex
# flattening step is needed.
balance_var = (
    acct.groupby('prism_consumer_id')
    .agg(
        minbalance=('balance', 'min'),
        maxbalance=('balance', 'max'),
        stdbalance=('balance', 'std'),
        maxbalance_date=('balance_date', 'max'),
    )
    .reset_index()
)

# The sample std is NaN when a consumer has a single balance row; treat that as
# zero spread. Only the numeric statistics are filled: a frame-wide fillna(0.0)
# would also turn a missing balance date into the float 0.0 and corrupt the
# date comparisons performed later in the notebook.
balance_stat_cols = ['minbalance', 'maxbalance', 'stdbalance']
balance_var[balance_stat_cols] = balance_var[balance_stat_cols].fillna(0.0)

balance_var.head()


Unnamed: 0,prism_consumer_id,minbalance,maxbalance,stdbalance,maxbalance_date
0,0,6182.6,9907.23,2633.71113,2023-04-13
1,2,8079.43,17426.83,6609.609926,2022-02-15
2,4,0.0,4871.39,2644.903088,2021-08-13
3,7,517.95,7071.18,3766.231217,2021-08-08
4,9,13348.76,13348.76,0.0,2023-04-19


### Disposable Income (Annualized Net Cash Flow: Inflows − Outflows per Year)

In [176]:
# Attach each consumer's evaluation_date to every transaction so that only
# activity posted on or before that date contributes to the feature.
cons_eval_dates = cons.sort_values('evaluation_date')[['prism_consumer_id', 'evaluation_date']]
outflow_merged = pd.merge(outflows, cons_eval_dates, on="prism_consumer_id", how="left")
inflow_merged = pd.merge(inflows, cons_eval_dates, on="prism_consumer_id", how="left")

# Flip the sign of outflows so that summing the combined ledger yields net cash flow.
outflow_merged['amount'] = outflow_merged['amount'] * -1
ledger = pd.concat([inflow_merged, outflow_merged])
ledger = ledger[ledger['posted_date'] <= ledger['evaluation_date']]

# Net amount and first/last posting date per consumer.
flow_stats = ledger.groupby('prism_consumer_id').agg(
    net_amount=('amount', 'sum'),
    first_posted=('posted_date', 'min'),
    last_posted=('posted_date', 'max'),
)

# Length of the observed history in years (365-day convention).
years_observed = pd.to_timedelta(
    flow_stats['last_posted'] - flow_stats['first_posted']
).dt.days / 365
# A consumer whose transactions all fall on one day has a zero-length window.
# Dividing by it would give +/-inf, which a later fillna does not catch and
# which scikit-learn estimators reject. Mark the rate as missing instead.
years_observed = years_observed.replace(0, np.nan)

# Annualised net cash flow plus the latest transaction date that fed into it.
total_balance = pd.DataFrame({
    'total_balance': flow_stats['net_amount'] / years_observed,
    'total_balance_date': flow_stats['last_posted'],
}).reset_index()
total_balance.head()

Unnamed: 0,prism_consumer_id,total_balance,total_balance_date
0,0,15154.285773,2023-04-12
1,2,1546.065942,2022-02-14
2,4,3377.081389,2021-08-12
3,7,7311.445429,2021-08-06
4,9,5873.815289,2023-04-18


### Merge all features into feature matrix

In [177]:
# Outer-join every per-consumer feature table on the consumer id so a consumer
# present in only one table is still kept.
features = [balance_var, total_balance]
feature_df = reduce(
    lambda left, right: pd.merge(left, right, on='prism_consumer_id', how='outer'),
    features
)

# Collapse the per-feature "*_date" columns into one `feature_date`: the latest
# date any feature was derived from. Column names are inspected directly rather
# than regex-matching the stringified column list, which could return partial
# names (e.g. "x_date" out of "x_date_y") and then fail on lookup.
feature_dates = [col for col in feature_df.columns if col.endswith('_date')]
feature_df['feature_date'] = feature_df[feature_dates].max(axis=1)
feature_df = feature_df.drop(columns=feature_dates)

# Missing feature values (consumer absent from one of the tables) become 0.0.
# The id and the date column are excluded so a missing date is not replaced by
# the float 0.0.
fill_values = {
    col: 0.0
    for col in feature_df.columns
    if col not in ('prism_consumer_id', 'feature_date')
}
feature_df = feature_df.fillna(fill_values)
feature_df

Unnamed: 0,prism_consumer_id,minbalance,maxbalance,stdbalance,total_balance,feature_date
0,0,6182.60,9907.23,2633.711130,15154.285773,2023-04-13
1,2,8079.43,17426.83,6609.609926,1546.065942,2022-02-15
2,4,0.00,4871.39,2644.903088,3377.081389,2021-08-13
3,7,517.95,7071.18,3766.231217,7311.445429,2021-08-08
4,9,13348.76,13348.76,0.000000,5873.815289,2023-04-19
...,...,...,...,...,...,...
2973,5941,2058.81,2058.81,0.000000,-1816.084426,2023-01-06
2974,5943,2431.44,2431.44,0.000000,4121.415493,2023-02-03
2975,5944,45.74,45.74,0.000000,-765.389130,2023-02-01
2976,5945,59.88,59.88,0.000000,-1557.370000,2023-02-02


# Prototype Model to Predict Default Risk

In [178]:
# Baseline classifier: logistic regression with scikit-learn's default settings (no arguments overridden).
model = linear_model.LogisticRegression()

In [179]:
# Consumers in chronological order of evaluation; the positional split below
# therefore trains on the earlier rows and tests on the later ones.
sorted_cons = cons.sort_values('evaluation_date')
dropped_cols = ['prism_consumer_id', 'evaluation_date', 'feature_date']

feature_matrix = sorted_cons.merge(feature_df, on='prism_consumer_id', how='left')

# Make sure no invalid training data is pulled
uses_future_data = feature_matrix['evaluation_date'] < feature_matrix['feature_date']
assert np.mean(uses_future_data) == 0, "Features pulled from dates after evaluation_date"
feature_matrix = feature_matrix.drop(columns=dropped_cols)

# Train Test Split
TRAIN_SPLIT = 0.75
split_at = int(len(feature_matrix) * TRAIN_SPLIT)
train = feature_matrix.iloc[:split_at].drop(columns="APPROVED")
test = feature_matrix.iloc[split_at:].drop(columns="APPROVED")

# The first remaining column is used as the label and the rest as predictors.
# NOTE(review): this relies on the column order of `cons` — confirm the target
# is the first column left after the drops above.
X_train, y_train = train.iloc[:, 1:], train.iloc[:, 0]
X_test, y_test = test.iloc[:, 1:], test.iloc[:, 0]

In [180]:
# Fit the classifier on the training portion (X_train / y_train from the split cell).
model.fit(X_train, y_train)

In [181]:
# Hard class predictions for both splits.
train_pred, test_pred = model.predict(X_train), model.predict(X_test)

# Confusion matrices: rows are true labels, columns are predicted labels.
cm_train = confusion_matrix(y_train, train_pred)
cm_test = confusion_matrix(y_test, test_pred)

# For a binary problem the matrix is [[tn, fp], [fn, tp]]; unpack it row by row.
(tn_train, fp_train), (fn_train, tp_train) = cm_train
(tn_test, fp_test), (fn_test, tp_test) = cm_test

In [182]:
# Report accuracy from the predictions already computed for the confusion
# matrices (train_pred / test_pred) instead of running model.predict again,
# so both sets of numbers are guaranteed to describe the same predictions.
# NOTE(review): the confusion counts show the model predicting the negative
# class for almost every row, so accuracy alone is not a reliable quality signal.
print(f"Training Accuracy: {np.mean(train_pred == y_train):.4f}")
print(f"Testing Accuracy: {np.mean(test_pred == y_test):.4f}")
print("Coefficients: \n", model.coef_, "\n")

print(f"tn_train: {tn_train}, fp_train: {fp_train}, fn_train: {fn_train}, tp_train: {tp_train}")
print(f"tn_test:  {tn_test},  fp_test:  {fp_test},  fn_test:  {fn_test}, tp_test:  {tp_test}")

Training Accuracy: 0.8236
Testing Accuracy: 0.7597
Coefficients: 
 [[-4.20768713e-03  3.40624629e-03 -1.62264953e-02  3.04907013e-05]] 

tn_train: 1838, fp_train: 10, fn_train: 384, tp_train: 1
tn_test:  564,  fp_test:  5,  fn_test:  174, tp_test:  2
