# Import Libraries and Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re

import numpy as np
from sklearn import linear_model
from functools import reduce
from sklearn.metrics import confusion_matrix

In [None]:
# Relative path to the data directory; the raw parquet inputs loaded below
# are read from its `raw/` subfolder.
DATA_PATH = '../data'

In [None]:
# Load the four raw parquet tables (consumers, accounts, inflows, outflows)
# in one pass; the order of the paths matches the order of the target names.
cons, acct, inflows, outflows = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        f'{DATA_PATH}/raw/q2_consDF_final.pqt',
        f'{DATA_PATH}/raw/q2_acctDF_final.pqt',
        f'{DATA_PATH}/raw/q2_inflows_final.pqt',
        f'{DATA_PATH}/raw/q2_outflows_final.pqt',
    )
)

# Create Features

In [None]:
# Accumulator for per-consumer feature frames; each feature cell appends a
# DataFrame keyed by `prism_consumer_id`, and they are merged together later.
features = []

### Balance Summary per Consumer (min, max, std, and latest balance date)

In [None]:
# Summarise every consumer's account balances: min / max / std of `balance`
# plus the most recent `balance_date`. Missing aggregates are filled with 0.0.
balance_var = (
    acct[['prism_consumer_id', 'balance', 'balance_date']]
    .groupby('prism_consumer_id')
    .agg({'balance': ['min', 'max', 'std'], 'balance_date': ['max']})
    .reset_index()
    .fillna(0.0)
)

# Flatten the (column, statistic) MultiIndex into "<statistic><column>" names,
# e.g. ('balance', 'max') -> 'maxbalance'; the key column has an empty
# statistic and therefore keeps its name.
balance_var.columns = [
    stat + column for column, stat in balance_var.columns.to_flat_index()
]
features.append(balance_var)
balance_var.head()

### Disposable Income

In [None]:
# Attach each consumer's evaluation_date to every transaction so that only
# activity posted on or before that date contributes to the feature.
cons_eval_dates = cons.sort_values('evaluation_date')[['prism_consumer_id', 'evaluation_date']]
outflow_merged = pd.merge(outflows, cons_eval_dates, on="prism_consumer_id", how="left")
inflow_merged = pd.merge(inflows, cons_eval_dates, on="prism_consumer_id", how="left")

# Flip the sign of outflows so that summing the combined frame nets them
# against inflows (assumes outflow amounts are stored as positive values --
# TODO confirm against the raw data).
outflow_merged['amount'] = outflow_merged['amount'] * -1
total_balance = pd.concat([inflow_merged, outflow_merged])

# Rows without a matching evaluation_date compare as False and are dropped.
total_balance = total_balance[total_balance['posted_date'] <= total_balance['evaluation_date']]

total_balance = total_balance[[
        'prism_consumer_id',
        'amount',
        'posted_date',
]].groupby('prism_consumer_id').agg({'amount': ['sum'], 'posted_date': ['min', 'max']})

# Length of the observed transaction history, in years.
total_balance['date_range'] = pd.to_timedelta(total_balance['posted_date', 'max'] - total_balance['posted_date', 'min']).dt.days / 365

# Net cash flow per year of history, plus the date of the latest transaction used.
total_balance = pd.DataFrame({
    'total_balance': total_balance['amount', 'sum'] / total_balance['date_range'],
    'total_balance_date': total_balance['posted_date', 'max']
}).reset_index()

# A consumer whose first and last transaction fall on the same day has a
# zero-length date_range, so the division above yields +/-inf. Neutralise it
# the same way the num_monthly_purchase feature does; otherwise the infinite
# value survives the later fillna and reaches the model.
total_balance['total_balance'] = total_balance['total_balance'].replace([np.inf, -np.inf], 0.0)

features.append(total_balance)
total_balance.head()

In [None]:
# Keep only outflows posted on or before the consumer's evaluation date.
outflow_valid = outflow_merged.loc[
    lambda txns: txns['posted_date'] <= txns['evaluation_date']
]

# Per consumer: number of outflow rows and the first/last posting dates.
outflow_counts = (
    outflow_valid
    .groupby('prism_consumer_id')
    .agg({'amount': ['count'], 'posted_date': ['min', 'max']})
    .reset_index()
)

# Span between first and last outflow, converted from days to months.
outflow_counts['date_range'] = (
    pd.to_timedelta(
        outflow_counts['posted_date', 'max'] - outflow_counts['posted_date', 'min']
    ).dt.days / 365 * 12
)

# Average number of outflow transactions per month of observed history.
outflow_counts = pd.DataFrame({
    'prism_consumer_id': outflow_counts['prism_consumer_id'],
    'num_monthly_purchase': outflow_counts['amount', 'count'].div(outflow_counts['date_range']),
})

# A zero-length span makes the ratio infinite; treat that as 0.0.
outflow_counts = outflow_counts.replace([np.inf, -np.inf], 0.0)
features.append(outflow_counts)
outflow_counts.head()

**Savings Feature - count of how many times a consumer has transferred money in from a savings account**

In [None]:
# Inflows categorised as self-transfers whose memo mentions "Savings".
transfer_from_savings = inflows[inflows['category_description'] == 'SELF_TRANSFER']
# na=False: a missing memo counts as "no match" instead of producing a NaN
# entry in the mask, which pandas refuses to use for boolean indexing.
transfer_from_savings = transfer_from_savings[
    transfer_from_savings['memo_clean'].str.contains('Savings', na=False)
]

# Non-null count of every column per consumer; the `memo_clean` count is used
# below as the number of savings transfers.
count_tfs = transfer_from_savings.groupby('prism_consumer_id').count().reset_index()

# One row per consumer that has any inflow, in order of first appearance;
# consumers without a savings transfer get a count of 0. De-duplicating the
# ids before the join avoids merging once per transaction row.
inflow_ids = (
    inflows[['prism_consumer_id']]
    .drop_duplicates(subset=['prism_consumer_id'])
    .merge(count_tfs[['prism_consumer_id', 'memo_clean']], on='prism_consumer_id', how='left')
    .fillna(0)
)

# NOTE(review): this frame is not appended to `features` in this cell, so the
# merged feature matrix does not appear to use it -- confirm whether that is
# intentional.
inflow_ids

### Merge all features into feature matrix

In [None]:
# Outer-join every feature frame on the consumer id so that a consumer who is
# missing from one feature still keeps a row.
feature_df = reduce(
    lambda left, right: pd.merge(left, right, on='prism_consumer_id', how='outer'),
    features
)

# Select the per-feature date columns by name. Matching on the column labels
# themselves (instead of a regex over the stringified column list) cannot
# return a substring that is not an actual column.
feature_dates = [col for col in feature_df.columns if str(col).endswith('_date')]

# Latest date any feature was derived from; checked against evaluation_date
# before training.
feature_df['feature_date'] = feature_df[feature_dates].max(axis=1)
feature_df = feature_df.drop(columns=feature_dates)

# Fill gaps left by the outer join with 0.0, but leave `feature_date` alone:
# writing a float into a datetime column would turn it into a mixed-type
# column and break the later date comparison.
feature_df = feature_df.fillna(
    {col: 0.0 for col in feature_df.columns if col != 'feature_date'}
)
feature_df

# Prototype Model to Predict Default Risk

In [None]:
# Baseline classifier: logistic regression constructed with no arguments,
# i.e. the library's default settings.
model = linear_model.LogisticRegression()

In [None]:
# Order consumers by evaluation date so the positional split below trains on
# the earlier rows and tests on the later ones.
sorted_cons = cons.sort_values('evaluation_date')

dropped_cols = ['prism_consumer_id', 'evaluation_date', 'feature_date']
feature_matrix = pd.merge(sorted_cons, feature_df, on='prism_consumer_id', how='left')

# Make sure no invalid training data is pulled
leaks_future_data = feature_matrix['evaluation_date'] < feature_matrix['feature_date']
assert not leaks_future_data.any(), "Features pulled from dates after evaluation_date"
feature_matrix.drop(dropped_cols, axis=1, inplace=True)

# The left join leaves NaN in the engineered columns for consumers that have
# no feature rows at all. Fill those with 0.0, matching how gaps are filled
# when the features are merged, so the estimator is not handed NaN inputs.
# Only the engineered columns are touched; the label column is left as-is.
engineered_cols = [
    col for col in feature_df.columns
    if col not in dropped_cols and col in feature_matrix.columns
]
feature_matrix = feature_matrix.fillna({col: 0.0 for col in engineered_cols})

# Train Test Split
TRAIN_SPLIT = 0.75
split_at = int(feature_matrix.shape[0] * TRAIN_SPLIT)
train = feature_matrix.iloc[:split_at].drop("APPROVED", axis=1)
test = feature_matrix.iloc[split_at:].drop("APPROVED", axis=1)

# The first remaining column is taken as the label and everything after it as
# the inputs (assumes the target is the first column left once the id, date
# and APPROVED columns are removed -- TODO confirm against the cons schema).
X_train = train.iloc[:, 1:]
y_train = train.iloc[:, 0]
X_test = test.iloc[:, 1:]
y_test = test.iloc[:, 0]

In [None]:
# Fit the classifier on the training portion of the split.
model.fit(X_train, y_train)

In [None]:
# Training split: predicted labels, confusion matrix, and its four cells.
# For a two-class target the flattened 2x2 matrix reads tn, fp, fn, tp.
train_pred = model.predict(X_train)
cm_train = confusion_matrix(y_train, train_pred)
tn_train, fp_train, fn_train, tp_train = cm_train.flatten()

# Test split: same steps on the held-out rows.
test_pred = model.predict(X_test)
cm_test = confusion_matrix(y_test, test_pred)
tn_test, fp_test, fn_test, tp_test = cm_test.flatten()

In [None]:
# Report accuracy from the stored predictions rather than calling
# model.predict again, so the accuracy figures and the confusion-matrix
# counts printed below are derived from the same predictions.
print(f"Training Accuracy: {np.mean(train_pred == y_train):.4f}")
print(f"Testing Accuracy: {np.mean(test_pred == y_test):.4f}")
print("Coefficients: \n", model.coef_, "\n")

# Confusion-matrix cells for each split.
print(f"tn_train: {tn_train}, fp_train: {fp_train}, fn_train: {fn_train}, tp_train: {tp_train}")
print(f"tn_test:  {tn_test},  fp_test:  {fp_test},  fn_test:  {fn_test}, tp_test:  {tp_test}")