# **<center>Preprocessing</center>**

In [47]:
import os
from dotenv import load_dotenv

import pandas as pd
import numpy as np

import datetime as dt
from tqdm import tqdm

In [48]:
# Load in the dotenv variables
load_dotenv()

# Grab the project path from the dotenv file without the addons.
# The stored value carries extra characters around the usable path; only the
# slice [PATH_START:PATH_END] is kept.
# NOTE(review): these bounds are tied to one specific path length — confirm they
# still match the value stored in the .env file.
PATH_START, PATH_END = 2, 78

raw_project_path = os.getenv('Project_Path')
if raw_project_path is None:
    # Fail with an actionable message instead of "'NoneType' object is not subscriptable"
    raise RuntimeError("Environment variable 'Project_Path' is not set; "
                       "define it in the .env file before running this notebook.")
project_path = raw_project_path[PATH_START:PATH_END]

# Change notebook directory back one so that it can access the data
os.chdir(project_path)

# Show all the columns in the .head() method
pd.set_option('display.max_columns', None)

In [49]:
# Cleaned feature table written by the data-wrangling step
data = pd.read_csv(filepath_or_buffer='./data/interim/wrangled', low_memory=False)

# Original, untransformed loan records
loan = pd.read_csv(filepath_or_buffer='./data/raw/loan.csv', low_memory=False)

# loan_status labels taken from the original dataset
y = loan.loc[:, 'loan_status']

In [50]:
# loan_status values that are flagged as completed
completed_statuses = [
    'Fully Paid',
    'Charged Off',
    'Does not meet the credit policy. Status:Fully Paid',
    'Does not meet the credit policy. Status:Charged Off',
]

# 1 where the raw status is one of the values above, 0 otherwise
data['completed'] = np.where(y.isin(completed_statuses), 1, 0)

# Mirror the flag onto the raw loan frame so both frames can be split the same way
loan['completed'] = data['completed']

In [51]:
# Replace the term strings with numerical values, using a dictionary to be concise.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# loan['term'] acts on an intermediate Series, which pandas warns about and which
# leaves `loan` unchanged once Copy-on-Write is active.
# infer_objects() keeps the column numeric when every value was mapped, without
# relying on replace() to downcast the object column implicitly.
loan['term'] = loan['term'].replace({' 36 months': 3, ' 60 months': 5}).infer_objects()

In [66]:
# Feature frames: keep every column up to and including 'addr_state_VA'
# (label slices are inclusive), split by the completed flag.
# NOTE(review): the head() output shown below starts with an 'Unnamed: 0' column, so
# a saved index appears to be carried along as a feature — confirm that is intended.
finished = data.loc[data['completed'].eq(1), :'addr_state_VA'].copy()
active = data.loc[data['completed'].eq(0), :'addr_state_VA'].copy()

# Raw loan records split the same way, all columns kept
finished_loan = loan[loan['completed'].eq(1)].copy()
active_loan = loan[loan['completed'].eq(0)].copy()

In [53]:
# Preview the first rows of the active-loan feature frame
active.head()

Unnamed: 0,loan_amnt,term,grade,annual_inc,pymnt_plan,inq_last_6mths,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,dti_joint,tot_cur_bal,open_acc_6m,il_util,max_bal_bc,inq_last_12m,emp_type_Accountant,emp_type_Admin,emp_type_Analyst,emp_type_Assistant,emp_type_Clergy,emp_type_Clerk,emp_type_Designer,emp_type_Director,emp_type_Education,emp_type_Executive,emp_type_Healer,emp_type_Manager,emp_type_Operator,emp_type_Technical,emp_type_Vol,home_ownership_OTHER,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,purpose_credit_card,purpose_major_purchase,purpose_other,purpose_small_business,purpose_wedding,addr_state_AL,addr_state_CO,addr_state_DC,addr_state_FL,addr_state_IL,addr_state_KS,addr_state_ME,addr_state_MS,addr_state_ND,addr_state_NE,addr_state_NH,addr_state_NV,addr_state_NY,addr_state_SC,addr_state_TX,addr_state_VA
4,-1.393555,1.52736,0.301854,0.07639,-0.003357,-0.695687,-0.335522,0.484341,-0.047627,1.075293,-0.97077,-0.437216,0.302329,-0.107149,0.576897,-0.023957,-0.843347,-0.103891,-0.139209,-0.116273,-0.088306,-0.109205,-0.104843,-0.157505,-0.147428,-0.039637,-0.077757,-0.052789,-0.196448,-0.186479,-0.180873,-0.245189,-0.435078,-0.169413,2.531605,-0.172517,-0.016276,1.2214,1.301013,-0.698657,-0.55016,-0.140912,4.437084,-0.108777,-0.051496,-0.113061,-0.147149,-0.052423,-0.271536,-0.204067,-0.094934,-0.024331,-0.065744,-0.02324,-0.036428,-0.069732,-0.119254,-0.301818,-0.110158,-0.295217,-0.174612
6,-0.919366,1.52736,-0.468204,-0.433518,-0.003357,0.305877,-0.335522,0.035904,1.28076,-1.204874,-0.97077,-0.415883,0.302329,-0.107149,0.576897,-0.023957,-0.843347,-0.103891,-0.139209,-0.116273,-0.088306,-0.109205,-0.104843,-0.157505,-0.147428,-0.039637,-0.077757,-0.052789,-0.196448,-0.186479,-0.180873,4.078482,-0.435078,-0.169413,-0.395006,-0.172517,-0.016276,1.2214,-0.768632,-0.698657,-0.55016,-0.140912,-0.225373,-0.108777,-0.051496,-0.113061,-0.147149,-0.052423,-0.271536,-0.204067,-0.094934,-0.024331,-0.065744,-0.02324,-0.036428,-0.069732,-0.119254,-0.301818,-0.110158,-0.295217,-0.174612
32,-0.563724,1.52736,-0.468204,-0.709891,-0.003357,0.305877,-0.335522,-0.034012,0.358851,0.484138,-0.97077,-0.400681,0.302329,-0.107149,0.576897,-0.023957,-0.843347,-0.103891,-0.139209,-0.116273,-0.088306,-0.109205,-0.104843,-0.157505,-0.147428,-0.039637,-0.077757,-0.052789,-0.196448,-0.186479,-0.180873,-0.245189,-0.435078,-0.169413,-0.395006,-0.172517,-0.016276,1.2214,-0.768632,1.431317,-0.55016,-0.140912,-0.225373,-0.108777,-0.051496,-0.113061,-0.147149,-0.052423,3.682759,-0.204067,-0.094934,-0.024331,-0.065744,-0.02324,-0.036428,-0.069732,-0.119254,-0.301818,-0.110158,-0.295217,-0.174612
39,-0.267356,1.52736,0.301854,-0.742653,-0.003357,2.309004,-0.335522,-0.302218,-0.211056,-0.107016,-0.97077,-0.392449,0.302329,-0.107149,0.576897,-0.023957,-0.843347,-0.103891,-0.139209,-0.116273,-0.088306,-0.109205,-0.104843,-0.157505,-0.147428,-0.039637,-0.077757,-0.052789,-0.196448,-0.186479,-0.180873,-0.245189,-0.435078,-0.169413,2.531605,-0.172517,-0.016276,1.2214,-0.768632,1.431317,-0.55016,-0.140912,-0.225373,-0.108777,-0.051496,-0.113061,-0.147149,-0.052423,-0.271536,4.900357,-0.094934,-0.024331,-0.065744,-0.02324,-0.036428,-0.069732,-0.119254,-0.301818,-0.110158,-0.295217,-0.174612
61,0.325381,1.52736,-0.930239,-0.541755,-0.003357,0.305877,-0.335522,-0.276312,0.161898,-1.627127,-0.97077,-0.404691,0.302329,-0.107149,0.576897,-0.023957,-0.843347,-0.103891,-0.139209,-0.116273,-0.088306,-0.109205,-0.104843,-0.157505,-0.147428,-0.039637,-0.077757,-0.052789,-0.196448,-0.186479,-0.180873,-0.245189,-0.435078,-0.169413,-0.395006,-0.172517,-0.016276,1.2214,-0.768632,1.431317,-0.55016,-0.140912,-0.225373,9.193151,-0.051496,-0.113061,-0.147149,-0.052423,-0.271536,-0.204067,-0.094934,-0.024331,-0.065744,-0.02324,-0.036428,-0.069732,-0.119254,-0.301818,-0.110158,-0.295217,-0.174612


In [67]:
# int_rate / 100 + 1, stored on the raw completed-loan frame.
# NOTE(review): nothing in this cell reads int_rate_usable; the expected amount below
# multiplies by the raw int_rate column instead — confirm which one was intended.
finished_loan['int_rate_usable'] = 1 + finished_loan['int_rate'] / 100

# Expected amount: funded amount x interest rate x term
expected_total = finished_loan['funded_amnt'] * finished_loan['int_rate'] * finished_loan['term']

# Actual amount: row-wise sum of payments, late fees and recoveries
actual_total = finished_loan[['total_pymnt', 'total_rec_late_fee', 'recoveries']].sum(axis=1)

# Only the ratio is stored on the feature frame; the two intermediate amounts are
# not kept as columns.
finished['frac'] = actual_total / expected_total

In [77]:
# int_rate / 100 + 1, stored on the raw active-loan frame.
# NOTE(review): nothing in this cell reads int_rate_usable; the expected amount below
# multiplies by the raw int_rate column instead — confirm which one was intended.
active_loan['int_rate_usable'] = active_loan['int_rate'] / 100 + 1

# Expected remaining amount: funded amount x interest rate x term, minus what has
# already been paid.
active['$_expected'] = (
    active_loan['funded_amnt'] * active_loan['int_rate'] * active_loan['term']
    - active_loan['total_pymnt']
)

# int_rate_usable is created on active_loan, not on active, so an unconditional drop
# raised KeyError on a fresh run. errors='ignore' still removes the column if it is
# present and is a no-op otherwise; '$_expected' stays on the frame.
active = active.drop(columns='int_rate_usable', errors='ignore')

In [81]:
# Completed loans (features plus the frac column) are written as the training set
finished.to_csv(path_or_buf='./data/processed/train.csv', index=False)

# Active loans are written as the set to predict on
active.to_csv(path_or_buf='./data/processed/predict.csv', index=False)