# Lending Club - Loan Approval Process Optimization

# 3. Preprocessing

## 3.1 Imports

In [1]:
# Import the libraries necessary for the current task
import pandas as pd
import numpy as np
from numpy import nan
import matplotlib.pyplot as plt
import seaborn as sns
import os

# pandas display configuration.
# Use the fully-qualified "display.*" keys: the short forms "max_rows" and
# "max_columns" are matched as patterns and became ambiguous once pandas added
# "styler.render.max_rows" / "styler.render.max_columns", raising OptionError.
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("display.max_colwidth", None)

## 3.2 Data

In [2]:
# Load the CSV data (lending_club_loans_EDA.csv).
# The data directory defaults to the original local location, but can be
# overridden with the LENDING_CLUB_DATA_DIR environment variable so the
# notebook can be run on another machine without editing this cell.
data_path = os.environ.get(
    "LENDING_CLUB_DATA_DIR",
    r"C:\Users\lastr\Desktop\GitHub\Lending_Club_Capstone\data",
)
LC_data = pd.read_csv(os.path.join(data_path, "lending_club_loans_EDA.csv"))

In [3]:
# Summary of the loaded data: column names, non-null counts and dtypes
LC_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39239 entries, 0 to 39238
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   loan_amnt             39239 non-null  float64
 1   term                  39239 non-null  object 
 2   installment           39239 non-null  float64
 3   grade                 39239 non-null  int64  
 4   emp_length            39239 non-null  int64  
 5   home_ownership        39239 non-null  object 
 6   annual_inc            39239 non-null  float64
 7   verification_status   39239 non-null  object 
 8   loan_type             39239 non-null  int64  
 9   purpose               39239 non-null  object 
 10  addr_state            39239 non-null  object 
 11  dti                   39239 non-null  float64
 12  delinq_2yrs           39239 non-null  float64
 13  earliest_cr_line      39239 non-null  object 
 14  inq_last_6mths        39239 non-null  float64
 15  open_acc           

## 3.3 One-Hot Encoding

In [4]:
# Create dummy variables for nominal features: term, home_ownership, verification_status, purpose
nominal_features = ["term", "home_ownership", "verification_status", "purpose"]

# Only encode the nominal columns that are still present. After the first run
# they have been replaced by their dummies, so re-running this cell is a no-op
# instead of raising a KeyError.
features_to_encode = [col for col in nominal_features if col in LC_data.columns]

if features_to_encode:
    # Pin the dtype: pandas >= 2.0 defaults to bool, which would write
    # True/False to the CSV instead of the 0/1 indicators used here.
    dummies_df = pd.get_dummies(LC_data[features_to_encode], dtype="uint8")
    # Keep the remaining columns in their original order and append the dummies.
    LC_data = pd.concat([LC_data.drop(columns=features_to_encode), dummies_df], axis=1)

In [5]:
# Preview the first five rows of the encoded data frame
LC_data.head(n=5)

Unnamed: 0,loan_amnt,installment,grade,emp_length,annual_inc,loan_type,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,last_credit_pull_d,pub_rec_bankruptcies,fico_range_avg,term_ 36 months,term_ 60 months,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
0,5000.0,162.87,1,10,24000.0,1,AZ,27.65,0.0,Jan-1985,1.0,3.0,0.0,13648.0,83.7,9.0,Sep-2016,0.0,737.0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,2500.0,59.83,2,0,30000.0,0,GA,1.0,0.0,Apr-1999,5.0,3.0,0.0,1687.0,9.4,4.0,Sep-2016,0.0,742.0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2400.0,84.33,2,10,12252.0,1,IL,8.72,0.0,Nov-2001,2.0,2.0,0.0,2956.0,98.5,10.0,Sep-2016,0.0,737.0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,10000.0,339.31,2,10,49200.0,1,CA,20.0,0.0,Feb-1996,1.0,10.0,0.0,5598.0,21.0,37.0,Apr-2016,0.0,692.0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,5000.0,156.46,0,3,36000.0,1,AZ,11.2,0.0,Nov-2004,3.0,9.0,0.0,7963.0,28.3,12.0,Jan-2016,0.0,732.0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [6]:
# Summary of the modified data frame: column names, non-null counts and dtypes
LC_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39239 entries, 0 to 39238
Data columns (total 43 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   loan_amnt                            39239 non-null  float64
 1   installment                          39239 non-null  float64
 2   grade                                39239 non-null  int64  
 3   emp_length                           39239 non-null  int64  
 4   annual_inc                           39239 non-null  float64
 5   loan_type                            39239 non-null  int64  
 6   addr_state                           39239 non-null  object 
 7   dti                                  39239 non-null  float64
 8   delinq_2yrs                          39239 non-null  float64
 9   earliest_cr_line                     39239 non-null  object 
 10  inq_last_6mths                       39239 non-null  float64
 11  open_acc                    

## 3.4 Save Data

In [7]:
# Directory for the output file. Defaults to the original local location, but
# can be overridden with the LENDING_CLUB_DATA_DIR environment variable.
data_path = os.environ.get(
    "LENDING_CLUB_DATA_DIR",
    r"C:\Users\lastr\Desktop\GitHub\Lending_Club_Capstone\data",
)
data_path_lending_club_loans = os.path.join(data_path, "lending_club_loans_preprocessed.csv")

# Never overwrite an existing file, but say so explicitly: a silent skip hides
# the fact that the CSV on disk may be stale relative to the current LC_data.
if not os.path.exists(data_path_lending_club_loans):
    LC_data.to_csv(data_path_lending_club_loans, index=False)
    print(f"Saved preprocessed data to {data_path_lending_club_loans}")
else:
    print(f"File already exists, not overwritten: {data_path_lending_club_loans}")