In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
import time
import warnings
warnings.filterwarnings("ignore")

### Importing Data (and timing how long it takes):

In [2]:
# Read the cleaned loan data, discard the unnamed leading column
# ('Unnamed: 0', presumably an index written by an earlier export), and
# report how long the load took.
load_started = time.time()
df = pd.read_csv('lending-club-loan-data/df_clean.csv').drop(columns='Unnamed: 0')
load_elapsed = time.time() - load_started
print('Cell Runtime: {}'.format(load_elapsed))

Cell Runtime: 21.775784015655518


### Defining X and y:

In [3]:
# Feature frame: every column except the target. Target vector: `loan_status`.
X = df.drop(columns=['loan_status'])
y = df.loc[:, 'loan_status']

### Scaling and Encoding: 

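### Categorical columns are one-hot encoded with scikit-learn's OneHotEncoder, the ordered columns (term, grade, sub_grade, earliest_cr_line) are encoded with OrdinalEncoder, and the numeric and ordinal-encoded columns are then scaled with RobustScaler (chosen because of the presence of many outliers):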

In [4]:
# Encode and scale the feature frame `X`:
#   * non-numeric columns other than `ord_columns` -> one-hot indicators (kept sparse),
#   * `ord_columns`                                -> integer codes via OrdinalEncoder,
#   * numeric columns + ordinal codes              -> RobustScaler.
# The result, `X_tran`, holds the scaled numeric and ordinal columns first,
# followed by the one-hot columns.
#
# NOTE(review): the encoders and the scaler are fit on every row of `X`. If a
# train/test split is made later, consider fitting them on the training rows only.
# NOTE(review): OrdinalEncoder with its default categories='auto' ranks categories
# by their sorted values; confirm that this matches the intended order of each
# column in `ord_columns` (in particular `earliest_cr_line`).
t0 = time.time()
ohe = OneHotEncoder()
ordenc = OrdinalEncoder()
rs = RobustScaler()

# Split the feature columns by dtype. `X` already excludes the target, so nothing
# has to be dropped from (and mutated in) a `select_dtypes` result.
df_obj = X.select_dtypes(exclude='number')
df_num = X.select_dtypes(include='number')

numeric_columns = list(df_num.columns)
ord_columns = ['term', 'grade', 'sub_grade', 'earliest_cr_line']
cat_columns = list(df_obj.drop(columns=ord_columns).columns)
numeric_ord_columns = numeric_columns + ord_columns

# sparse_threshold=1.0 keeps the stacked output sparse whenever the one-hot block
# contains any zeros, so the sparse DataFrame built below does not depend on how
# dense the encoded data happens to be (the default threshold is 0.3).
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', ohe, cat_columns),
        ('ord', ordenc, ord_columns)],
    sparse_threshold=1.0)

# Output layout: all one-hot columns first, then one column per entry of `ord_columns`.
encoded = preprocessor.fit_transform(X)
n_cat_features = encoded.shape[1] - len(ord_columns)

# Name each one-hot column after its category value. A value that occurs in more
# than one categorical column, or that equals a numeric/ordinal column name, is
# prefixed with its source column so that every label stays unique.
categories = preprocessor.named_transformers_['cat'].categories_
raw_names = [val for col_cats in categories for val in col_cats]
owners = [col for col, col_cats in zip(cat_columns, categories) for _ in col_cats]
raw_index = pd.Index(raw_names)
clashes = raw_index.duplicated(keep=False) | raw_index.isin(numeric_ord_columns)
cat_names = [
    '{}_{}'.format(col, val) if clash else val
    for col, val, clash in zip(owners, raw_names, clashes)]
column_names = cat_names + ord_columns

# Split the encoded matrix by position rather than by label, so the split is
# correct whatever the column labels are and however long `ord_columns` is.
X_cat = pd.DataFrame.sparse.from_spmatrix(
    encoded[:, :n_cat_features], index=X.index, columns=cat_names)
cols_to_drop = list(X_cat.columns)  # kept for any later cell that refers to it
X_ord = pd.DataFrame(
    encoded[:, n_cat_features:].toarray(), index=X.index, columns=ord_columns)

# Scale the numeric and ordinal-encoded columns together. Every frame carries
# `X.index`, so the column-wise concats align row for row.
X_num_ord = pd.concat([df_num, X_ord], axis=1)
X_tran = pd.DataFrame(
    rs.fit_transform(X_num_ord), index=X.index, columns=numeric_ord_columns)
X_tran = pd.concat([X_tran, X_cat], axis=1)

assert X_tran.shape == (len(X), len(numeric_ord_columns) + len(cat_names)), \
    'unexpected shape after encoding and scaling'
t1 = time.time()
print('Cell Runtime: {}'.format(t1-t0))

Cell Runtime: 178.07948112487793


In [8]:
# Place the target first, followed by the transformed features.
df_modeling = pd.concat(objs=[y, X_tran], axis='columns')

In [9]:
# Write the modeling frame to CSV and report how long the write took.
write_started = time.time()
df_modeling.to_csv('lending-club-loan-data/df_modeling.csv')
write_elapsed = time.time() - write_started
print('Cell Runtime: {}'.format(write_elapsed))

Cell Runtime: 690.744145154953
