## Importing Data

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from imblearn.over_sampling import SMOTE  # Oversampling Technique

from functions import * 

In [None]:
# Remove the two identifier columns from the working frame (mutates bank_df in place)
id_columns = ["Loan_ID", "Customer_ID"]
bank_df.drop(columns=id_columns, inplace=True)

## Balancing Dependent Variable
Synthetic Minority Oversampling Technique (SMOTE)

In [None]:


# NOTE(review): X and y are only assigned in the "Training and Testing Sets"
# section further down, so on a fresh kernel this cell raises NameError unless
# it is run after that section.
# NOTE(review): if X_resampled / y_resampled are later split into train and test
# sets, synthetic rows derived from test observations end up in training;
# consider resampling the training split only - confirm intended usage.

# Class balance before resampling
print(y.value_counts())

# Seed SMOTE so the synthetic samples are identical on every run
# (3 matches the random_state passed to train_test_split in this notebook)
smote = SMOTE(random_state=3)

# Oversample the minority class of the previously defined X and y
X_resampled, y_resampled = smote.fit_resample(X, y)

# Class balance after resampling - the counts should now be equal
print(pd.Series(y_resampled).value_counts())

## Dropping Irregular Values

In [4]:
# Rows whose loan amount equals 99,999,999 are treated as irregular;
# this very high value might be standing in for a missing amount (NaN).
is_placeholder_amount = bank_df["Current_Loan_Amount"] == 99_999_999.0
irregular_values = bank_df.index[is_placeholder_amount]

# Remove those rows from the working frame (mutates bank_df in place)
bank_df.drop(index=irregular_values, inplace=True)

## Encoding
Performing one-hot encoding to keep the features interpretable

In [5]:
# Encoder shared by the cells below; drop="first" omits the first category of
# each feature, so that category becomes the implicit baseline and the dummy
# columns of one feature are not perfectly collinear.
ohe = OneHotEncoder(drop="first")

In [6]:
# Split the frame by dtype: object columns are the categorical features,
# everything else is treated as numeric. Each part gets a fresh 0..n-1 index
# so the two halves can be re-joined by position after encoding.
cat_var = bank_df.select_dtypes(include="object").reset_index(drop=True)
num_var = bank_df.select_dtypes(exclude="object").reset_index(drop=True)

In [7]:
# One hot encoding categorical variables.
# Fit on cat_var (the object-dtype columns of bank_df, same rows and order)
# rather than re-reading them from bank_df: bank_df is overwritten at the end
# of this cell, so reading from it would make the cell fail when re-run.
array_to_df = ohe.fit_transform(cat_var).toarray()  # Dense array of the transformed columns

# get_feature_names was removed in scikit-learn 1.2; prefer get_feature_names_out
# and fall back to the old name only on versions that predate it.
if hasattr(ohe, "get_feature_names_out"):
    encoded_columns = ohe.get_feature_names_out(cat_var.columns)
else:
    encoded_columns = ohe.get_feature_names(cat_var.columns)

encoded = pd.DataFrame(array_to_df, columns=encoded_columns)  # Creating a pandas dataframe

# The join below aligns on the index, so both halves must have the same rows
assert len(encoded) == len(num_var), "encoded and numeric frames are misaligned"

# NOTE(review): the saved output shows both 'Purpose_Other' and 'Purpose_other'
# columns - categories that differ only by case are encoded separately;
# confirm whether they should be merged before encoding.
bank_df = num_var.join(encoded, how="left")  # Combining the categorical variables and the numeric variables

In [8]:
# Preview the first rows of the combined numeric + one-hot-encoded frame
bank_df.head()

Unnamed: 0,Current_Loan_Amount,Credit_Score,Annual_Income,Monthly_Debt,Years_of_Credit_History,Months_since_last_delinquent,Number_of_Open_Accounts,Number_of_Credit_Problems,Current_Credit_Balance,Maximum_Open_Credit,...,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation,Purpose_wedding
0,217646.0,730.0,1184194.0,10855.08,19.6,10.0,13.0,1.0,122170.0,272052.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,548746.0,678.0,2559110.0,18660.28,22.6,33.0,4.0,0.0,437171.0,555038.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,234124.0,727.0,693234.0,14211.24,24.7,46.0,10.0,1.0,28291.0,107052.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,666204.0,723.0,1821967.0,17612.24,22.0,34.0,15.0,0.0,813694.0,2004618.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,317108.0,687.0,1133274.0,9632.81,17.4,53.0,4.0,0.0,60287.0,126940.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Checking missing values: total number of NaN cells across every column
# (the first sum is per column, the second adds those up)
bank_df.isna().sum().sum()

0

In [12]:
# (rows, columns) of the frame after encoding
bank_df.shape

(31505, 42)

In [13]:
# List all column names, including the dummy columns produced by the encoder
bank_df.columns

Index(['Current_Loan_Amount', 'Credit_Score', 'Annual_Income', 'Monthly_Debt',
       'Years_of_Credit_History', 'Months_since_last_delinquent',
       'Number_of_Open_Accounts', 'Number_of_Credit_Problems',
       'Current_Credit_Balance', 'Maximum_Open_Credit', 'Bankruptcies',
       'Tax_Liens', 'Loan_Status_Fully Paid', 'Term_Short Term',
       'Years_in_current_job_10+ years', 'Years_in_current_job_2 years',
       'Years_in_current_job_3 years', 'Years_in_current_job_4 years',
       'Years_in_current_job_5 years', 'Years_in_current_job_6 years',
       'Years_in_current_job_7 years', 'Years_in_current_job_8 years',
       'Years_in_current_job_9 years', 'Years_in_current_job_< 1 year',
       'Home_Ownership_Home Mortgage', 'Home_Ownership_Own Home',
       'Home_Ownership_Rent', 'Purpose_Buy House', 'Purpose_Buy a Car',
       'Purpose_Debt Consolidation', 'Purpose_Educational Expenses',
       'Purpose_Home Improvements', 'Purpose_Medical Bills', 'Purpose_Other',
       'Purp

## Training and Testing Sets

In [15]:
# The encoded "Fully Paid" indicator is the target; every other column is a feature
target_col = "Loan_Status_Fully Paid"
X = bank_df.drop(columns=[target_col])
y = bank_df[target_col]

# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

In [16]:
# Shapes of the four split outputs, in order: X_train, X_test, y_train, y_test
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((25204, 41), (6301, 41), (25204,), (6301,))

In [17]:
# Preview the first rows of the training features
# NOTE(review): the saved output shows a Credit_Score of 7350.0 (row 26397),
# roughly ten times the other scores shown - looks like an uncleaned value;
# confirm whether Credit_Score needs cleaning upstream.
X_train.head()

Unnamed: 0,Current_Loan_Amount,Credit_Score,Annual_Income,Monthly_Debt,Years_of_Credit_History,Months_since_last_delinquent,Number_of_Open_Accounts,Number_of_Credit_Problems,Current_Credit_Balance,Maximum_Open_Credit,...,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation,Purpose_wedding
17360,669768.0,734.0,3856240.0,14750.08,32.7,48.0,8.0,0.0,331968.0,591360.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12543,198242.0,734.0,951140.0,22272.37,23.5,17.0,11.0,0.0,367403.0,743754.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14419,291918.0,747.0,799121.0,10921.2,13.7,41.0,12.0,0.0,167523.0,629816.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8131,348018.0,726.0,2254236.0,59173.79,25.0,11.0,12.0,0.0,584953.0,781220.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26397,228448.0,7350.0,1721438.0,40166.95,21.5,34.0,15.0,0.0,278749.0,461120.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# (rows, columns) of the training features - same value as reported with the split shapes earlier
X_train.shape

(25204, 41)