In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [None]:
# Separate the predictors from the Loan_Status target column
target_column = 'Loan_Status'
X = data_scaled.drop(columns=target_column)  # every column except the target
y = data_scaled[target_column]               # target variable

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified, so the class ratio may differ
# between the train and test sets -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Oversample with SMOTE on the training split only; X_test / y_test are left untouched.
# 'minority' asks SMOTE to resample just the minority class.
smote = SMOTE(random_state=0, sampling_strategy='minority')
resampled_split = smote.fit_resample(X_train, y_train)

# Replace the training split with its resampled version
X_train, y_train = resampled_split

In [None]:
# Class balance per split: label 1 is reported as approved, label 0 as rejected
train_approved = np.count_nonzero(y_train == 1)
train_rejected = np.count_nonzero(y_train == 0)
test_approved = np.count_nonzero(y_test == 1)
test_rejected = np.count_nonzero(y_test == 0)

print("Train set Approved loan: ", train_approved)
print("Train set Rejected loan: ", train_rejected)
print("Test set Approved loan: ", test_approved)
print("Test set Rejected loan: ", test_rejected)

In [None]:
# Number of target rows in each split
train_size = len(y_train)
test_size_rows = len(y_test)

print("Train set: ", train_size)
print("Test set: ", test_size_rows)

### Applying a log transformation to reduce skewness in the data

In [None]:
# Snapshot train_data before it is transformed, so the original values
# remain available for the before/after histograms
train_data_old = train_data.copy(deep=True)

In [None]:
# Columns that receive the log1p (log(1 + x)) transform in both frames
log_columns = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount"]

# NOTE(review): the frames are overwritten in place, so running this cell a
# second time applies log1p again on already-transformed values.
for frame in (train_data, test_data):
    for column in log_columns:
        frame[column] = np.log1p(frame[column])

In [None]:
# ApplicantIncome side by side: the untouched copy (left) vs. the transformed frame (right)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# One entry per panel, in left-to-right order: (source frame, bar colour, panel title)
panels = [
    (train_data_old, 'blue', 'Original ApplicantIncome'),
    (train_data, 'green', 'Log Transformed ApplicantIncome'),
]
for axis, (frame, bar_color, panel_title) in zip(ax, panels):
    axis.hist(frame['ApplicantIncome'], bins=30, color=bar_color, alpha=0.7)
    axis.set_title(panel_title)

plt.show()

In [None]:
# CoapplicantIncome side by side: the untouched copy (left) vs. the transformed frame (right)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# One entry per panel, in left-to-right order: (source frame, bar colour, panel title)
panels = [
    (train_data_old, 'blue', 'Original CoapplicantIncome'),
    (train_data, 'green', 'Log Transformed CoapplicantIncome'),
]
for axis, (frame, bar_color, panel_title) in zip(ax, panels):
    axis.hist(frame['CoapplicantIncome'], bins=30, color=bar_color, alpha=0.7)
    axis.set_title(panel_title)

plt.show()

In [None]:
# LoanAmount side by side: the untouched copy (left) vs. the transformed frame (right)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# One entry per panel, in left-to-right order: (source frame, bar colour, panel title)
panels = [
    (train_data_old, 'blue', 'Original LoanAmount'),
    (train_data, 'green', 'Log Transformed LoanAmount'),
]
for axis, (frame, bar_color, panel_title) in zip(ax, panels):
    axis.hist(frame['LoanAmount'], bins=30, color=bar_color, alpha=0.7)
    axis.set_title(panel_title)

plt.show()

In [None]:
# Min-max scale every column of train_data, fitting the scaler on that same frame.
# NOTE(review): the scaler is fitted on all rows (and on the Loan_Status column
# too) before any train/test split is made from data_scaled -- confirm that
# this ordering is intended.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(train_data)

# Wrap the scaled values in a DataFrame that carries the original column names
data_scaled = pd.DataFrame(data=scaled_values, columns=train_data.columns)

data_scaled.head()

In [None]:
# Count rows per class in the scaled frame (1.0 is reported as approved, 0.0 as rejected)
loan_status = data_scaled["Loan_Status"]
approved_count = int((loan_status == 1.).sum())
rejected_count = int((loan_status == 0.).sum())

print("Approved loan: ", approved_count)
print("Rejected loan: ", rejected_count)

In [None]:
# Count how many rows fall into each Loan_Status class
loan_status_counts = data_scaled['Loan_Status'].value_counts()

# Loan_Status comes from the scaled frame, so its values are the numeric codes
# 1.0 / 0.0 rather than Y / N. Translate them into readable wedge labels
# (1 = approved, 0 = rejected, the same meaning used when the class counts
# are printed); any unexpected code falls back to its string form.
status_names = {1.0: 'Approved', 0.0: 'Rejected'}
labels = [status_names.get(code, str(code)) for code in loan_status_counts.index]
sizes = loan_status_counts.values  # Count of each category, most frequent first

# Plot the pie chart
plt.figure(figsize=(6, 6))  # Square figure so the pie is not distorted
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=['lightblue', 'lightcoral'])
plt.title('Loan Status Distribution')
plt.show()

The dataset is heavily imbalanced: most of the records are approved loans. According to the paper, the author uses two data augmentation techniques: SMOTE, and a second method that involves training a simple machine learning model on the available data. After that, they used user-selected data that closely resembles the available data to evaluate the model and predict the corresponding class labels.

However, since the author did not mention which machine learning model they trained on the available data or how much user-selected data they used, we will only use SMOTE to augment the data, applied after splitting the data.