In [13]:
from scipy.io import arff
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import LabelEncoder


## Basic Data Exploration

In [23]:
# Read the loan-10k CSV into a DataFrame; later cells work on `data`.
data = pd.read_csv('./datasets_kaggle/loan-10k/loan-10k.lrn.csv')

# Count missing entries per column, then keep only the columns that
# actually have at least one missing entry.
missing_values = data.isna().sum()
missing_columns = missing_values.loc[missing_values.gt(0)]

# Header line followed by the (possibly empty) Series of affected columns.
print("Columns with missing values:", missing_columns, sep="\n")

# Summarise low-cardinality columns: pair every column name with its number
# of distinct values and keep only those with fewer than 500.
distinct_counts = ((name, data[name].nunique()) for name in data.columns)
columns_info = [pair for pair in distinct_counts if pair[1] < 500]

# Tabulate the pairs and order them from fewest to most distinct values.
columns_info_df = pd.DataFrame(
    columns_info,
    columns=['Column', 'Num_Unique_Values'],
).sort_values(by='Num_Unique_Values')

print(columns_info_df)


Columns with missing values:
Series([], dtype: int64)
                  Column  Num_Unique_Values
21           policy_code                  1
0                   term                  2
22      application_type                  2
43      num_tl_120dpd_2m                  2
16   initial_list_status                  2
..                   ...                ...
31  mths_since_recent_bc                197
47        pct_tl_nvr_dlq                291
17    total_rec_late_fee                299
27    mo_sin_old_il_acct                326
1               int_rate                338

[63 rows x 2 columns]


## Preprocessing

In [14]:
# Label-encode every string-valued column that has exactly two distinct
# values, replacing the two categories with 0/1 in `data` (in place).
# Numeric columns are skipped even if they only hold two distinct values.
#
# The fitted encoder for each converted column is kept in `label_encoders`
# (column name -> LabelEncoder) so the mapping is not lost: it can be
# inspected via `.classes_`, undone via `.inverse_transform`, or re-applied
# to another split via `.transform`.
# NOTE: once a column is encoded it is no longer string-valued, so re-running
# this cell without reloading `data` rebuilds `label_encoders` as an empty dict.
label_encoders = {}
for column in data.columns:
    # Checking the dtype (not the values) accepts both the legacy `object`
    # dtype and pandas' dedicated string dtypes, which `dtype == 'object'`
    # would silently skip.
    if pd.api.types.is_string_dtype(data[column].dtype):
        unique_values = data[column].nunique()  # distinct values; NaN is not counted
        if unique_values == 2:  # exactly two categories -> binary 0/1 column
            labelEncoder = LabelEncoder()
            data[column] = labelEncoder.fit_transform(data[column])
            label_encoders[column] = labelEncoder

print(data.head())

      ID  loan_amnt  funded_amnt  funded_amnt_inv  term  int_rate  \
0  24341    12500.0      12500.0          12500.0     0      7.21   
1  67534    33850.0      33850.0          33775.0     1     20.99   
2  35080    10000.0      10000.0          10000.0     1     20.00   
3   4828    20250.0      20250.0          20250.0     0     14.31   
4  59259    25000.0      25000.0          25000.0     0     14.99   

   installment emp_length home_ownership  annual_inc  ...  \
0       387.17   < 1 year       MORTGAGE     81000.0  ...   
1       915.57     1 year       MORTGAGE     80000.0  ...   
2       264.94   < 1 year           RENT     36580.0  ...   
3       695.15    9 years           RENT     48700.0  ...   
4       866.52     1 year       MORTGAGE     85000.0  ...   

  debt_settlement_flag issue_d_month  issue_d_year earliest_cr_line_month  \
0                    0             6          2018                      6   
1                    0            10          2015              