In [17]:
from scipy.io import arff
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder


## Basic Data Exploration

In [18]:
# Load the loan-10k CSV into a DataFrame.
data = pd.read_csv('./datasets_kaggle/loan-10k/loan-10k.lrn.csv')

# Count nulls per column and keep only the columns that have at least one.
missing_values = data.isnull().sum()
missing_features = missing_values[missing_values > 0]
print("Features with missing values:")
print(missing_features)

# Collect one record per column that has fewer than 30 distinct values.
features_info = []
for column in data.columns:
    distinct = data[column].unique()
    if len(distinct) >= 30:
        continue
    features_info.append({
        'Features': column,
        'Num_Unique_Values': len(distinct),
        'Unique_Values': distinct,
    })

# Tabulate the records, lowest cardinality first, then persist and show them.
features_info_df = pd.DataFrame(features_info).sort_values(by='Num_Unique_Values')
features_info_df.to_csv('features_unique_val.csv', index=False)
print(features_info_df)


Features with missing values:
Series([], dtype: int64)
                      Features  Num_Unique_Values  \
12                 policy_code                  1   
0                         term                  2   
13            application_type                  2   
10         initial_list_status                  2   
30         disbursement_method                  2   
31        debt_settlement_flag                  2   
29               hardship_flag                  2   
5                   pymnt_plan                  2   
23            num_tl_120dpd_2m                  2   
3          verification_status                  3   
14              acc_now_delinq                  3   
24                num_tl_30dpd                  3   
16    chargeoff_within_12_mths                  4   
11  collections_12_mths_ex_med                  4   
2               home_ownership                  5   
4                  loan_status                  6   
27        pub_rec_bankruptcies              

## Preprocessing

In [21]:
# Encode every object-dtype column of `data` as numbers.
#
# Per column, in this order of precedence:
#   * exactly one distinct value                 -> column is dropped
#   * listed in `feature_mappings`               -> hand-written integer codes
#   * <= `one_hot_encoding_limit` distinct values -> one-hot encoded
#   * anything else                              -> LabelEncoder integer codes
# The encoded frame is written to 'encoded_data.csv'.

feature_mappings = {
    'term': {'36 months': 0, '60 months': 1},
    'application_type': {'Individual': 0, 'Joint App': 1},
    'verification_status': {'Not Verified': 0, 'Verified': 1, 'Source Verified': 2},
    'home_ownership': {'RENT': 0, 'MORTGAGE': 1, 'OTHER': 2, 'ANY': 3, 'OWN': 4},
    'loan_status': {'Current': 5, 'Fully Paid': 4, 'Charged Off': 3, 'Late (31-120 days)': 2, 'In Grace Period': 1, 'Late (16-30 days)': 0},
    # NOTE(review): these codes are not monotonic in grade order
    # (A=6, E=5, D=4, C=3, B=2, G=1, F=0) - confirm this ordering is intended.
    'grade': {'A': 6, 'E': 5, 'D': 4, 'C': 3, 'B': 2, 'G': 1, 'F': 0},
    'emp_length': {'10+ years': 10, '< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3, '4 years': 4, '5 years': 5, '6 years': 6, '7 years': 7, '8 years': 8, '9 years': 9},
}

# Columns with at most this many distinct values are one-hot encoded.
one_hot_encoding_limit = 10

label_encoder = LabelEncoder()
ordinal_encoder = OrdinalEncoder()  # not used below; kept so the name stays defined
one_hot_encoder = OneHotEncoder()

# Iterate over a snapshot of the column labels: `data` is rebound inside the
# loop, and the freshly added one-hot columns must not be revisited.
for feature in list(data.columns):
    if data[feature].dtype == 'object':  # Only categorical (string) columns are encoded
        unique_values = data[feature].nunique()  # Distinct non-null values in the column

        # Drop any feature which only has 1 distinct value (adds no information).
        # `columns=` is required: a bare `drop(feature)` targets row labels and
        # raises KeyError, and the result must be assigned back to take effect.
        if unique_values == 1:
            data = data.drop(columns=[feature])
            continue

        # If a mapping is defined, apply the hand-written integer codes.
        if feature in feature_mappings:
            mapping = feature_mappings[feature]
            encoded = data[feature].map(mapping)
            # `.map` turns categories missing from the mapping into NaN;
            # report them instead of letting them slip through silently.
            unmapped = data.loc[encoded.isna() & data[feature].notna(), feature].unique()
            if len(unmapped) > 0:
                print(f"Warning: '{feature}' has values without a mapping (encoded as NaN): {list(unmapped)}")
            data[feature] = encoded
        else:
            if unique_values <= one_hot_encoding_limit:
                encoded_features = one_hot_encoder.fit_transform(data[[feature]])
                # Reuse data's index so the column-wise concat aligns row by
                # row even when `data` does not carry a default RangeIndex.
                encoded_features_df = pd.DataFrame(
                    encoded_features.toarray(),
                    columns=one_hot_encoder.get_feature_names_out([feature]),
                    index=data.index,
                )
                # Replace the original column with its indicator columns,
                # which are appended after the remaining columns.
                data = pd.concat([data.drop(columns=[feature]), encoded_features_df], axis=1)
            else:
                # High-cardinality column: integer codes assigned by LabelEncoder.
                data[feature] = label_encoder.fit_transform(data[feature])

data.to_csv('encoded_data.csv', index=False)