# Imports

In [1]:
# Basics
import pandas as pd  # for data manipulation and analysis
import numpy as np   # for numerical operations

# Sklearn
from sklearn.preprocessing import LabelEncoder # Label Encoder
from sklearn.model_selection import train_test_split # Split the data into training/validation sets
from sklearn.preprocessing import StandardScaler # to scale the features

# Load Data

First thing is to load in the data and take a quick peek.

In [2]:
# Load the training dataset using a path relative to the notebook's working directory
# (the notebooks/ folder, which sits next to data/), so the notebook is not tied to one
# user's machine.
# low_memory=False parses the file in a single pass so each column gets one consistent dtype
df_train = pd.read_csv(
    "../data/train.csv",
    low_memory=False
)

We set `low_memory=False` since a column was found to have mixed types.
- This tells pandas to parse the whole file in one pass instead of in internal chunks, so each column's type is inferred once and consistently
- This usually resolves the mixed-type warning without changing anything else.

In [3]:
# NOTE(review): this preview prints the Name and SSN columns — consider clearing the
# cell output before sharing the notebook
df_train.head()  # display the first 5 rows of the dataset

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


Note:
- ID, Customer_ID, and Name are unique identifiers or personal information, which isn't useful for modeling. These will likely be dropped in the cleaning step. Similarly for SSN.
- We can see that one row has -500 for age, which clearly can't be right. We'll need to clean this column and possibly filter out extreme values.
- Some rows are shown to have missing values. We'll inspect this more and determine a course of action.
- Credit_Mix has a value of `_`, which could be an error.
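- Payment_of_Min_Amount shows "Yes"/"No" in these rows — a candidate for binary encoding, though the object-column summary further down reports 3 unique values, so a third category has to be handled.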
- Payment_Behaviour has long string patterns — may be too granular for a baseline model.

# Initial Inspection

Next, we do our initial inspection of the dataset.

In [4]:
# Numeric-looking columns that are reported as object here contain non-numeric strings
df_train.info()  # shows column names, non-null counts, and data types

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      90015 non-null   object 
 4   Age                       100000 non-null  object 
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  object 
 8   Monthly_Inhand_Salary     84998 non-null   float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  object 
 13  Type_of_Loan              88592 non-null   ob

Note:
- We have 28 columns comprised of object, int64, and float64 dtypes.
- 20 variables are objects, though many look like they should be numeric.
- We can already see that some variables have missing values.

We'll need to convert the numeric-looking object columns to numeric types. Missing values can be imputed or dropped. Categorical variables will need to be encoded.

In [5]:
df_train.describe()  # summary stats for the columns currently typed as numeric (int64/float64) only

Unnamed: 0,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Delay_from_due_date,Num_Credit_Inquiries,Credit_Utilization_Ratio,Total_EMI_per_month
count,84998.0,100000.0,100000.0,100000.0,100000.0,98035.0,100000.0,100000.0
mean,4194.17085,17.09128,22.47443,72.46604,21.06878,27.754251,32.285173,1403.118217
std,3183.686167,117.404834,129.05741,466.422621,14.860104,193.177339,5.116875,8306.04127
min,303.645417,-1.0,0.0,1.0,-5.0,0.0,20.0,0.0
25%,1625.568229,3.0,4.0,8.0,10.0,3.0,28.052567,30.30666
50%,3093.745,6.0,5.0,13.0,18.0,6.0,32.305784,69.249473
75%,5957.448333,7.0,7.0,20.0,28.0,9.0,36.496663,161.224249
max,15204.633333,1798.0,1499.0,5797.0,67.0,2597.0,50.0,82331.0


Note:
- We see evidence of some extreme outliers.
- Negative values in Num_Bank_Accounts and Delay_from_due_date are likely invalid
- Monthly_Inhand_Salary has a wide range and missing values

We'll have to handle the outliers, either through clipping or filtering. Negative values in columns that should be non-negative will need correction or removal. Scaling will be essential due to wide value ranges.

In [6]:
df_train.describe(include='object')  # summary stats for non-numeric columns (count, unique values, top value, frequency)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Num_of_Loan,Type_of_Loan,Num_of_Delayed_Payment,Changed_Credit_Limit,Credit_Mix,Outstanding_Debt,Credit_History_Age,Payment_of_Min_Amount,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
count,100000,100000,100000,90015,100000,100000,100000,100000.0,100000,88592,92998,100000,100000,100000.0,90970,100000,95521,100000,98800,100000
unique,100000,12500,8,10139,1788,12501,16,18940.0,434,6260,749,4384,4,13178.0,404,3,91049,7,98792,3
top,0x1602,CUS_0xd40,January,Langep,38,#F%$D@*&8,_______,36585.12,3,Not Specified,19,_,Standard,1360.45,15 Years and 11 Months,Yes,__10000__,Low_spent_Small_value_payments,__-333333333333333333333333333__,Standard
freq,1,8,12500,44,2833,5572,7062,16.0,14386,1408,5327,2091,36479,24.0,446,52326,4305,25513,9,53174


Note:
- Many columns have high cardinality.
- Columns like Payment_Behaviour and Type_of_Loan may be too granular or messy for a baseline model — we might drop them.
- Credit_History_Age will need to be converted to total months.
- Placeholder values like _ should be replaced or treated as missing.

In [7]:
df_train["Credit_Score"].value_counts()  # row counts for each class of the target column, most frequent first

Credit_Score
Standard    53174
Poor        28998
Good        17828
Name: count, dtype: int64

Note:
- The classes are moderately imbalanced: Standard dominates and Good is the smallest class.
- We'll map Credit_Score to a binary output: Poor to 1 (high risk) and Standard/Good to 0 (low risk), which makes Poor the minority class (about 29% of rows).
- May need to monitor class imbalance during training (e.g., use class weights or a stratified split).

In [8]:
df_train.isna().sum().sort_values(ascending=False)  # count missing values per column, largest first

Monthly_Inhand_Salary       15002
Type_of_Loan                11408
Name                         9985
Credit_History_Age           9030
Num_of_Delayed_Payment       7002
Amount_invested_monthly      4479
Num_Credit_Inquiries         1965
Monthly_Balance              1200
ID                              0
Changed_Credit_Limit            0
Payment_Behaviour               0
Total_EMI_per_month             0
Payment_of_Min_Amount           0
Credit_Utilization_Ratio        0
Outstanding_Debt                0
Credit_Mix                      0
Delay_from_due_date             0
Customer_ID                     0
Num_of_Loan                     0
Interest_Rate                   0
Num_Credit_Card                 0
Num_Bank_Accounts               0
Annual_Income                   0
Occupation                      0
SSN                             0
Age                             0
Month                           0
Credit_Score                    0
dtype: int64

We can see that several columns have missing values.
- Columns with moderate missingness can be imputed (e.g., median for numeric, mode for categorical).
- Columns with high missingness and low modeling value (e.g., Name, Type_of_Loan) may be dropped.

In [9]:
# NOTE(review): ID is unique for every row (100000 unique values in the summary above),
# so no two full rows can be identical while ID is present — repeat this check after the
# identifier columns are dropped if duplicate feature rows matter.
df_train.duplicated().sum()  # count of fully duplicated rows

0

No duplicates.

# Data Cleaning

## Drop Irrelevant Columns

Some columns don't contribute to prediction, may introduce noise, or house personal information. We'll start by removing them.

In [10]:
# Drop columns that are irrelevant, sensitive, or too complex for baseline modeling.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this cell
# idempotent: running it again after the columns are gone is a no-op, not a KeyError.
df_train = df_train.drop(columns=[
    "ID",                    # unique row identifier, not predictive
    "Customer_ID",           # repeated across months, not useful for modeling
    "Month",                 # temporal label, adds noise without time-series modeling
    "Name",                  # personal identifier, high cardinality, not predictive
    "SSN",                   # sensitive data, not useful for prediction
    "Type_of_Loan",          # multi-valued string, complex to encode
    "Payment_Behaviour"      # descriptive strings, hard to model without NLP
], errors="ignore")

In [11]:
# Preview again to confirm the identifier and free-text columns are gone
df_train.head()

Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Monthly_Balance,Credit_Score
0,23,Scientist,19114.12,1824.843333,3,4,3,4,3,7.0,...,4.0,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,312.49408867943663,Good
1,23,Scientist,19114.12,,3,4,3,4,-1,,...,4.0,Good,809.98,31.94496,,No,49.574949,118.28022162236736,284.62916249607184,Good
2,-500,Scientist,19114.12,,3,4,3,4,3,7.0,...,4.0,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,331.2098628537912,Good
3,23,Scientist,19114.12,,3,4,3,4,5,4.0,...,4.0,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,223.45130972736783,Good
4,23,Scientist,19114.12,1824.843333,3,4,3,4,6,,...,4.0,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,341.48923103222177,Good


## Convert Object-type Numeric Columns

Now we'll fix the columns that are stored as object but should be numeric. These likely contain formatting issues (some of which we can see from the table above).

In [12]:
# Columns that hold numbers but were read in as strings (object dtype)
numeric_object_cols = [
    "Age",                        # age in years
    "Annual_Income",              # yearly income
    "Num_of_Loan",                # number of loans held
    "Num_of_Delayed_Payment",     # count of late payments
    "Changed_Credit_Limit",       # change in credit limit
    "Outstanding_Debt",           # total debt
    "Amount_invested_monthly",    # monthly investment amount
    "Monthly_Balance"             # leftover funds after expenses
]

# Parse every listed column in one pass; any string that is not a valid number becomes NaN.
# NOTE(review): several of these columns had no missing values before this step yet show
# thousands of NaNs afterwards — confirm the discarded strings are genuinely unusable
# (e.g. placeholders) rather than numbers with stray characters that could be salvaged.
df_train[numeric_object_cols] = df_train[numeric_object_cols].apply(
    pd.to_numeric, errors='coerce'
)

In [13]:
df_train[numeric_object_cols].isna().sum()  # NaNs per converted column: values that were already missing plus those introduced by coercion

Age                        4939
Annual_Income              6980
Num_of_Loan                4785
Num_of_Delayed_Payment     9746
Changed_Credit_Limit       2091
Outstanding_Debt           1009
Amount_invested_monthly    8784
Monthly_Balance            1209
dtype: int64

## Handling Missing Values

We'll fill in missing values based on column type and context:
- Numeric columns will be filled using the median
- Categorical columns will be filled with mode

In [14]:
# Impute every float64/int64 column with its own median (robust to outliers).
# Medians are computed once up front, then applied column by column.
numeric_medians = df_train.select_dtypes(include=["float64", "int64"]).median()
for col, col_median in numeric_medians.items():
    df_train[col] = df_train[col].fillna(col_median)

In [15]:
# Impute categorical columns with their most frequent value.
# Credit_Mix: the "_" placeholder is first turned into NaN so it is imputed as well.
credit_mix = df_train["Credit_Mix"].replace("_", np.nan)
df_train["Credit_Mix"] = credit_mix.fillna(credit_mix.mode().iloc[0])

# Payment_of_Min_Amount: fill any missing entries with the mode
min_amount = df_train["Payment_of_Min_Amount"]
df_train["Payment_of_Min_Amount"] = min_amount.fillna(min_amount.mode().iloc[0])

In [16]:
df_train.isna().sum().sort_values(ascending=False)  # remaining missing values per column after imputation, largest first

Credit_History_Age          9030
Age                            0
Num_Credit_Inquiries           0
Monthly_Balance                0
Amount_invested_monthly        0
Total_EMI_per_month            0
Payment_of_Min_Amount          0
Credit_Utilization_Ratio       0
Outstanding_Debt               0
Credit_Mix                     0
Changed_Credit_Limit           0
Occupation                     0
Num_of_Delayed_Payment         0
Delay_from_due_date            0
Num_of_Loan                    0
Interest_Rate                  0
Num_Credit_Card                0
Num_Bank_Accounts              0
Monthly_Inhand_Salary          0
Annual_Income                  0
Credit_Score                   0
dtype: int64

Credit_History_Age slipped through because it’s still in text format, so it wasn’t included in the numeric imputation loop. Let's handle it now as a special case before we start encoding.
- We'll convert values into total months, then fill in missing values with the median.

In [17]:
# Convert "X Years and Y Months" to total months
def convert_credit_age(age_str):
    """Convert a credit-history string such as "22 Years and 1 Months" to total months.

    The string is split on whitespace; the first token is read as the number of
    years and the fourth token as the number of months.

    Parameters
    ----------
    age_str : str
        Text in the form "<years> Years and <months> Months". Missing values
        (NaN/None) and other non-string inputs are accepted.

    Returns
    -------
    int or float
        ``years * 12 + months`` when the text parses, otherwise ``np.nan``.
    """
    try:
        parts = age_str.split()
        years = int(parts[0])        # extract years
        months = int(parts[3])       # extract months
    except (AttributeError, IndexError, ValueError, TypeError):
        # AttributeError: not a string (e.g. NaN); IndexError: too few tokens;
        # ValueError/TypeError: tokens that are not integers.
        # Only these expected parse failures map to NaN — anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed.
        return np.nan
    return years * 12 + months       # convert to total months

In [18]:
# Replace each "X Years and Y Months" string with its total number of months
credit_history_text = df_train["Credit_History_Age"]
df_train["Credit_History_Age"] = credit_history_text.map(convert_credit_age)

In [19]:
# Impute the remaining gaps in Credit_History_Age with the column median
credit_age_median = df_train["Credit_History_Age"].median()
df_train["Credit_History_Age"] = df_train["Credit_History_Age"].fillna(credit_age_median)

In [20]:
df_train.isna().sum().sort_values(ascending=False)  # re-check after imputing Credit_History_Age: every column should now be 0

Age                         0
Num_Credit_Inquiries        0
Monthly_Balance             0
Amount_invested_monthly     0
Total_EMI_per_month         0
Payment_of_Min_Amount       0
Credit_History_Age          0
Credit_Utilization_Ratio    0
Outstanding_Debt            0
Credit_Mix                  0
Changed_Credit_Limit        0
Occupation                  0
Num_of_Delayed_Payment      0
Delay_from_due_date         0
Num_of_Loan                 0
Interest_Rate               0
Num_Credit_Card             0
Num_Bank_Accounts           0
Monthly_Inhand_Salary       0
Annual_Income               0
Credit_Score                0
dtype: int64

In [21]:
# Preview the cleaned frame: converted numeric columns and imputed values
df_train.head()

Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Monthly_Balance,Credit_Score
0,23.0,Scientist,19114.12,1824.843333,3,4,3,4.0,3,7.0,...,4.0,Standard,809.98,26.82262,265.0,No,49.574949,80.415295,312.494089,Good
1,23.0,Scientist,19114.12,3093.745,3,4,3,4.0,-1,14.0,...,4.0,Good,809.98,31.94496,219.0,No,49.574949,118.280222,284.629162,Good
2,-500.0,Scientist,19114.12,3093.745,3,4,3,4.0,3,7.0,...,4.0,Good,809.98,28.609352,267.0,No,49.574949,81.699521,331.209863,Good
3,23.0,Scientist,19114.12,3093.745,3,4,3,4.0,5,4.0,...,4.0,Good,809.98,31.377862,268.0,No,49.574949,199.458074,223.45131,Good
4,23.0,Scientist,19114.12,1824.843333,3,4,3,4.0,6,14.0,...,4.0,Good,809.98,24.797347,269.0,No,49.574949,41.420153,341.489231,Good


## Checkpoint

Now is a good time to reassess the dataset before we move forward.

In [22]:
df_train.dtypes  # shows current types for each column; columns still typed object have yet to be encoded

Age                         float64
Occupation                   object
Annual_Income               float64
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                 float64
Delay_from_due_date           int64
Num_of_Delayed_Payment      float64
Changed_Credit_Limit        float64
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt            float64
Credit_Utilization_Ratio    float64
Credit_History_Age          float64
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly     float64
Monthly_Balance             float64
Credit_Score                 object
dtype: object

In [23]:
df_train.shape  # returns (rows, columns); no rows have been removed by the cleaning steps

(100000, 21)

In [24]:
df_train.isna().sum().sort_values(ascending=False)  # checkpoint re-check: every column should report 0 missing values

Age                         0
Num_Credit_Inquiries        0
Monthly_Balance             0
Amount_invested_monthly     0
Total_EMI_per_month         0
Payment_of_Min_Amount       0
Credit_History_Age          0
Credit_Utilization_Ratio    0
Outstanding_Debt            0
Credit_Mix                  0
Changed_Credit_Limit        0
Occupation                  0
Num_of_Delayed_Payment      0
Delay_from_due_date         0
Num_of_Loan                 0
Interest_Rate               0
Num_Credit_Card             0
Num_Bank_Accounts           0
Monthly_Inhand_Salary       0
Annual_Income               0
Credit_Score                0
dtype: int64

# Encode Categorical Variables

Here, we'll convert the categorical variables (object dtype) into numeric format for our model.

In [25]:
# Encode 'Credit_Score' as binary classification target
# Map 'Poor' to 1 (high risk), and 'Standard'/'Good' to 0 (low risk).
# An explicit mapping replaces the previous "anything that is not 'Poor' becomes 0" rule,
# so unexpected labels or NaN raise an error instead of silently becoming class 0.
# The 0/1 identity entries make the cell safe to re-run on an already-encoded column
# (previously a second run turned every row into 0).
target_map = {'Poor': 1, 'Standard': 0, 'Good': 0, 0: 0, 1: 1}
encoded_target = df_train['Credit_Score'].map(target_map)

unmapped = encoded_target.isna()
if unmapped.any():
    unexpected = df_train.loc[unmapped, 'Credit_Score'].unique()
    raise ValueError(f"Unexpected Credit_Score values: {unexpected}")

df_train['Credit_Score'] = encoded_target.astype('int64')

# Confirm encoding
print("Encoded target values:", df_train['Credit_Score'].unique())  # should show [0, 1]

Encoded target values: [0 1]


In [26]:
# Collect the names of every column still stored as object dtype (the categorical features)
cat_cols = list(df_train.select_dtypes(include='object').columns)

# Show which columns are about to be encoded
print("Categorical columns to encode:", cat_cols)

Categorical columns to encode: ['Occupation', 'Credit_Mix', 'Payment_of_Min_Amount']


In [27]:
# Fitted encoder per categorical column, kept so the same codes can be applied later
# (e.g., test data transformation).
# NOTE(review): scikit-learn documents LabelEncoder as intended for target labels;
# confirm it is the desired choice for input features here.
encoders = {}

for col in cat_cols:
    # Learn the category -> integer mapping for this column and remember it
    encoders[col] = LabelEncoder().fit(df_train[col])

    # Replace the original strings with their integer codes
    df_train[col] = encoders[col].transform(df_train[col])

Notice that we're using the LabelEncoder here:
- It's fast and simple, converting with minimal overhead.
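- It keeps the feature space small
- It's very compatible with neural networks
- While integer codes do imply an (arbitrary) ordinal relationship, a neural network with nonlinear layers is not restricted to a linear effect of the encoded values and can learn around that ordering — though one-hot or embedding encodings would avoid imposing it at all.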
- The categorical features are mostly nominal, and we’re not relying on ordinal semantics.
- We’re keeping the pipeline lean and interpretable — no explosion of columns like with one-hot encoding

In [28]:
# Look only at the columns that were just encoded
encoded_preview = df_train[cat_cols]

# Their dtypes should now be integer rather than object
print(encoded_preview.dtypes)

# First few rows of the integer codes
print(encoded_preview.head())

Occupation               int32
Credit_Mix               int32
Payment_of_Min_Amount    int32
dtype: object
   Occupation  Credit_Mix  Payment_of_Min_Amount
0          12           2                      1
1          12           1                      1
2          12           1                      1
3          12           1                      1
4          12           1                      1


In [29]:
# Report how many distinct codes each encoded column ended up with.
# Iterating over cat_cols (instead of a hard-coded copy of its contents) keeps this
# check in step with whichever columns were actually encoded.
for col in cat_cols:
    print(f"{col}: {df_train[col].nunique()} unique values")

Occupation: 16 unique values
Credit_Mix: 3 unique values
Payment_of_Min_Amount: 3 unique values


Looks like we were successful!

# Split the Dataset

Next up, we'll split up the dataset into training/validation sets

In [30]:
# Split the frame into the target column and everything else
target_col = 'Credit_Score'
y = df_train[target_col]                  # binary target (0 or 1)
X = df_train.drop(columns=[target_col])   # all remaining columns are features

In [31]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class ratio the
# same in both parts, and the fixed random_state makes the split reproducible.
# NOTE(review): rows are split individually, and the raw data held several monthly rows
# per Customer_ID (dropped earlier), so the same customer can end up in both sets —
# confirm this is acceptable for the validation estimate.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [32]:
# Class proportions within each part of the split
train_distribution = y_train.value_counts(normalize=True)
val_distribution = y_val.value_counts(normalize=True)

# Report sizes first, then the class proportions
print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)
print("Training target distribution:\n", train_distribution)
print("Validation target distribution:\n", val_distribution)

Training set shape: (80000, 20)
Validation set shape: (20000, 20)
Training target distribution:
 Credit_Score
0    0.710025
1    0.289975
Name: proportion, dtype: float64
Validation target distribution:
 Credit_Score
0    0.71
1    0.29
Name: proportion, dtype: float64


In [33]:
# Share of each class across the whole dataset, expressed in percent
class_share = df_train['Credit_Score'].value_counts(normalize=True)
print("Full dataset class balance (%):")
print(class_share * 100)

Full dataset class balance (%):
Credit_Score
0    71.002
1    28.998
Name: proportion, dtype: float64


# Feature Scaling

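Here, we'll standardize the features (zero mean, unit variance) so they're on a similar scale; the scaler is fit on every column of `X_train`, so the label-encoded categorical columns are scaled too. This will help our neural network to converge faster and more reliably.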

In [34]:
# Learn the per-column mean and standard deviation from the training data only
# NOTE(review): extreme values are still present in the features (e.g. the Age of -500
# visible in the cleaned preview), and they influence the fitted mean/std — confirm that
# skipping outlier handling before scaling is intended.
scaler = StandardScaler().fit(X_train)

# Apply those training statistics to both splits
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [35]:
# Wrap each scaled array in a DataFrame that reuses the column names and row index
# of the unscaled split it came from
X_train_scaled, X_val_scaled = (
    pd.DataFrame(values, columns=source.columns, index=source.index)
    for values, source in ((X_train_scaled, X_train), (X_val_scaled, X_val))
)

# Export

In [36]:
# Make sure both target Series carry the column name used as the CSV header
for target in (y_train, y_val):
    target.name = "Credit_Score"

# Output file -> object to write; features first, then targets
exports = {
    "X_train_scaled.csv": X_train_scaled,   # scaled training features
    "X_val_scaled.csv": X_val_scaled,       # scaled validation features
    "y_train.csv": y_train,                 # training target labels
    "y_val.csv": y_val,                     # validation target labels
}

# Write each file without the index column
for filename, table in exports.items():
    table.to_csv(filename, index=False)

In [39]:
# Report the dimensions of everything that was exported, one line per object
shape_report = (
    f"X_train shape: {X_train_scaled.shape}",
    f"X_val shape: {X_val_scaled.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_val shape: {y_val.shape}",
)
print("\n".join(shape_report))

X_train shape: (80000, 20)
X_val shape: (20000, 20)
y_train shape: (80000,)
y_val shape: (20000,)


We use joblib to serialize the StandardScaler object. This allows us to apply the exact same scaling to future data (e.g., test sets or real-world inputs) without refitting

In [37]:
# Save the fitted preprocessing objects for future use (e.g., test data or deployment)
import joblib

# The label encoders were kept in `encoders` for later reuse but were never written to
# disk; without them the categorical codes cannot be reproduced on new data, so they
# are persisted alongside the scaler.
joblib.dump(encoders, "label_encoders.pkl")

# Kept as the last expression so the cell output still lists the scaler file
joblib.dump(scaler, "standard_scaler.pkl")  # Saves as a binary file

['standard_scaler.pkl']

In [38]:
# Print the current working directory — the relative file names used for the exported
# CSV and pickle files resolve against it
import os
print(os.getcwd())

C:\Users\barbe\OneDrive\URI DS Program\566 Advanced Topics in Machine Learning\Project\neural-credit-risk-explainer\notebooks
