# Feature Engineering

In [None]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [15]:
# Read the cleaned credit data (output of the earlier cleaning step) into `df`.
cleaned_data_path = 'C:/Users/abdou/Documents/Data_Science_Projects/AbdoulT_DSPortfolio/Credit Risk Modeling Projects/Credit Default Prediction Using Logistic Regression and XGBoost/data/cleaned_credit_data.csv'
df = pd.read_csv(cleaned_data_path)

# Features Engineered
pay_avg: Avg. of the 6 past repayment statuses (pay_1 to pay_6)

bill_avg: Avg. of past 6 bill amounts

pay_amt_avg: Avg. of past 6 payments made

credit_utilization: Ratio of the average amount paid to the average bill amount, showing how much of their credit card bill a client typically pays off. 
It is an important indicator of credit risk and financial responsibility.

utilization: Proxy for credit risk behavior. It measures what proportion of the available credit limit was used by the client in the latest month.

Standard scaling and SMOTE applied

In [16]:
# Target Variable
target = 'default'

# Column groups for the six numbered history columns (suffixes 1..6).
pay_status_cols = [f'pay_{i}' for i in range(1, 7)]
bill_amt_cols = [f'bill_amt{i}' for i in range(1, 7)]
pay_amt_cols = [f'pay_amt{i}' for i in range(1, 7)]

# Row-wise averages across each group of six columns.
df['pay_avg'] = df[pay_status_cols].mean(axis=1)
df['bill_avg'] = df[bill_amt_cols].mean(axis=1)
df['pay_amt_avg'] = df[pay_amt_cols].mean(axis=1)

# Credit Utilization: average payment divided by average bill.
# A zero average bill would produce +/-inf (or NaN for 0/0); masking the zero
# denominator to NaN keeps infinities out of the DataFrame that is later
# written to disk, so the value is simply "missing" for those rows.
df['credit_utilization'] = df['pay_amt_avg'] / df['bill_avg'].replace(0, np.nan)

# Credit Utilization Ratio for most recent month.
# The +1 in the denominator presumably guards against a zero credit limit.
df['utilization'] = df['bill_amt1'] / (df['limit_bal'] + 1)

# Drop ID column and irrelevant features (ignored if already absent).
df.drop(columns=['id'], errors='ignore', inplace=True)

# Drop EDA columns (ignored if already absent).
eda_columns = ['education_level', 'relationship_status', 'age_group', 'credit_card_limit']
df.drop(columns=eda_columns, errors='ignore', inplace=True)

In [18]:
# Define Features and Target
x = df.drop(columns=[target])
y = df[target]

# Treat infinite values as missing so they can be imputed below.
x = x.replace([np.inf, -np.inf], np.nan)

# Split FIRST. Imputation statistics, the scaler and SMOTE must only ever see
# the training fold; otherwise information from the test rows (and synthetic
# points interpolated from them) leaks into training and inflates the scores.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=85, stratify=y
)

# Impute missing values with the TRAINING means, applied to both folds.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Feature Scaling: fit on the training fold, then apply the same transform
# to the test fold.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=x.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=x.columns)

# The scaled frames have a fresh 0..n-1 index; align the targets with them.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Apply SMOTE for handling class imbalance -- training fold only, so the test
# fold keeps the real class distribution and contains no synthetic rows.
smote = SMOTE(random_state=85)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

# Save the processed data
data_dir = 'C:/Users/abdou/Documents/Data_Science_Projects/AbdoulT_DSPortfolio/Credit Risk Modeling Projects/Credit Default Prediction Using Logistic Regression and XGBoost/data'
X_train.to_csv(f'{data_dir}/X_train.csv', index=False)
X_test.to_csv(f'{data_dir}/X_test.csv', index=False)
y_train.to_csv(f'{data_dir}/y_train.csv', index=False)
y_test.to_csv(f'{data_dir}/y_test.csv', index=False)

# Save the feature-engineered (unscaled, unresampled) DataFrame to a new CSV file
df.to_csv(f'{data_dir}/processed_credit_data.csv', index=False)

# Display the first few rows of the processed DataFrame
print(df.head())

# Display the shape of the training and testing sets
print(f"Training set shape: {X_train.shape}, {y_train.shape}")
print(f"Testing set shape: {X_test.shape}, {y_test.shape}")

   limit_bal  sex  education  marriage  age  pay_1  pay_2  pay_3  pay_4  \
0    20000.0    2          2         1   24      2      2     -1     -1   
1   120000.0    2          2         2   26     -1      2      0      0   
2    90000.0    2          2         2   34      0      0      0      0   
3    50000.0    2          2         1   37      0      0      0      0   
4    50000.0    1          2         1   57     -1      0     -1      0   

   pay_5  ...  pay_amt3  pay_amt4  pay_amt5  pay_amt6  default   pay_avg  \
0     -2  ...       0.0       0.0       0.0       0.0        1 -0.333333   
1      0  ...    1000.0    1000.0       0.0    2000.0        1  0.500000   
2      0  ...    1000.0    1000.0    1000.0    5000.0        0  0.000000   
3      0  ...    1200.0    1100.0    1069.0    1000.0        0  0.000000   
4      0  ...   10000.0    9000.0     689.0     679.0        0 -0.333333   

       bill_avg  pay_amt_avg  credit_utilization  utilization  
0   1284.000000   114.833333