# Explanatory Data Analysis

In [21]:
# Imports
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns

In [28]:
# Load Dataset
df = pd.read_csv('C:/Users/abdou/Documents/Data_Science_Projects/AbdoulT_DSPortfolio/Credit Risk Modeling Projects/Credit Default Prediction Using Logistic Regression and XGBoost/data/UCI_Credit_Card.csv', header=0)
df.rename(columns={'default payment next month': 'default'}, inplace=True)

In [29]:
# Quick Overview
print(df.shape)
print(df.dtypes)
df.head()

(30000, 25)
ID                              int64
LIMIT_BAL                     float64
SEX                             int64
EDUCATION                       int64
MARRIAGE                        int64
AGE                             int64
PAY_0                           int64
PAY_2                           int64
PAY_3                           int64
PAY_4                           int64
PAY_5                           int64
PAY_6                           int64
BILL_AMT1                     float64
BILL_AMT2                     float64
BILL_AMT3                     float64
BILL_AMT4                     float64
BILL_AMT5                     float64
BILL_AMT6                     float64
PAY_AMT1                      float64
PAY_AMT2                      float64
PAY_AMT3                      float64
PAY_AMT4                      float64
PAY_AMT5                      float64
PAY_AMT6                      float64
default.payment.next.month      int64
dtype: object


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [30]:
# Check for Missing Values
missing_values = df.isnull().sum()
print("Missing values:\n", missing_values[missing_values > 0])

Missing values:
 Series([], dtype: int64)


In [None]:
# Rename Columns (Optional cleanup)
df.columns = [col.lower().replace(" ", "_") for col in df.columns]
df.rename(columns={'PAY_0': 'PAY_1'}, inplace=True)  # Match naming pattern

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns   

In [None]:
# Target Variable Distribution
sns.countplot(x='default', data=df)
plt.title("Target Variable Distribution")
plt.xlabel("Default (1 = Yes, 0 = No)")
plt.ylabel("Count")
plt.show()

In [None]:
# Summary Statistics
df.describe()

In [None]:
# Visualize the first few rows of the dataset
print(df.head())        

# Visualize the data types of each column
print(df.dtypes)

# Visualize the distribution of numerical features
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()

for feature in numerical_features:
    plt.figure(figsize=(10, 5))
    sns.histplot(df[feature], bins=30, kde=True)
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()

# Visualize the correlation matrix
correlation_matrix = df.corr()
plt.figure(figsize=(12, 10))    

sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Correlation Matrix')
plt.show()

# Visualize the target variable distribution
plt.figure(figsize=(6, 4)) 
sns.countplot(x='default', data=df, palette='Set2')
plt.title('Distribution of Default Payments')
plt.xlabel('Default Payment Next Month')
plt.ylabel('Count')
plt.xticks([0, 1], ['No Default', 'Default'])
plt.show()

# Visualize the relationship between numerical features and the target variable
for feature in numerical_features:
    plt.figure(figsize=(10, 5))
    sns.boxplot(x='default', y=feature, data=df, palette='Set2')
    plt.title(f'{feature} vs Default Payment Next Month')
    plt.xlabel('Default Payment Next Month')
    plt.ylabel(feature)
    plt.xticks([0, 1], ['No Default', 'Default'])
    plt.show()

# Visualize the relationship between categorical features and the target variable
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

for feature in categorical_features:
    plt.figure(figsize=(10, 5))
    sns.countplot(x=feature, hue='default', data=df, palette='Set2')
    plt.title(f'{feature} vs Default Payment Next Month')
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.legend(title='Default Payment Next Month', loc='upper right', labels=['No Default', 'Default'])
    plt.show()

# Visualize the distribution of categorical features
for feature in categorical_features:
    plt.figure(figsize=(10, 5))
    sns.countplot(x=feature, data=df, palette='Set2')
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.show()

# Visualize the relationship between categorical features and numerical features
for feature in categorical_features:
    for num_feature in numerical_features:
        plt.figure(figsize=(10, 5))
        sns.boxplot(x=feature, y=num_feature, data=df, palette='Set2')
        plt.title(f'{num_feature} vs {feature}')
        plt.xlabel(feature)
        plt.ylabel(num_feature)
        plt.xticks(rotation=45)
        plt.show()

# Visualize the relationship between categorical features and the target variable
for feature in categorical_features:
    plt.figure(figsize=(10, 5))
    sns.countplot(x=feature, hue='default', data=df, palette='Set2')
    plt.title(f'{feature} vs Default Payment Next Month')
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.legend(title='Default Payment Next Month', loc='upper right', labels=['No Default', 'Default'])
    plt.xticks(rotation=45)
    plt.show()

# Visualize the relationship between numerical features and the target variable
for feature in numerical_features:
    plt.figure(figsize=(10, 5))
    sns.boxplot(x='default', y=feature, data=df, palette='Set2')
    plt.title(f'{feature} vs Default Payment Next Month')
    plt.xlabel('Default Payment Next Month')
    plt.ylabel(feature)
    plt.xticks([0, 1], ['No Default', 'Default'])
    plt.show()

# Visualize the relationship between categorical features and numerical features
for feature in categorical_features:
    for num_feature in numerical_features:
        plt.figure(figsize=(10, 5))
        sns.boxplot(x=feature, y=num_feature, data=df, palette='Set2')
        plt.title(f'{num_feature} vs {feature}')
        plt.xlabel(feature)
        plt.ylabel(num_feature)
        plt.xticks(rotation=45)
        plt.show()

# Visualize the relationship between categorical features and the target variable
for feature in categorical_features:
    plt.figure(figsize=(10, 5))
    sns.countplot(x=feature, hue='default', data=df, palette='Set2')
    plt.title(f'{feature} vs Default Payment Next Month')
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.legend(title='Default Payment Next Month', loc='upper right', labels=['No Default', 'Default'])
    plt.xticks(rotation=45)
    plt.show()

# Visualize the relationship between numerical features and the target variable
for feature in numerical_features:
    plt.figure(figsize=(10, 5))
    sns.boxplot(x='default', y=feature, data=df, palette='Set2')
    plt.title(f'{feature} vs Default Payment Next Month')
    plt.xlabel('Default Payment Next Month')
    plt.ylabel(feature)
    plt.xticks([0, 1], ['No Default', 'Default'])
    plt.show()