In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [None]:
# Read the SAS dataset into a DataFrame
file_path = 'C:/Users/khtur/Desktop/Eas/Models/abt_app.sas7bdat'  # Replace with your file path
data = pd.read_sas(file_path)

# Pull out the target column and keep everything else as features
y = data['default12']  # Replace 'default12' with your actual target column
X = data.drop(columns='default12')

# Keep only rows whose target is present; one mask is reused for both objects
# so features and labels stay row-aligned
has_target = y.notna()
X_cleaned = X[has_target]
y_cleaned = y[has_target]

# Map the observed target values onto consecutive integer labels (0, 1, ...)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_cleaned)

# Rows per encoded class, to gauge class imbalance
class_counts = np.bincount(y_encoded)
print(class_counts)


In [None]:
from sklearn.impute import SimpleImputer

# Step 1: Separate numeric and categorical columns
numeric_cols = X_cleaned.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_cleaned.select_dtypes(include=[object]).columns.tolist()

# Step 2: Split the raw rows BEFORE fitting any transformer. Imputation values,
# scaling statistics and the category vocabulary must be learned from the
# training rows only; fitting them on all rows leaks validation information
# into training. Stratifying keeps the class ratio equal in both partitions,
# which matters because the target is imbalanced.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_cleaned, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Step 3: Define the preprocessing pipeline with imputation
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values for numeric features
            ('scaler', StandardScaler()),  # Scale numeric features
        ]), numeric_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values for categorical features
            # handle_unknown='ignore': a category that appears only in the
            # validation rows is encoded as all zeros instead of raising.
            # NOTE(review): combining it with drop='first' needs scikit-learn >= 1.0 - confirm.
            ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
        ]), categorical_cols),
    ],
    sparse_threshold=0,  # Always return a dense ndarray (no manual .toarray() needed)
)

# Step 4: Fit on the training rows only, then apply the fitted transform to validation
X_train_scaled = preprocessor.fit_transform(X_train_raw)
X_val_scaled = preprocessor.transform(X_val_raw)

# All cleaned rows in their original order, transformed with the train-fitted
# preprocessor; kept so any later cell that reads X_transformed still works
X_transformed = preprocessor.transform(X_cleaned)

# Print shapes of transformed data
print(f"Train data shape: {X_train_scaled.shape}")
print(f"Validation data shape: {X_val_scaled.shape}")


In [None]:
from sklearn.decomposition import PCA

# Retain up to 100 principal components. PCA cannot extract more components
# than min(n_samples, n_features), so cap the request to what the training
# matrix supports instead of raising a ValueError on smaller data.
n_components = min(100, *X_train_scaled.shape)

# random_state makes the result reproducible when the randomized SVD solver is selected
pca = PCA(n_components=n_components, random_state=42)

# Fit PCA on the training data and transform both train and validation sets
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)

# Check the explained variance to ensure we're capturing enough information
explained_variance = pca.explained_variance_ratio_.sum()
print(f"Total explained variance by {pca.n_components_} components: {explained_variance:.2f}")

# Print the new shape of the transformed dataset
print(f"Shape of PCA-transformed train data: {X_train_pca.shape}")
print(f"Shape of PCA-transformed validation data: {X_val_pca.shape}")


In [None]:
from tensorflow.keras.regularizers import l2

def build_and_train_model(X_train, y_train, X_val, y_val, epochs=100, batch_size=32,
                          learning_rate=0.0001, patience=20):
    """Build a dense binary classifier and train it with early stopping.

    The network is three ReLU hidden layers (128, 64, 32 units) with L2 weight
    regularization, dropout after the first two, and a single sigmoid output.
    Training uses 'balanced' class weights computed from ``y_train``.

    Args:
        X_train: Training feature matrix; ``X_train.shape[1]`` sets the input width.
        y_train: Training labels; their unique values become the class-weight keys
            (cast to ``int``).
        X_val: Validation feature matrix, used for ``val_loss`` monitoring.
        y_val: Validation labels.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        learning_rate: Learning rate for the Adam optimizer.
        patience: Epochs without ``val_loss`` improvement before early stopping.

    Returns:
        tuple: ``(model, history)`` - the trained model and the object returned
        by ``model.fit``.
    """
    model = Sequential([
        Input(shape=(X_train.shape[1],)),  # Input width follows the feature matrix
        Dense(128, activation='relu', kernel_regularizer=l2(0.01)),  # L2 regularization
        Dropout(0.4),  # Dropout to help prevent overfitting
        Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
        Dropout(0.4),
        Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy',
                  metrics=['accuracy'])

    # restore_best_weights=True: the returned model carries the weights of the
    # epoch with the lowest val_loss, not those of the last epoch run
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience,
                                   restore_best_weights=True, verbose=1)

    # Calculate class weights based on encoded labels (unique classes computed once)
    classes = np.unique(y_train)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weight_dict = {int(cls): weight for cls, weight in zip(classes, class_weights)}

    # Train the model
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                        epochs=epochs, batch_size=batch_size, callbacks=[early_stopping],
                        class_weight=class_weight_dict, verbose=1)

    # Only report early stopping when it actually fired: the callback leaves
    # stopped_epoch at 0 when training ran through every epoch
    epochs_run = len(history.epoch)
    if early_stopping.stopped_epoch > 0:
        print(f"Training stopped at epoch {epochs_run} due to early stopping")
    else:
        print(f"Training completed all {epochs_run} epochs without early stopping")

    return model, history

# Fit the network on the PCA-reduced features; keep both the model and its training history
model, history = build_and_train_model(
    X_train=X_train_pca,
    y_train=y_train,
    X_val=X_val_pca,
    y_val=y_val,
)


In [None]:
# Score the validation rows with the trained network, flattened to a 1-D vector
y_val_prob = model.predict(X_val_pca).ravel()

# ROC operating points and the area under the resulting curve
fpr, tpr, _ = roc_curve(y_val, y_val_prob)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve together with the dashed chance diagonal
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, color='blue', label=f'ROC curve (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC)')
roc_ax.legend(loc='lower right')
plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Baseline: logistic regression fitted on the same PCA-transformed features
log_model = LogisticRegression(max_iter=1000).fit(X_train_pca, y_train)

# Column 1 of predict_proba is taken as the score for each validation row
y_val_prob_log = log_model.predict_proba(X_val_pca)[:, 1]

# Validation AUC of the baseline
log_auc = roc_auc_score(y_val, y_val_prob_log)
print(f'Logistic Regression AUC: {log_auc:.2f}')

# ROC curve of the baseline, drawn on the current axes
fpr_log, tpr_log, _ = roc_curve(y_val, y_val_prob_log)
lr_ax = plt.gca()
lr_ax.plot(fpr_log, tpr_log, color='green', label=f'Logistic Regression ROC (AUC = {log_auc:.2f})')
lr_ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve - Logistic Regression',
)
lr_ax.legend(loc='lower right')
plt.show()
