In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE 
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
import os


# Location of the credit-card default dataset. Set the CREDIT_CARD_DATA_PATH
# environment variable to point at a different copy; when it is unset the
# original hard-coded location is used, so existing runs behave as before.
DATA_PATH = os.environ.get(
    "CREDIT_CARD_DATA_PATH",
    "/home/jui/thesis-code/data/credit_card_clients.xls",
)
df = pd.read_excel(DATA_PATH)

0    23364
1     6636
Name: default payment next month, dtype: int64

In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb
from packaging import version
import sklearn
from sklearn.metrics import accuracy_score, classification_report

def split_label(dataset, target_feature):
    """Separate a frame into its feature columns and its target column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains the target column alongside the features.
    target_feature : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(features, labels)``: ``features`` is ``dataset`` without the target
        column; ``labels`` is a one-column DataFrame (not a Series) holding
        only the target column. ``dataset`` itself is not modified.
    """
    feature_frame = dataset.drop(columns=[target_feature])
    label_frame = dataset[[target_feature]]
    return feature_frame, label_frame


# OneHotEncoder keyword for dense output depends on the installed scikit-learn:
# releases before 1.2 get `sparse`, 1.2 and later get `sparse_output`.
ohe_params = (
    {"sparse": False}
    if version.parse(sklearn.__version__) < version.parse('1.2')
    else {"sparse_output": False}
)

def create_classification_pipeline(X):
    """Build the preprocessing + XGBoost classification pipeline for ``X``.

    Column roles are inferred from the dtypes of ``X``:

    * every numeric column is median-imputed and standardised;
    * every ``object`` column is imputed with the constant ``'?'`` and
      one-hot encoded (unknown categories are ignored at transform time).

    Columns of any other dtype are routed to neither branch.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame (without the target) whose dtypes decide the routing.

    Returns
    -------
    pipeline : sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'preprocessor'`` and ``'model'``.
    feat_pipe : sklearn.compose.ColumnTransformer
        The very same object that is registered as the ``'preprocessor'``
        step, returned for direct access.
    """
    # Select *all* numeric dtypes rather than only int64: float (or int32)
    # columns would otherwise be left out of the ColumnTransformer and never
    # reach the model. This matches the int64/float64 handling used elsewhere
    # in the notebook when feature names are rebuilt manually.
    pipe_cfg = {
        'num_cols': X.select_dtypes(include='number').columns.tolist(),
        'cat_cols': X.select_dtypes(include=['object']).columns.tolist(),
    }

    num_pipe = Pipeline([
        ('num_imputer', SimpleImputer(strategy='median')),
        ('num_scaler', StandardScaler())
    ])
    cat_pipe = Pipeline([
        ('cat_imputer', SimpleImputer(strategy='constant', fill_value='?')),
        # ohe_params carries the version-appropriate "dense output" keyword.
        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', **ohe_params))
    ])
    feat_pipe = ColumnTransformer([
        ('num_pipe', num_pipe, pipe_cfg['num_cols']),
        ('cat_pipe', cat_pipe, pipe_cfg['cat_cols'])
    ])

    xgb_model = xgb.XGBClassifier(
        tree_method='hist',  # Fast histogram-based training
        random_state=10,
        scale_pos_weight=1.5,
        n_jobs=-1,  # Use all CPU cores
        learning_rate=0.16,  # Controls step size
        reg_lambda=1.1,  # L2 regularization (weight decay)
        eval_metric='logloss',
        objective='binary:logistic',  # Logarithmic loss for classification
        # Intended to silence the label-encoder warning; NOTE(review): newer
        # XGBoost releases may ignore or warn about this keyword — confirm
        # against the installed version.
        use_label_encoder=False,
        n_estimators=125
    )

    pipeline = Pipeline(steps=[('preprocessor', feat_pipe),
                               ('model', xgb_model)])

    return pipeline, feat_pipe

In [21]:
target_feature = 'default payment next month'
categorical_features = []

# Specify how many class 1 samples you want
desired_class_1_count = 13000

# Augment the data with synthetic samples for class 1.
# NOTE(review): augment_data_with_synthetic, class_0_data and class_1_data are
# not defined in this cell; they must already exist in the kernel for this
# cell to run on a fresh restart.
# NOTE(review): augmentation happens before the train/test split below, so
# synthetic rows can land in the test split — confirm this is intended.
augmented_df = augment_data_with_synthetic(class_0_data, class_1_data, desired_class_1_count)

# Split data into features and target
X, y = split_label(augmented_df, target_feature)

# Split data into train and test sets
X_train_og, X_test_og, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40)

# split_label returns the target as a one-column DataFrame; reduce to a Series
y_train = y_train.squeeze()

# Randomly undersample the training split only (the test split is untouched)
rus = RandomUnderSampler(sampling_strategy=0.35, random_state=40)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_og, y_train)
print(Counter(y_train_resampled))

features = X_train_og.columns.tolist()  # Extract feature names
classes = np.unique(y_train).tolist()  # Extract class labels

# Create the classification pipeline
pipeline, feat_pipe = create_classification_pipeline(X_train_og)

# Fit the preprocessor on the resampled training data only
preprocessor = pipeline.named_steps['preprocessor']
preprocessor.fit(X_train_resampled)

# Transform each split once and reuse the results for fitting, the eval set
# and prediction (previously every transform was recomputed three times).
X_train_transformed = preprocessor.transform(X_train_resampled)
X_test_transformed = preprocessor.transform(X_test_og)

# Extract transformed feature names safely
if hasattr(preprocessor, "get_feature_names_out"):
    feature_names = preprocessor.get_feature_names_out()
else:
    # Manually construct feature names (for older sklearn versions)
    num_cols = X_train_resampled.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_cols = X_train_resampled.select_dtypes(include=['object']).columns.tolist()
    feature_names = num_cols + cat_cols  # Not perfect, but works if get_feature_names_out() is missing

# Convert the labels to 1-D NumPy arrays. y_test is still a one-column
# DataFrame at this point, so ravel() is needed to avoid an (n, 1) array.
y_train = y_train.to_numpy()
y_test = y_test.to_numpy().ravel()

# Fit the XGBClassifier step directly so that eval_set can be supplied with
# already-transformed data.
xgb_model = pipeline.named_steps['model']
xgb_model.fit(
    X_train_transformed, y_train_resampled,
    eval_set=[(X_train_transformed, y_train_resampled),
              (X_test_transformed, y_test)],
    verbose=False,
)

# Expose the full pipeline as `model`. Both of its steps were fitted in place
# above, so it accepts *raw* feature frames and applies the same preprocessing
# the classifier was trained with. Binding `model` to the bare classifier
# would make any consumer that passes untransformed data (e.g. RAIInsights
# with the *_og frames) score unscaled inputs.
model = pipeline

# Make predictions
y_pred = xgb_model.predict(X_test_transformed)

# Compute accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.4f}")
print(f"Classification Report:\n{class_report}\n")

# Extract evaluation results
evals_result = xgb_model.evals_result()

# Final log loss per eval_set entry: validation_0 = train, validation_1 = test
train_log_loss = evals_result['validation_0']['logloss'][-1]
test_log_loss = evals_result['validation_1']['logloss'][-1]

# Print the final log loss values for both training and test data
print(f"Final Training Log Loss: {train_log_loss:.4f}")
print(f"Final Test Log Loss: {test_log_loss:.4f}")

# Plot training and test log loss per boosting round
plt.figure(figsize=(10, 5))
plt.plot(evals_result['validation_0']['logloss'], label='Train Log Loss', color='blue')
plt.plot(evals_result['validation_1']['logloss'], label='Test Log Loss', color='red')
plt.xlabel("Epochs")
plt.ylabel("Log Loss")
plt.title("XGBoost Training Progress (Log Loss)")
plt.legend()
plt.show()

Generating 6364 synthetic samples for class 1.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 105s/step




0 [D loss: 0.6745] [G loss: 0.6424]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━

ValueError: Length of values (6364) does not match length of index (24)

In [4]:
from raiwidgets import ResponsibleAIDashboard
from responsibleai import RAIInsights

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
2025-03-03 12:08:18.728185: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741032498.741986  113457 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741032498.746299  113457 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-03 12:08:18.760509: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
from responsibleai.feature_metadata import FeatureMetadata
import seaborn as sns
import matplotlib.pyplot as plt

# Set up feature metadata for RAIInsights
feature_metadata = FeatureMetadata(categorical_features=categorical_features, dropped_features=[])

# Add the target feature back to the datasets.
# np.ravel guarantees 1-D labels whether they arrive as (n,) or (n, 1) arrays.
X_train_og_with_target = X_train_og.copy()
X_train_og_with_target[target_feature] = np.ravel(y_train)

X_test_og_with_target = X_test_og.copy()
X_test_og_with_target[target_feature] = np.ravel(y_test)

#X_test_og_with_target = X_test_og_with_target.sample(n=5000, random_state=10)

# Specify the stratification variable
stratify_by = 'SEX'

# Number of rows to keep in the stratified sample
sample_size = 5000

# NOTE(review): the sample below is drawn from the *training* frame, while
# X_test_og_with_target is built but not used here — confirm which frame the
# downstream analysis is meant to evaluate on.
# random_state is fixed so the sample is reproducible across kernel restarts.
train, test = train_test_split(
    X_train_og_with_target,
    test_size=0.3,
    stratify=X_train_og_with_target[stratify_by],
    random_state=42,
)
sample_df, _ = train_test_split(
    train,
    # An integer train_size is an absolute row count; the previous float
    # fraction (5000/len(train)) could floor to 4999 through rounding.
    train_size=sample_size,
    stratify=train[stratify_by],  # Maintain distribution
    random_state=42
)

# Check the distribution of the stratification variable in the sample
print("Train dataset:\n", sample_df[stratify_by].value_counts())

Train dataset:
 2    3013
1    1987
Name: SEX, dtype: int64


In [15]:
# Build the RAIInsights object from the fitted model and the prepared frames.
# Positional arguments, in order: model, training frame, sampled frame,
# target column name, task type.
rai_insights = RAIInsights(
    model,
    X_train_og_with_target,
    sample_df,
    target_feature,
    'classification',
    feature_metadata=feature_metadata,
)

In [16]:
from raiwidgets import ErrorAnalysisDashboard
# Collapse y_test to one dimension (assumes y_test is a NumPy array at this
# point — TODO confirm; the result is not used again in this cell).
y_test = y_test.flatten()
# Interpretability: register the explainer component on rai_insights.
rai_insights.explainer.add()
# Error Analysis: register the error-analysis component on rai_insights.
rai_insights.error_analysis.add()
# Counterfactuals (currently disabled): the call accepts the total number of
# counterfactuals to generate, the label they should have, and a list of
# strings naming the categorical features.
#rai_insights.counterfactual.add(total_CFs=10, desired_class='opposite')

In [17]:
# Compute: run all tasks registered on rai_insights
# (per the original note, this step remains CPU-bound).
rai_insights.compute()

Causal Effects
Current Status: Generating Causal Effects.
Current Status: Finished generating causal effects.
Time taken: 0.0 min 2.0116101950407028e-05 sec
Counterfactual
Time taken: 0.0 min 6.076996214687824e-06 sec
Error Analysis
Current Status: Generating error analysis reports.
Current Status: Finished generating error analysis reports.
Time taken: 0.0 min 0.3481035960139707 sec
Explanations
Current Status: Explaining 24 features
Current Status: Explained 24 features.
Time taken: 0.0 min 0.804348086938262 sec


In [18]:
# Launch the Responsible AI dashboard for rai_insights; as the cell's last
# expression, the returned dashboard object is also shown as the cell output.
ResponsibleAIDashboard(rai_insights)

ResponsibleAI started at http://localhost:8707


<raiwidgets.responsibleai_dashboard.ResponsibleAIDashboard at 0x7adb9c825de0>