In [51]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from raimitigations.dataprocessing import Rebalance  

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier


df = pd.read_csv("/home/jui/thesis-code/data/bank-full.csv",sep = ';')
print(df.dtypes)
num_features = [
    'age',        # Client's age (integer)
    'balance',    # Account balance (integer)
    'day',        # Day of month (1-31)
    'duration',   # Last contact duration in seconds
    'campaign',   # Number of contacts during this campaign
    'pdays',      # Number of days since last contact (999=not contacted)
    #'previous'
]

cat_features = [
    'job',        # Type of job (e.g., "admin", "technician")
    'marital',    # Marital status ("married", "single", "divorced")
    'education',  # Education level
    'housing',    # Has housing loan? ("yes", "no")
    'loan',       # Has personal loan? ("yes", "no")
    'month',      # Last contact month (e.g., "may", "jun")
    'poutcome',
    #'contact',
    #'default'  
]

target_feature = 'y'

print(df['y'].value_counts())

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object
no     39922
yes     5289
Name: y, dtype: int64


In [52]:
def split_label(df):
    y = df["y"].map({"no":0, "yes":1})
    X = df.drop("y", axis=1)
    return X, y

preprocessor = ColumnTransformer([("numerical", "passthrough", num_features), 
                                ("categorical", OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
                                cat_features)])

rebalance = Rebalance(
				df=df,
				rebalance_col='y',
				k_neighbors=7,
				verbose=False
			)
df = rebalance.fit_resample()
print(df['y'].value_counts())

df = df.drop(columns=['default'])
df = df.drop(columns=['contact'])
df = df.drop(columns=['previous'])

X, y = split_label(df)

# Split data into train and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=40)
print(df.head())

`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


no     39922
yes    39922
Name: y, dtype: int64
   age           job  marital  education  balance housing loan  day month  \
0   58    management  married   tertiary     2143     yes   no    5   may   
1   44    technician   single  secondary       29     yes   no    5   may   
2   33  entrepreneur  married  secondary        2     yes  yes    5   may   
3   47   blue-collar  married    unknown     1506     yes   no    5   may   
4   33       unknown   single    unknown        1      no   no    5   may   

   duration  campaign  pdays poutcome   y  
0       261         1     -1  unknown  no  
1       151         1     -1  unknown  no  
2        76         1     -1  unknown  no  
3        92         1     -1  unknown  no  
4       198         1     -1  unknown  no  


In [50]:
# XGBoost
xgb_model = Pipeline([("preprocessor", preprocessor), 
                    ("model", XGBClassifier(scale_pos_weight=(1 - y.mean()), n_jobs=-1))])


gs = GridSearchCV(xgb_model, {"model__max_depth": [5, 10],
                              "model__min_child_weight": [5, 10],
                              "model__n_estimators": [100]},
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_train, y_train)

print(gs.best_params_)
print(gs.best_score_)
xgb_model.set_params(**gs.best_params_)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)


# Compute accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.4f}")
print(f"Classification Report:\n{class_report}\n")


ValueError: 
All the 20 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/home/jui/.pyenv/versions/3.10.12/envs/myenv/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3802, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'previous'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/jui/.pyenv/versions/3.10.12/envs/myenv/lib/python3.10/site-packages/sklearn/utils/__init__.py", line 447, in _get_column_indices
    col_idx = all_columns.get_loc(col)
  File "/home/jui/.pyenv/versions/3.10.12/envs/myenv/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3804, in get_loc
    raise KeyError(key) from err
KeyError: 'previous'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/jui/.pyenv/versions/3.10.12/envs/myenv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jui/.pyenv/versions/3.10.12/envs/myenv/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/jui/.pyenv/versions/3.10.12/envs/myenv/lib/python3.10/site-packages/sklearn/pipeline.py", line 423, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/jui/.pyenv/versions/3.10.12/envs/myenv/lib/python3.10/site-packages/sklearn/pipeline.py", line 377, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/jui/.pyenv/versions/3.10.12/envs/myenv/lib/python3.10/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "/home/jui/.pyenv/versions/3.10.12/envs/myenv/lib/python3.10/site-packages/sklearn/pipeline.py", line 957, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/home/jui/.pyenv/versions/3.10.12/envs/myenv/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 157, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/home/jui/.pyenv/versions/3.10.12/envs/myenv/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/jui/.pyenv/versions/3.10.12/envs/myenv/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 751, in fit_transform
    self._validate_column_callables(X)
  File "/home/jui/.pyenv/versions/3.10.12/envs/myenv/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 459, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
  File "/home/jui/.pyenv/versions/3.10.12/envs/myenv/lib/python3.10/site-packages/sklearn/utils/__init__.py", line 455, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe


In [46]:
from raiwidgets import ResponsibleAIDashboard
from responsibleai import RAIInsights
from responsibleai.feature_metadata import FeatureMetadata
import seaborn as sns
from raiwidgets import ErrorAnalysisDashboard
import matplotlib.pyplot as plt

# Set up feature metadata for RAIInsights
feature_metadata = FeatureMetadata(categorical_features=cat_features, dropped_features=[])

# Add the target feature back to the datasets
X_train_og_with_target = X_train.copy()
X_train_og_with_target[target_feature] = y_train

X_test_og_with_target = X_test.copy()
X_test_og_with_target[target_feature] = y_test

sample_df = X_test_og_with_target.sample(n=5000, random_state=10)

# Now, pass these modified DataFrames to RAIInsights
rai_insights = RAIInsights(xgb_model, X_train_og_with_target, sample_df, target_feature, 'classification', feature_metadata=feature_metadata)


# Interpretability
rai_insights.explainer.add()
# Error Analysis
rai_insights.error_analysis.add()

# Compute: Perform all tasks (this remains CPU-bound)
rai_insights.compute()
ResponsibleAIDashboard(rai_insights)

Causal Effects
Current Status: Generating Causal Effects.
Current Status: Finished generating causal effects.
Time taken: 0.0 min 2.1799001842737198e-05 sec
Counterfactual
Time taken: 0.0 min 7.843831554055214e-06 sec
Error Analysis
Current Status: Generating error analysis reports.
Current Status: Finished generating error analysis reports.
Time taken: 0.0 min 0.2692894679494202 sec
Explanations
Current Status: Explaining 16 features


categorical_feature keyword has been found in `params` and will be ignored.
Please use categorical_feature argument of the Dataset constructor to pass this parameter.


Current Status: Explained 16 features.
Time taken: 0.0 min 0.9708413798362017 sec
ResponsibleAI started at http://localhost:8708


<raiwidgets.responsibleai_dashboard.ResponsibleAIDashboard at 0x751e886c74f0>