In [1]:
import pandas as pd
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import HillClimbSearch, BayesianEstimator
from pgmpy.inference import VariableElimination

#### Dataset
1.	Source: https://www.kaggle.com/datasets/uciml/default-of-credit-card-clients-dataset 
2.	Description
This dataset contains information on default payments, demographic factors, credit data, history of payment, and bill statements of credit card clients in Taiwan from April 2005 to September 2005.
3.	Instances: 30,000 rows
4.  Dataset in excel format 'default of credit card clients.xls' is also included in the zip file.

#### Data Dictionary
This research employed a binary variable, default payment (Yes = 1, No = 0), as the response variable. This study reviewed the literature and used the following 23 variables as explanatory variables:
- X1: Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit.
- X2: Gender (1 = male; 2 = female).
- X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others).
- X4: Marital status (1 = married; 2 = single; 3 = others).
- X5: Age (year).
- X6 - X11: History of past payment. 
    - We tracked the past monthly payment records (from April to September, 2005) as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; . . .;X11 = the repayment status in April, 2005. 
    - The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.
    - (To be confirmed) -2: no paying balance in the first place; 0: pay part of the due balance (might be the minimum amount)
- X12-X17: Amount of bill statement (NT dollar). 
    - X12 = amount of bill statement in September, 2005; X13 = amount of bill statement in August, 2005; . . .; X17 = amount of bill statement in April, 2005. 
- X18-X23: Amount of previous payment (NT dollar). 
    - X18 = amount paid in September, 2005; X19 = amount paid in August, 2005; . . .;X23 = amount paid in April, 2005.

#### Area to tune
- The way to cut the continuous variable
    - how many bins
    - how to cut (interval or quantile)

#### Import Data

In [2]:
df = pd.read_excel('default of credit card clients.xls', index_col=0, header=1)
df.head()

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 1 to 30000
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   LIMIT_BAL                   30000 non-null  int64
 1   SEX                         30000 non-null  int64
 2   EDUCATION                   30000 non-null  int64
 3   MARRIAGE                    30000 non-null  int64
 4   AGE                         30000 non-null  int64
 5   PAY_0                       30000 non-null  int64
 6   PAY_2                       30000 non-null  int64
 7   PAY_3                       30000 non-null  int64
 8   PAY_4                       30000 non-null  int64
 9   PAY_5                       30000 non-null  int64
 10  PAY_6                       30000 non-null  int64
 11  BILL_AMT1                   30000 non-null  int64
 12  BILL_AMT2                   30000 non-null  int64
 13  BILL_AMT3                   30000 non-null  int64
 14  BILL_AMT4  

In [4]:
for col in df.columns:
    print(f'Unique instance in {col:<27}: {len(df[col].unique())}')

Unique instance in LIMIT_BAL                  : 81
Unique instance in SEX                        : 2
Unique instance in EDUCATION                  : 7
Unique instance in MARRIAGE                   : 4
Unique instance in AGE                        : 56
Unique instance in PAY_0                      : 11
Unique instance in PAY_2                      : 11
Unique instance in PAY_3                      : 11
Unique instance in PAY_4                      : 11
Unique instance in PAY_5                      : 10
Unique instance in PAY_6                      : 10
Unique instance in BILL_AMT1                  : 22723
Unique instance in BILL_AMT2                  : 22346
Unique instance in BILL_AMT3                  : 22026
Unique instance in BILL_AMT4                  : 21548
Unique instance in BILL_AMT5                  : 21010
Unique instance in BILL_AMT6                  : 20604
Unique instance in PAY_AMT1                   : 7943
Unique instance in PAY_AMT2                   : 7899
Unique insta

#### Data Cleaning

##### Discretize continuous variable

In [5]:
# AGE based on common standard in demographic statistcs
df['AGE'] = pd.cut(df['AGE'], bins=[21, 35, 51, 65, max(df['AGE'])],include_lowest=True, labels=list(range(1, 5))) # 1 youngest, 4 oldest

# Rest of the continuous column using quantile cut
bins = 10 
for col in df.columns[11:-1].to_list()+['LIMIT_BAL']: # columns for bill amount and pay amount
    df[col] = pd.qcut(df[col], q=bins, duplicates='drop')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 1 to 30000
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   LIMIT_BAL                   30000 non-null  category
 1   SEX                         30000 non-null  int64   
 2   EDUCATION                   30000 non-null  int64   
 3   MARRIAGE                    30000 non-null  int64   
 4   AGE                         30000 non-null  category
 5   PAY_0                       30000 non-null  int64   
 6   PAY_2                       30000 non-null  int64   
 7   PAY_3                       30000 non-null  int64   
 8   PAY_4                       30000 non-null  int64   
 9   PAY_5                       30000 non-null  int64   
 10  PAY_6                       30000 non-null  int64   
 11  BILL_AMT1                   30000 non-null  category
 12  BILL_AMT2                   30000 non-null  category
 13  BILL_AMT3            

In [7]:
for col in df.select_dtypes('category').columns:
    cat_num = len(df[col].cat.categories)
    df[col] = df[col].cat.rename_categories(list(range(1, cat_num+1)))
    print(f'{col}: {cat_num}')

LIMIT_BAL: 10
AGE: 4
BILL_AMT1: 10
BILL_AMT2: 10
BILL_AMT3: 10
BILL_AMT4: 10
BILL_AMT5: 10
BILL_AMT6: 10
PAY_AMT1: 9
PAY_AMT2: 9
PAY_AMT3: 9
PAY_AMT4: 8
PAY_AMT5: 8
PAY_AMT6: 8


In [8]:
df.head()

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,2,2,1,1,2,2,-1,-1,-2,...,1,1,1,1,2,1,1,1,1,1
2,5,2,2,2,1,-1,2,0,0,0,...,3,3,4,1,2,3,2,1,4,1
3,4,2,2,2,1,0,0,0,0,0,...,5,5,5,3,3,3,2,2,6,0
4,2,2,2,1,2,0,0,0,0,0,...,6,7,7,4,5,3,3,3,2,0
5,2,1,2,1,3,-1,0,-1,0,0,...,6,6,6,4,9,8,7,2,2,0


##### Split Data into Training set and Tesing set

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'default payment next month'], df['default payment next month'], 
                                                    test_size=0.3, train_size=0.7, random_state=42, stratify = df['default payment next month'])

In [10]:
df_train = pd.merge(X_train, y_train, left_index=True, right_index=True)
df_train

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11019,6,2,2,2,1,0,0,0,0,0,...,10,10,10,7,7,8,7,7,7,0
1711,4,2,1,2,1,0,0,0,0,2,...,8,8,8,7,8,8,1,5,4,1
4619,4,1,1,2,1,0,0,0,0,0,...,7,8,8,6,5,4,4,4,4,0
5483,1,2,2,1,1,0,0,-1,-1,-2,...,1,1,1,2,2,1,1,1,1,0
26188,2,2,1,2,1,-2,-2,-2,-2,-2,...,1,1,1,4,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25781,7,2,2,1,1,0,0,-2,-2,-2,...,1,1,1,1,1,1,1,1,1,0
13922,5,2,2,2,1,0,0,0,0,0,...,9,10,10,6,7,7,8,6,1,0
3795,5,2,1,2,1,0,0,0,0,0,...,8,7,6,6,5,5,4,3,2,0
27566,9,1,1,1,3,1,-2,-1,-1,-1,...,2,1,1,1,2,2,1,1,1,0


#### Learn the Bayesian Network Structure

In [11]:
hc = HillClimbSearch(df_train)
best_model = hc.estimate(scoring_method='bic-d')

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'LIMIT_BAL': 'O', 'SEX': 'N', 'EDUCATION': 'N', 'MARRIAGE': 'N', 'AGE': 'O', 'PAY_0': 'N', 'PAY_2': 'N', 'PAY_3': 'N', 'PAY_4': 'N', 'PAY_5': 'N', 'PAY_6': 'N', 'BILL_AMT1': 'O', 'BILL_AMT2': 'O', 'BILL_AMT3': 'O', 'BILL_AMT4': 'O', 'BILL_AMT5': 'O', 'BILL_AMT6': 'O', 'PAY_AMT1': 'O', 'PAY_AMT2': 'O', 'PAY_AMT3': 'O', 'PAY_AMT4': 'O', 'PAY_AMT5': 'O', 'PAY_AMT6': 'O', 'default payment next month': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'LIMIT_BAL': 'O', 'SEX': 'N', 'EDUCATION': 'N', 'MARRIAGE': 'N', 'AGE': 'O', 'PAY_0': 'N', 'PAY_2': 'N', 'PAY_3': 'N', 'PAY_4': 'N', 'PAY_5': 'N', 'PAY_6': 'N', 'BILL_AMT1': 'O', 'BILL_AMT2': 'O', 'BILL_AMT3': 'O', 'BILL_AMT4': 'O', 'BILL_AMT5': 'O', 'BILL_AMT6': 'O', 'PAY_AMT1': 'O', 'PAY_AMT2': 'O', 'PAY_AMT3': 'O', 'PAY_AMT4': 'O', 'PAY_AMT5': 'O', 'PAY_AMT6': 'O', 'default pa

  0%|          | 0/1000000 [00:00<?, ?it/s]

In [12]:
best_model.edges()

OutEdgeView([('LIMIT_BAL', 'EDUCATION'), ('LIMIT_BAL', 'SEX'), ('SEX', 'MARRIAGE'), ('EDUCATION', 'AGE'), ('AGE', 'MARRIAGE'), ('PAY_0', 'default payment next month'), ('PAY_2', 'PAY_0'), ('PAY_2', 'PAY_3'), ('PAY_2', 'PAY_AMT1'), ('PAY_3', 'PAY_AMT2'), ('PAY_4', 'PAY_2'), ('PAY_4', 'PAY_AMT3'), ('PAY_5', 'PAY_4'), ('PAY_5', 'PAY_6'), ('PAY_5', 'PAY_AMT4'), ('PAY_6', 'PAY_AMT5'), ('BILL_AMT1', 'BILL_AMT2'), ('BILL_AMT1', 'LIMIT_BAL'), ('BILL_AMT2', 'BILL_AMT3'), ('BILL_AMT2', 'PAY_AMT1'), ('BILL_AMT3', 'BILL_AMT4'), ('BILL_AMT3', 'PAY_AMT2'), ('BILL_AMT4', 'BILL_AMT5'), ('BILL_AMT4', 'PAY_AMT3'), ('BILL_AMT4', 'PAY_5'), ('BILL_AMT5', 'BILL_AMT6'), ('BILL_AMT5', 'PAY_AMT4'), ('BILL_AMT6', 'PAY_AMT5'), ('BILL_AMT6', 'PAY_AMT6'), ('PAY_AMT5', 'PAY_AMT6'), ('default payment next month', 'LIMIT_BAL')])

#### Create Bayesian Model and Learn CPDs

In [13]:
model = DiscreteBayesianNetwork(best_model.edges())
model.fit(df_train, estimator=BayesianEstimator, prior_type='BDeu')

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'LIMIT_BAL': 'O', 'SEX': 'N', 'EDUCATION': 'N', 'MARRIAGE': 'N', 'AGE': 'O', 'PAY_0': 'N', 'PAY_2': 'N', 'PAY_3': 'N', 'PAY_4': 'N', 'PAY_5': 'N', 'PAY_6': 'N', 'BILL_AMT1': 'O', 'BILL_AMT2': 'O', 'BILL_AMT3': 'O', 'BILL_AMT4': 'O', 'BILL_AMT5': 'O', 'BILL_AMT6': 'O', 'PAY_AMT1': 'O', 'PAY_AMT2': 'O', 'PAY_AMT3': 'O', 'PAY_AMT4': 'O', 'PAY_AMT5': 'O', 'PAY_AMT6': 'O', 'default payment next month': 'N'}


<pgmpy.models.DiscreteBayesianNetwork.DiscreteBayesianNetwork at 0x1f666be4cd0>

In [14]:
from pgmpy.estimators import ParameterEstimator
pe = ParameterEstimator(model, df_train)
# print(pe.state_counts('LIMIT_BAL'))
print(pe.state_counts('default payment next month'))

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'LIMIT_BAL': 'O', 'SEX': 'N', 'EDUCATION': 'N', 'MARRIAGE': 'N', 'AGE': 'O', 'PAY_0': 'N', 'PAY_2': 'N', 'PAY_3': 'N', 'PAY_4': 'N', 'PAY_5': 'N', 'PAY_6': 'N', 'BILL_AMT1': 'O', 'BILL_AMT2': 'O', 'BILL_AMT3': 'O', 'BILL_AMT4': 'O', 'BILL_AMT5': 'O', 'BILL_AMT6': 'O', 'PAY_AMT1': 'O', 'PAY_AMT2': 'O', 'PAY_AMT3': 'O', 'PAY_AMT4': 'O', 'PAY_AMT5': 'O', 'PAY_AMT6': 'O', 'default payment next month': 'N'}


PAY_0                         -2    -1     0     1     2    3   4  5  6  7  8
default payment next month                                                   
0                           1689  3297  8985  1721   576   50  16  8  4  2  7
1                            244   674  1300   895  1311  163  35  5  6  6  6


In [15]:
# Look into CPTs
model.get_cpds()[5].to_dataframe()

PAY_0,-2,-1,0,1,2,3,4,5,6,7,8
PAY_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
-2,0.675616,1.6e-05,1.6e-05,0.324244,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05
-1,0.032257,0.76264,0.078627,0.104518,0.0219,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05
0,4e-06,0.038509,0.905838,0.000277,0.05535,4e-06,4e-06,4e-06,4e-06,4e-06,4e-06
1,0.001623,0.001623,0.001623,0.983766,0.001623,0.001623,0.001623,0.001623,0.001623,0.001623,0.001623
2,0.001468,0.09771,1.5e-05,0.426025,0.409319,0.065387,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05
3,0.000184,0.147207,0.000184,0.334327,0.214036,0.124931,0.178394,0.000184,0.000184,0.000184,0.000184
4,0.000604,0.044428,0.000604,0.395026,0.161294,0.073645,0.132078,0.190511,0.000604,0.000604,0.000604
5,0.00202,0.148687,0.00202,0.246465,0.00202,0.00202,0.099798,0.00202,0.490909,0.00202,0.00202
6,0.003953,0.003953,0.003953,0.195257,0.003953,0.003953,0.003953,0.003953,0.003953,0.76917,0.003953
7,0.002859,0.002859,0.002859,0.072041,0.002859,0.002859,0.002859,0.002859,0.002859,0.002859,0.90223


#### Perform Inference

In [16]:
inference = VariableElimination(model)

test_data = {'LIMIT_BAL': 2,
 'SEX': 2,
 'EDUCATION': 2,
 'MARRIAGE': 2,
 'AGE': 2,
 'PAY_0': 2,
 'PAY_2': 2,
 'PAY_3': 2,
 'PAY_4': 2,
 'PAY_5': 2,
 'PAY_6': 2,
 'BILL_AMT1': 2,
 'BILL_AMT2': 2,
 'BILL_AMT3': 2,
 'BILL_AMT4': 2,
 'BILL_AMT5': 2,
 'BILL_AMT6': 2,
 'PAY_AMT1': 2,
 'PAY_AMT2': 2,
 'PAY_AMT3': 2,
 'PAY_AMT4': 2,
 'PAY_AMT5': 2,
 'PAY_AMT6': 2}

test_data_1 = {'SEX': 1, 'EDUCATION': 3, 'BILL_AMT1': 1, 'PAY_AMT1': 2}
test_data_2 = {'EDUCATION': 4, 'PAY_2': 6, 'BILL_AMT1': 9, 'PAY_AMT1': 1}

prediction = inference.map_query(variables=['default payment next month'], evidence=test_data)
prediction_1 = inference.map_query(variables=['default payment next month'], evidence=test_data_1)
prediction_2 = inference.map_query(variables=['default payment next month'], evidence=test_data_2)

print("Predicted Credit Score:", prediction['default payment next month'])
print("Predicted Credit Score for Example 1:", prediction_1['default payment next month'])
print("Predicted Credit Score for Example 2:", prediction_2['default payment next month'])

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Predicted Credit Score: 1
Predicted Credit Score for Example 1: 0
Predicted Credit Score for Example 2: 1


In [17]:
X_predict = model.predict(X_test)
y_predict = X_predict['default payment next month']
y_predict

  data_unique_indexes = data.groupby(list(data.columns), dropna=False).apply(


  0%|          | 0/8821 [00:00<?, ?it/s]

2        0
4        0
12       0
15       0
18       0
        ..
29986    0
29989    0
29994    0
29995    1
29997    0
Name: default payment next month, Length: 9000, dtype: int32

In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, classification_report
print(f'Accuracy: {accuracy_score(y_test, y_predict)}\n')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_predict)}\n')
print(f'Classification report: \n{classification_report(y_test, y_predict)}\n')
print(f'F1 Score: \n{f1_score(y_test, y_predict)}\n')
print(f'ROC-AUC Score: {roc_auc_score(y_test, y_predict)}')

Accuracy: 0.7165555555555555

Confusion Matrix: 
[[6254  755]
 [1796  195]]

Classification report: 
              precision    recall  f1-score   support

           0       0.78      0.89      0.83      7009
           1       0.21      0.10      0.13      1991

    accuracy                           0.72      9000
   macro avg       0.49      0.50      0.48      9000
weighted avg       0.65      0.72      0.68      9000


F1 Score: 
0.13260795647738866

ROC-AUC Score: 0.49511104292328756
