# Generic Rule Induction Notebook

Continuoulsy refined.

## README

- BRCG runs with proper (conda) aix360 environment
- Use aix360i environment for RIPPER

### Configuration


In [11]:
from config import config_dict
from config_copy import config_dict_imbalanced
from config_copy import Config_list

# document config order
#CONFIG = config_dict_imbalanced["CONFIG-I2"]
#CONFIG = config_dict["CONFIG14"]

#print('Proceed with configuration:', CONFIG["NAME"])

In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
# import os
from sklearn.model_selection import train_test_split #, GridSearchCV
from sklearn.metrics import matthews_corrcoef,fbeta_score,confusion_matrix,f1_score,precision_score, recall_score, accuracy_score, balanced_accuracy_score, confusion_matrix, r2_score, explained_variance_score, mean_absolute_error, max_error
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import wittgenstein as lw
import time
import warnings
import re

from aix360.algorithms.rbm import BRCGExplainer, BooleanRuleCG, GLRMExplainer, LinearRuleRegression

from aix360.algorithms.rbm import FeatureBinarizer

from aix360.algorithms.rbm import FeatureBinarizerFromTrees

from aix360i.algorithms.rule_induction.ripper import Ripper

from corels import *




### Data

In [13]:
from collections import defaultdict
for config in config_dict_imbalanced:
    print(config)

CONFIG-I1
CONFIG-I2
CONFIG-I3
CONFIG-I4
CONFIG-I5
CONFIG-I6
CONFIG-I7
CONFIG-I8


In [14]:

metric_dict = {}
metric_list = []  
for config in config_dict_imbalanced:
    CONFIG = config_dict_imbalanced[config]
    def convert(char):
        if char == CONFIG['POS_CLASS']:
            return 1
        else:
            return 0

    df = pd.read_csv(CONFIG['DATA_SET'],dtype=CONFIG['DATA_TYPES'])
    df = df.drop(columns=CONFIG['DROP'])
    if CONFIG['ALGO'] == 'BRCG':
        df[CONFIG['TARGET_LABEL']] = df[CONFIG['TARGET_LABEL']].map(convert)
        CONFIG['POS_CLASS'] = 1
    # maybe this could also be achieved through explicit binarization of target vector
    df.info()
    df[CONFIG['TARGET_LABEL']].value_counts()

    # train test split

    if CONFIG['TRAIN_TEST_SPLIT'] == 'FIXED':
        if CONFIG['MODE'] == 'PREDICTIVE':
            train = df[df['is_test_set'] == False]
            test = df[df['is_test_set'] == True]
        elif CONFIG['MODE'] == 'DESCRIPTIVE':
            train = df
            test = df

        train = train.drop(columns=['is_test_set'])
        test = test.drop(columns=['is_test_set'])

        y_train = train[CONFIG['TARGET_LABEL']]
        x_train = train.drop(columns=[CONFIG['TARGET_LABEL']])

        y_test = test[CONFIG['TARGET_LABEL']]
        x_test = test.drop(columns=[CONFIG['TARGET_LABEL']])
    else:
        x_train, x_test, y_train, y_test = train_test_split(df.drop(columns=[CONFIG['TARGET_LABEL']]), df[CONFIG['TARGET_LABEL']], test_size=CONFIG['TRAIN_TEST_SPLIT'], random_state=42)

    print('Training:', x_train.shape, y_train.shape)
    print('Test:', x_test.shape, y_test.shape)

    for i in CONFIG["BINARIZER"]:
        
        
        if i == "TREES":
            binarizer =  FeatureBinarizerFromTrees(negations=True, randomState=42) 
            binarizer = binarizer.fit(x_train, y_train)
            x_train_bin = binarizer.transform(x_train) 
            x_test_bin = binarizer.transform(x_test)
            for algo in CONFIG["ALGO"]:
                präds = []
                if algo == 'RIPPER':
                    
                    prec_t_ripper = []
                    rec_t_ripper = []
                    rip_t_rl = []
                    
                    # start time
                    start_time = time.time()
                    estimator = Ripper()
                    estimator.fit(x_train_bin, y_train, pos_value=CONFIG['POS_CLASS'])
                    end_time = time.time()
                    y_pred = estimator.predict(x_test_bin)
                    print('------------------------------------------------------') 
                    print('RIPPER TREES')
                    print('Accuracy:', accuracy_score(y_test, y_pred))
                    #print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))
                    print('Precision:', precision_score(y_test, y_pred))
                    print('Recall:', recall_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    ripper_t_bacc = balanced_accuracy_score(y_test, y_pred, adjusted=True)
                    ripper_t_f2 = fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2)
                    ripper_t_acc = accuracy_score(y_test, y_pred)
                    #print('F1', f1_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    #print('ConfusionMatrix', confusion_matrix(y_test, y_pred))
                    print('F-2', fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2))
                    #print('Mathhews', matthews_corrcoef(y_test, y_pred))
                    print('Rule count: ' + str(sum([len(rules) for rules in estimator.rule_map.values()])))
                    ripper_t_rl = str(sum([len(rules) for rules in estimator.rule_map.values()]))
                    #for i in range(len(estimator.rule_map[0])):
                    #    präds.append(len(estimator.rule_map[0][i]))
                    #print("Sum Prädikate:", sum(präds))
                    #print("MAX Prädikate:", max(präds))
                    #prec_t_ripper.append(precision_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    #rec_t_ripper.append(recall_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    #rip_t_rl.append(str(sum([len(rules) for rules in estimator.rule_map.values()])))
                    
                    print('------------------------------------------------------')
                if algo == 'BRCG':
                    start_time = time.time()
                    estimator = BooleanRuleCG()
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        
                        estimator.fit(x_train_bin, y_train)
                        end_time = time.time()
                        y_pred = estimator.predict(x_test_bin)
                    print('------------------------------------------------------') 
                    print('BRCG TREES')
                    print('Accuracy:', accuracy_score(y_test, y_pred))
                    print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))
                    print('Precision:', precision_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    print('Recall:', recall_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    brcg_t_bacc = balanced_accuracy_score(y_test, y_pred, adjusted=True)
                    brcg_t_f2 = fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2)
                    brcg_t_acc = accuracy_score(y_test, y_pred)
                    print('F1', f1_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    print('ConfusionMatrix', confusion_matrix(y_test, y_pred))
                    print('F-2', fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2))
                    print('Mathhews', matthews_corrcoef(y_test, y_pred))
                    model = estimator.explain()
                    print('Number of rules:', len(model['rules']))
                    brcg_t_rl = len(model['rules']) 
                    print('------------------------------------------------------')

                if algo == 'CORELS':
                    start_time = time.time()
                    estimator = CorelsClassifier(n_iter=10000, 
                        max_card=2, # feautres per statement
                        c = 0.0001 # Higher values penalise longer rulelists
                        )
                    estimator.fit(x_train_bin, y_train , prediction_name = CONFIG["TARGET_LABEL"])
                    end_time = time.time()
                    y_pred = estimator.predict(x_test_bin)
                    print('------------------------------------------------------') 
                    print('CORELS TREES')
                    print('Accuracy:', accuracy_score(y_test, y_pred))
                    print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))
                    print('Precision:', precision_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    print('Recall:', recall_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    corels_t_bacc = balanced_accuracy_score(y_test, y_pred, adjusted=True)
                    corels_t_f2 = fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2)
                    corels_t_acc = accuracy_score(y_test, y_pred)
                    print('F1', f1_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    print('ConfusionMatrix', confusion_matrix(y_test, y_pred))
                    print('F-2', fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2))
                    print('Mathhews', matthews_corrcoef(y_test, y_pred))
                    r_length = len(estimator.rl().rules)
                    corels_t_rl = len(estimator.rl().rules)
                    print("Rule Length:", r_length)
                    print('------------------------------------------------------')

        if i == "QUANTILE":
            binarizer =  FeatureBinarizer(numThresh=9,negations=True, randomState=42) 
            binarizer = binarizer.fit(x_train)
            x_train_bin = binarizer.transform(x_train) 
            x_test_bin = binarizer.transform(x_test)  

            for algo in CONFIG["ALGO"]:
                präds = []

                if algo == 'RIPPER':
                    start_time = time.time()
                    estimator = Ripper()
                    estimator.fit(x_train_bin, y_train, pos_value=CONFIG['POS_CLASS'])
                    end_time = time.time()
                    y_pred = estimator.predict(x_test_bin)
                    print('------------------------------------------------------') 
                    print('RIPPER QUANTILE')
                    print('Accuracy:', accuracy_score(y_test, y_pred))
                    print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))
                    print('Precision:', precision_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    print('Recall:', recall_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    ripper_q_bacc = balanced_accuracy_score(y_test, y_pred, adjusted=True)
                    ripper_q_f2 = fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2)
                    ripper_q_acc = accuracy_score(y_test, y_pred)
                    print('ConfusionMatrix', confusion_matrix(y_test, y_pred))
                    print('F-2', fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2))
                    print('Mathhews', matthews_corrcoef(y_test, y_pred))
                    print('Rule count: ' + str(sum([len(rules) for rules in estimator.rule_map.values()])))
                    ripper_q_rl = str(sum([len(rules) for rules in estimator.rule_map.values()]))
                    #for i in range(len(estimator.rule_map[0])):
                    #    präds.append(len(estimator.rule_map[0][i]))
                    #print("Sum Prädikate:", sum(präds))
                    #print("MAX Prädikate:", max(präds))
                    
                    print('------------------------------------------------------')
                if algo == 'BRCG':
                    start_time = time.time()
                    estimator = BooleanRuleCG()
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        estimator.fit(x_train_bin, y_train)
                        end_time = time.time()
                    y_pred = estimator.predict(x_test_bin)
                    print('------------------------------------------------------')  
                    print('BRCG QUANTILE')
                    print('Accuracy:', accuracy_score(y_test, y_pred))
                    print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))
                    print('Precision:', precision_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    print('Recall:', recall_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    brcg_q_bacc = balanced_accuracy_score(y_test, y_pred, adjusted=True)
                    brcg_q_f2 = fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2)
                    brcg_q_acc = accuracy_score(y_test, y_pred)
                    print('ConfusionMatrix', confusion_matrix(y_test, y_pred))
                    print('F-2', fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2))
                    print('Mathhews', matthews_corrcoef(y_test, y_pred))
                    model = estimator.explain()
                
                    print('Number of rules:', len(model['rules']))
                    brcg_q_rl = len(model['rules'])
                    print('------------------------------------------------------')

                if algo == 'CORELS':
                    start_time = time.time()
                    estimator = CorelsClassifier(n_iter=10000, 
                        max_card=2, # feautres per statement
                        c = 0.0001 # Higher values penalise longer rulelists
                        )
                    estimator.fit(x_train_bin, y_train , prediction_name = CONFIG["TARGET_LABEL"])
                    end_time = time.time()
                    y_pred = estimator.predict(x_test_bin)
                    print('------------------------------------------------------') 
                    print('CORELS QUANTILE')
                    print('Accuracy:', accuracy_score(y_test, y_pred))
                    print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))
                    print('Precision:', precision_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    print('Recall:', recall_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
                    corels_q_bacc = balanced_accuracy_score(y_test, y_pred, adjusted=True)
                    corels_q_f2 = fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2)
                    corels_q_acc = accuracy_score(y_test, y_pred)
                    print('ConfusionMatrix', confusion_matrix(y_test, y_pred))
                    print('F-2', fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2))
                    print('Mathhews', matthews_corrcoef(y_test, y_pred))
                    r_length = len(estimator.rl().rules)
                    corels_q_rl = len(estimator.rl().rules)
                    print("Rule Length:", r_length)
                    print('------------------------------------------------------')   
                

        if i == "NATIVE":
            x_train_bin = x_train
            x_test_bin = x_test

            start_time = time.time()
            estimator = Ripper()
            estimator.fit(x_train_bin, y_train, pos_value=CONFIG['POS_CLASS'])
            end_time = time.time()
            y_pred = estimator.predict(x_test_bin)
            print('------------------------------------------------------') 
            print('RIPPER NATIVE')
            print('Accuracy:', accuracy_score(y_test, y_pred))
            print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))
            #print('Precision:', precision_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
            #print('Recall:', recall_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS']))
            ripper_n_bacc = balanced_accuracy_score(y_test, y_pred, adjusted=True)
            ripper_n_f2 = fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2)
            ripper_n_acc = accuracy_score(y_test, y_pred)
            #print('ConfusionMatrix', confusion_matrix(y_test, y_pred))
            print('F-2', fbeta_score(y_test, y_pred, pos_label=CONFIG['POS_CLASS'], beta= 2))
            print('Mathhews', matthews_corrcoef(y_test, y_pred))
            print('Rule count: ' + str(sum([len(rules) for rules in estimator.rule_map.values()])))
            ripper_n_rl = str(sum([len(rules) for rules in estimator.rule_map.values()]))
            #for i in range(len(estimator.rule_map[0])):
            #    präds.append(len(estimator.rule_map[0][i]))
            #print("Sum Prädikate:", sum(präds))
            #print("MAX Prädikate:", max(präds))

    
    metric_dict.update({config:{"Config":config,"ripper_t_bacc":ripper_t_bacc, "ripper_t_f2": ripper_t_f2,"ripper_t_acc":ripper_t_acc,"ripper_t_rl":ripper_t_rl, 
                                                "brcg_t_bacc": brcg_t_bacc, "brcg_t_f2": brcg_t_f2,"brcg_t_acc":brcg_t_acc,"brcg_t_rl":brcg_t_rl,
                                                "corels_t_bacc":corels_t_bacc, "corels_t_f2": corels_t_f2,"corels_t_acc":corels_t_acc,"corels_t_rl":corels_t_rl,
                                                "ripper_q_bacc":ripper_q_bacc, "ripper_q_f2":ripper_q_f2,"ripper_q_acc":ripper_q_acc,"ripper_q_rl":ripper_q_rl, 
                                                "brcg_q_bacc":brcg_q_bacc,"brcg_q_f2":brcg_q_f2,"brcg_q_acc":brcg_q_acc,"brcg_q_rl":brcg_q_rl,
                                                "corels_q_bacc":corels_q_bacc,"corels_q_f2":corels_q_f2,"corels_q_acc": corels_q_bacc,"corels_q_rl":corels_q_rl,
                                                "ripper_n_bacc": ripper_n_bacc, "ripper_n_f2": ripper_n_f2,"ripper_n_acc": ripper_n_bacc, "ripper_n_rl":ripper_n_rl}})
    metric_list.append(metric_dict[config])
   

        
            

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Status Checking          1000 non-null   object 
 1   Duration in Month        1000 non-null   float64
 2   Credit History           1000 non-null   object 
 3   Purpose                  1000 non-null   object 
 4   Credit Amount            1000 non-null   float64
 5   Savings Account          1000 non-null   object 
 6   Employement since        1000 non-null   object 
 7   Installmentrate %        1000 non-null   float64
 8   StatusSex                1000 non-null   object 
 9   Otherdebtos              1000 non-null   object 
 10  PresentResidence         1000 non-null   float64
 11  Property                 1000 non-null   object 
 12  Age in years             1000 non-null   float64
 13  Otherinstallment Plans   1000 non-null   object 
 14  Housing                  

  _warn_prf(average, modifier, msg_start, len(result))


Initial LP solved
------------------------------------------------------
BRCG TREES
Accuracy: 0.9984082955888721
Balanced accuracy: 0.5
Precision: 0.0
Recall: 0.0
F1 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


ConfusionMatrix [[85307     0]
 [  136     0]]
F-2 0.0
Mathhews 0.0
Number of rules: 0
------------------------------------------------------
RULELIST:
if [feature14 && not feature1]:
  Class = False
else if [feature2 && feature11]:
  Class = False
else 
  Class = True
------------------------------------------------------
CORELS TREES
Accuracy: 0.9992626663389628
Balanced accuracy: 0.8564769786494315
Precision: 0.8016528925619835
Recall: 0.7132352941176471
F1 0.7548638132295721
ConfusionMatrix [[85283    24]
 [   39    97]]
F-2 0.7293233082706767
Mathhews 0.7557881426776812
Rule Length: 3
------------------------------------------------------
------------------------------------------------------
RIPPER QUANTILE
Accuracy: 0.9994733330992591
Balanced accuracy: 0.8749355269790287
Precision: 0.9026548672566371
Recall: 0.75
ConfusionMatrix [[85296    11]
 [   34   102]]
F-2 0.7762557077625569
Mathhews 0.8225423808233961
Rule count: 7
------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Mathhews 0.0
Number of rules: 0
------------------------------------------------------
RULELIST:
Class = False
------------------------------------------------------
CORELS QUANTILE
Accuracy: 0.9984082955888721
Balanced accuracy: 0.5
Precision: 0.0
Recall: 0.0
ConfusionMatrix [[85307     0]
 [  136     0]]
F-2 0.0
Mathhews 0.0
Rule Length: 1
------------------------------------------------------
------------------------------------------------------
RIPPER NATIVE
Accuracy: 0.9994499256814484
Balanced accuracy: 0.8675825858025581
F-2 0.7633587786259544
Mathhews 0.8136343237334008
Rule count: 3
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Amount            30000 non-null  float64
 1   Sex               30000 non-null  object 
 2   Education         30000 non-null  object 
 3   MaritalStatus     30000 non-null  object 
 4   Age 

  _warn_prf(average, modifier, msg_start, len(result))


Initial LP solved
Iteration: 1, Objective: 0.1865
------------------------------------------------------
BRCG TREES
Accuracy: 0.8204444444444444
Balanced accuracy: 0.6359838821892393
Precision: 0.6981566820276498
Recall: 0.30918367346938774
F1 0.4285714285714286
ConfusionMatrix [[6778  262]
 [1354  606]]
F-2 0.347955902618282
Mathhews 0.3802535252391635
Number of rules: 2
------------------------------------------------------
RULELIST:
if [not feature6 && not feature9]:
  DefaultNextMonth = False
else if [feature27 && feature28]:
  DefaultNextMonth = False
else 
  DefaultNextMonth = True
------------------------------------------------------
CORELS TREES
Accuracy: 0.8206666666666667
Balanced accuracy: 0.6361259276437847
Precision: 0.6997690531177829
Recall: 0.30918367346938774
F1 0.4288747346072187
ConfusionMatrix [[6780  260]
 [1354  606]]
F-2 0.34803583735354926
Mathhews 0.381043171453886
Rule Length: 3
------------------------------------------------------
--------------------------

  _warn_prf(average, modifier, msg_start, len(result))


------------------------------------------------------
BRCG TREES
Accuracy: 0.9907878017789072
Balanced accuracy: 0.901242116705556
Precision: 0.991304347826087
Recall: 0.8028169014084507
F1 0.8871595330739299
ConfusionMatrix [[3005    1]
 [  28  114]]
F-2 0.8345534407027819
Mathhews 0.8877486968544535
Number of rules: 2
------------------------------------------------------
RULELIST:
if [not feature6 && not feature13]:
  Class = True
else if [feature3 && feature7]:
  Class = True
else 
  Class = False
------------------------------------------------------
CORELS TREES
Accuracy: 0.9907878017789072
Balanced accuracy: 0.901242116705556
Precision: 0.991304347826087
Recall: 0.8028169014084507
F1 0.8871595330739299
ConfusionMatrix [[3005    1]
 [  28  114]]
F-2 0.8345534407027819
Mathhews 0.8877486968544535
Rule Length: 3
------------------------------------------------------
------------------------------------------------------
RIPPER QUANTILE
Accuracy: 0.9907878017789072
Balanced accurac

  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


------------------------------------------------------
RIPPER QUANTILE
Accuracy: 0.976027397260274
Balanced accuracy: 0.704371275841588
Precision: 0.6938775510204082
Recall: 0.4146341463414634
ConfusionMatrix [[2531   15]
 [  48   34]]
F-2 0.4509283819628647
Mathhews 0.525366366968729
Rule count: 3
------------------------------------------------------
Learning DNF rule with complexity parameters lambda0=0.001, lambda1=0.001
Initial LP solved
------------------------------------------------------
BRCG QUANTILE
Accuracy: 0.9687975646879756
Balanced accuracy: 0.5
Precision: 0.0
Recall: 0.0
ConfusionMatrix [[2546    0]
 [  82    0]]
F-2 0.0
Mathhews 0.0
Number of rules: 0
------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


RULELIST:
Target = False
------------------------------------------------------
CORELS QUANTILE
Accuracy: 0.9687975646879756
Balanced accuracy: 0.5
Precision: 0.0
Recall: 0.0
ConfusionMatrix [[2546    0]
 [  82    0]]
F-2 0.0
Mathhews 0.0
Rule Length: 1
------------------------------------------------------
------------------------------------------------------
RIPPER NATIVE
Accuracy: 0.9756468797564688
Balanced accuracy: 0.6864713658919779
F-2 0.4166666666666667
Mathhews 0.5053656049560805
Rule count: 5
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Month                 15420 non-null  object 
 1   WeekOfMonth           15420 non-null  float64
 2   DayOfWeek             15420 non-null  object 
 3   Make                  15420 non-null  object 
 4   AccidentArea          15420 non-null  object 
 5   DayOfWeekClaimed   

  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


------------------------------------------------------
RIPPER QUANTILE
Accuracy: 0.9390402075226978
Balanced accuracy: 0.5069023630257399
Precision: 0.8
Recall: 0.014035087719298246
ConfusionMatrix [[4340    1]
 [ 281    4]]
F-2 0.017467248908296942
Mathhews 0.10101645629319606
Rule count: 1
------------------------------------------------------
Learning DNF rule with complexity parameters lambda0=0.001, lambda1=0.001
Initial LP solved
------------------------------------------------------
BRCG QUANTILE
Accuracy: 0.9383916990920882
Balanced accuracy: 0.5
Precision: 0.0
Recall: 0.0
ConfusionMatrix [[4341    0]
 [ 285    0]]
F-2 0.0
Mathhews 0.0
Number of rules: 0
------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


RULELIST:
FraudFound_P = False
------------------------------------------------------
CORELS QUANTILE
Accuracy: 0.9383916990920882
Balanced accuracy: 0.5
Precision: 0.0
Recall: 0.0
ConfusionMatrix [[4341    0]
 [ 285    0]]
F-2 0.0
Mathhews 0.0
Rule Length: 1
------------------------------------------------------
------------------------------------------------------
RIPPER NATIVE
Accuracy: 0.9399048854301773
Balanced accuracy: 0.5253943428024103
F-2 0.0644883920894239
Mathhews 0.17362005562966298
Rule count: 4


KeyError: "['name'] not found in axis"

In [15]:
#metric_dict
metric_list

[{'Config': 'CONFIG-I1',
  'ripper_t_bacc': 0.21652032178347969,
  'ripper_t_f2': 0.6625441696113072,
  'ripper_t_acc': 0.5233333333333333,
  'ripper_t_rl': '17',
  'brcg_t_bacc': 0.15547610284452396,
  'brcg_t_f2': 0.36613272311212813,
  'brcg_t_acc': 0.6666666666666666,
  'brcg_t_rl': 3,
  'corels_t_bacc': 0.1527945738472054,
  'corels_t_f2': 0.25125628140703515,
  'corels_t_acc': 0.7166666666666667,
  'corels_t_rl': 3,
  'ripper_q_bacc': 0.2215679057784321,
  'ripper_q_f2': 0.3414634146341464,
  'ripper_q_acc': 0.73,
  'ripper_q_rl': '3',
  'brcg_q_bacc': 0.30984804669015187,
  'brcg_q_f2': 0.4875283446712018,
  'brcg_q_acc': 0.7266666666666667,
  'brcg_q_rl': 6,
  'corels_q_bacc': 0.14180556285819446,
  'corels_q_f2': 0.23929471032745592,
  'corels_q_acc': 0.14180556285819446,
  'corels_q_rl': 2,
  'ripper_n_bacc': 0.17903149482096836,
  'ripper_n_f2': 0.3056234718826406,
  'ripper_n_acc': 0.17903149482096836,
  'ripper_n_rl': '3'},
 {'Config': 'CONFIG-I2',
  'ripper_t_bacc': 0.0,


In [None]:
type(config_dict_imbalanced[config])

dict

In [20]:
df_list = []
csv_list = []
for i in range(len(Config_list)):
    
    if Config_list[i][1]['TYPE'] == "BINARY":
        if Config_list[i][1]["DATA_SET"] not in csv_list:
            temp_df = pd.read_csv(Config_list[i][1]["DATA_SET"])
            csv_list.append(Config_list[i][1]["DATA_SET"])
            #temp_df = temp_df.drop(columns=Config_list[i][1]['DROP'])
            temp_df= temp_df.rename(columns={temp_df[Config_list[i][1]['TARGET_LABEL']].name : 'TARGET_LABEL'})
            df_list.append(temp_df)
        
eval_df = pd.DataFrame(csv_list, columns=['Data_Set'])
eval_df["Target_1_pos"] = pd.Series('int32')
eval_df["Target_2_neg"] = pd.Series('int32')
eval_df["IB_Ratio"] = pd.Series()
eval_df["Num_Feautures"] = pd.Series('int32')
eval_df["Cat_Feautures"] = pd.Series('int32')
eval_df["Size_row"] = pd.Series('int32')
eval_df["Size_col"] = pd.Series('int32')

# Trees
eval_df["ripper_t_bacc"] = pd.Series()
eval_df["ripper_t_f2"] = pd.Series()
eval_df["ripper_t_acc"] = pd.Series()
eval_df["ripper_t_rl"] = pd.Series()

eval_df["brcg_t_bacc"] = pd.Series()
eval_df["brcg_t_f2"] = pd.Series()
eval_df["brcg_t_acc"] = pd.Series()
eval_df["brcg_t_rl"] = pd.Series()

eval_df["corels_t_bacc"] = pd.Series()
eval_df["corels_t_f2"] = pd.Series()
eval_df["corels_t_acc"] = pd.Series()
eval_df["corels_t_rl"] = pd.Series()

# Quantile
eval_df["ripper_q_bacc"] = pd.Series()
eval_df["ripper_q_f2"] = pd.Series()
eval_df["ripper_q_acc"] = pd.Series()
eval_df["ripper_q_rl"] = pd.Series()

eval_df["brcg_q_bacc"] = pd.Series()
eval_df["brcg_q_f2"] = pd.Series()
eval_df["brcg_q_acc"] = pd.Series()
eval_df["brcg_q_rl"] = pd.Series()

eval_df["corels_q_bacc"] = pd.Series()
eval_df["corels_q_f2"] = pd.Series()
eval_df["corels_q_acc"] = pd.Series()
eval_df["corels_q_rl"] = pd.Series()

# Ripper Native
eval_df["ripper_n_bacc"] = pd.Series()
eval_df["ripper_n_f2"] = pd.Series()
eval_df["ripper_n_acc"] = pd.Series()
eval_df["ripper_n_rl"] = pd.Series()

  app.launch_new_instance()


In [21]:
eval_df

Unnamed: 0,Data_Set,Target_1_pos,Target_2_neg,IB_Ratio,Num_Feautures,Cat_Feautures,Size_row,Size_col,ripper_t_bacc,ripper_t_f2,...,brcg_q_acc,brcg_q_rl,corels_q_bacc,corels_q_f2,corels_q_acc,corels_q_rl,ripper_n_bacc,ripper_n_f2,ripper_n_acc,ripper_n_rl
0,../data/german_credit_codiert.csv,int32,int32,,int32,int32,int32,int32,,,...,,,,,,,,,,
1,../data/fraud_detection.csv,,,,,,,,,,...,,,,,,,,,,
2,../data/TaiwanCreditData.csv,,,,,,,,,,...,,,,,,,,,,
3,../data/miniloan-decisions-100K.csv,,,,,,,,,,...,,,,,,,,,,
4,../data/fraud_detection_duenn.csv,,,,,,,,,,...,,,,,,,,,,
5,../data/binary_bike_imbalanced.csv,,,,,,,,,,...,,,,,,,,,,
6,../data/fraud_oracle_clean.csv,,,,,,,,,,...,,,,,,,,,,
7,../data/miniloan_duenn.csv,,,,,,,,,,...,,,,,,,,,,


In [23]:
for frame in range(len(df_list)):
    
    metric = df_list[frame]["TARGET_LABEL"].value_counts()

    # Imbalanced Ratio = minor class
    if metric[0] > metric[1]:
        eval_df["Target_1_pos"].iloc[frame] = metric[1]
        eval_df["Target_2_neg"].iloc[frame] = metric[0]
    else:
        eval_df["Target_1_pos"].iloc[frame] = metric[0]
        eval_df["Target_2_neg"].iloc[frame] = metric[1]
        
    df_size_row = len(df_list[frame])
    df_size_col = len(df_list[frame].columns)
    df_num_feauture =  len(df_list[frame].select_dtypes(include=['int64', 'float64']).columns)
    df_cat_feauture =   len(df_list[frame].select_dtypes(include=['object']).columns)

    eval_df["IB_Ratio"].iloc[frame] = eval_df["Target_1_pos"].iloc[frame]/eval_df["Target_2_neg"].iloc[frame]
    eval_df["Size_row"].iloc[frame]   = df_size_row
    eval_df["Size_col"].iloc[frame]   = df_size_col
    
    eval_df["Num_Feautures"].iloc[frame] = df_num_feauture
    eval_df["Cat_Feautures"].iloc[frame] = df_cat_feauture

    # adding Metrics Trees
    eval_df["ripper_t_bacc"].iloc[frame] = metric_list[frame]["ripper_t_bacc"]
    eval_df["ripper_t_f2"].iloc[frame] = metric_list[frame]["ripper_t_f2"]
    eval_df["ripper_t_acc"].iloc[frame] = metric_list[frame]["ripper_t_acc"]
    eval_df["ripper_t_rl"].iloc[frame] = metric_list[frame]["ripper_t_rl"]


    eval_df["brcg_t_bacc"].iloc[frame] = metric_list[frame]["brcg_t_bacc"]
    eval_df["brcg_t_f2"].iloc[frame] = metric_list[frame]["brcg_t_f2"]
    eval_df["brcg_t_acc"].iloc[frame] = metric_list[frame]["brcg_t_acc"]
    eval_df["brcg_t_rl"].iloc[frame] = metric_list[frame]["brcg_t_rl"]

    eval_df["corels_t_bacc"].iloc[frame] = metric_list[frame]["corels_t_bacc"]
    eval_df["corels_t_f2"].iloc[frame] = metric_list[frame]["corels_t_f2"]
    eval_df["corels_t_acc"].iloc[frame] = metric_list[frame]["corels_t_acc"]
    eval_df["corels_t_rl"].iloc[frame] = metric_list[frame]["corels_t_rl"]

    # adding Metrics Qunatile
    eval_df["ripper_q_bacc"].iloc[frame] =  metric_list[frame]["ripper_q_bacc"]
    eval_df["ripper_q_f2"].iloc[frame] = metric_list[frame]["ripper_q_f2"]
    eval_df["ripper_q_acc"].iloc[frame] = metric_list[frame]["ripper_q_acc"]
    eval_df["ripper_q_rl"].iloc[frame] = metric_list[frame]["ripper_q_rl"]

    eval_df["brcg_q_bacc"].iloc[frame] = metric_list[frame]["brcg_q_bacc"]
    eval_df["brcg_q_f2"].iloc[frame] = metric_list[frame]["brcg_q_f2"]
    eval_df["brcg_q_acc"].iloc[frame] = metric_list[frame]["brcg_q_acc"]
    eval_df["brcg_q_rl"].iloc[frame] = metric_list[frame]["brcg_q_rl"]
    

    eval_df["corels_q_bacc"].iloc[frame] =  metric_list[frame]["corels_q_bacc"]
    eval_df["corels_q_f2"].iloc[frame] =  metric_list[frame]["corels_q_f2"]
    eval_df["corels_q_acc"].iloc[frame] =  metric_list[frame]["corels_q_acc"]
    eval_df["corels_q_rl"].iloc[frame] =  metric_list[frame]["corels_q_rl"]

    # adding Metrics Ripper Native
    eval_df["ripper_n_bacc"].iloc[frame] = metric_list[frame]["ripper_n_bacc"]
    eval_df["ripper_n_f2"].iloc[frame] = metric_list[frame]["ripper_n_f2"]
    eval_df["ripper_n_acc"].iloc[frame] = metric_list[frame]["ripper_n_acc"]
    eval_df["ripper_n_rl"].iloc[frame] = metric_list[frame]["ripper_n_rl"]


eval_df

IndexError: list index out of range

In [27]:
eval_df.to_csv("test3.csv",sep =",")

In [None]:
CONFIG['POS_CLASS']

False

In [None]:
y_test.value_counts()

True     231
False     69
Name: approval, dtype: int64

In [None]:
CONFIG['POS_CLASS']

False

In [None]:
df[CONFIG["TARGET_LABEL"]].value_counts()

True     781
False    219
Name: approval, dtype: int64