In [4]:
import pandas as pd
import numpy as np
import pickle
from utils import *
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, log_loss
from sklearn.model_selection import StratifiedKFold, cross_validate
from slice_finder import SliceFinder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [5]:
df_bank = pd.read_csv(
    "sets/bank/bank-full.csv",
        names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"],
        sep=r';',
        engine='python',
        na_values="?")

df_bank = df_bank.dropna()

# Encode categorical features
encoders = {}
for column in df_bank.columns:
    if df_bank.dtypes[column] == np.object_:
        le = LabelEncoder()
        df_bank[column] = le.fit_transform(df_bank[column])
        encoders[column] = le
        print(column, le.classes_, le.transform(le.classes_))

X, y = df_bank[df_bank.columns.difference(["Target"])], df_bank["Target"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

job ['admin.' 'blue-collar' 'entrepreneur' 'housemaid' 'management' 'retired'
 'self-employed' 'services' 'student' 'technician' 'unemployed' 'unknown'] [ 0  1  2  3  4  5  6  7  8  9 10 11]
marital ['divorced' 'married' 'single'] [0 1 2]
education ['primary' 'secondary' 'tertiary' 'unknown'] [0 1 2 3]
default ['no' 'yes'] [0 1]
housing ['no' 'yes'] [0 1]
loan ['no' 'yes'] [0 1]
contact ['cellular' 'telephone' 'unknown'] [0 1 2]
month ['apr' 'aug' 'dec' 'feb' 'jan' 'jul' 'jun' 'mar' 'may' 'nov' 'oct' 'sep'] [ 0  1  2  3  4  5  6  7  8  9 10 11]
poutcome ['failure' 'other' 'success' 'unknown'] [0 1 2 3]
Target ['no' 'yes'] [0 1]


In [None]:
lr = RandomForestClassifier(max_depth=5, n_estimators=10)
lr.fit(X_train, y_train)

sf = SliceFinder(lr, (X_train, y_train))
metrics_all = sf.evaluate_model((X_train,y_train))
reference = (np.mean(metrics_all), np.std(metrics_all), len(metrics_all))

In [None]:
name = 'Bank.p'
recommendations = sf.find_slice(k=100, epsilon=0.4, degree=2, max_workers=4, name=name)

for s in recommendations:
    print ('\n=====================\nSlice description:')
    for k, v in list(s.filters.items()):
        values = ''
        if k in encoders:
            le = encoders[k]
            for v_ in v:
                values += '%s '%(le.inverse_transform(v_)[0])
        else:
            for v_ in sorted(v, key=lambda x: x[0]):
                if len(v_) > 1:
                    values += '%s ~ %s'%(v_[0], v_[1])
                else:
                    values += '%s '%(v_[0])
        print ('%s:%s'%(k, values))
    print ('---------------------\neffect_size: %s'%(s.effect_size))
    print ('---------------------\nmetric: %s'%(s.metric))
    print ('size: %s'%(s.size))

## Visualizando regiões identificadas para o conjunto Bank

In [6]:
with open('./Slices/Bank.p', 'rb') as file:
    data = pickle.load(file)
    
    data = sorted(data, key=lambda d: d.size, reverse=True)
    for d in data:
        print(f'Description of the region:  {d.filters}')
        print(f'Region instances: {d.data_idx}')
        print(f'Region size:    {d.size}')
        print(f'Efect size: {d.effect_size}')
        print(f'Log Loss:    {d.metric}')
        print('\n')

Description of the region:  {'poutcome': [[2]]}
Region instances: Index([41052, 40554, 40959, 45150, 43014, 24435, 37186, 43618, 44462, 28545,
       ...
       42739, 43252, 42197, 26865, 34202, 41160, 43016, 43703, 40438, 41939],
      dtype='int64', length=1205)
Region size:    1205
Efect size: 0.6040684598146447
Log Loss:    0.5973693648026027


Description of the region:  {'month': [[10]]}
Region instances: Index([44955, 42033, 45071, 42032, 24062, 41685, 45034, 24089, 41879, 41773,
       ...
       45121, 41672, 41872, 41850, 41661, 41997, 41863, 42067, 42011, 41939],
      dtype='int64', length=592)
Region size:    592
Efect size: 0.5693645558685498
Log Loss:    0.6505727996402865


Description of the region:  {'month': [[11]]}
Region instances: Index([41440, 41422, 41515, 41634, 44687, 41377, 41420, 41563, 44812, 41561,
       ...
       41456, 41626, 41592, 44743, 41379, 44653, 41427, 41623, 44890, 44885],
      dtype='int64', length=451)
Region size:    451
Efect size: 0.519

In [7]:
bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])
bank_train.groupby('Target').describe()

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,balance,balance,...,poutcome,poutcome,previous,previous,previous,previous,previous,previous,previous,previous
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,31937.0,40.797445,10.175616,18.0,33.0,39.0,48.0,95.0,31937.0,1304.038106,...,3.0,3.0,31937.0,0.503836,2.370703,0.0,0.0,0.0,0.0,275.0
1,4231.0,41.614276,13.53799,18.0,31.0,38.0,50.0,95.0,4231.0,1829.378634,...,3.0,3.0,4231.0,1.1697,2.604876,0.0,0.0,0.0,1.0,58.0


# Evaluating performance

In [8]:
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_train, y_train)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_train, y_train, cv=skf, scoring=scoring)


y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)


print("Time to train the model:", execution_time_model, "s")

print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 7.618726491928101 s

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.90 (+/- 0.00)
Recall: 0.42 (+/- 0.02)
F1 Score: 0.51 (+/- 0.02)
Log-loss: 0.22 (+/- 0.01)

Results on the validation set:
Accuracy: 0.90
Log-loss: 0.24

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7985
           1       0.65      0.41      0.50      1058

    accuracy                           0.90      9043
   macro avg       0.79      0.69      0.72      9043
weighted avg       0.89      0.90      0.89      9043



## Applying SMOTE to the entire set

In [9]:
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

smote = SMOTE(sampling_strategy='auto', random_state=42)

y_slice = y_train
X_slice = X_train

X_slice_smote, y_slice_smote = smote.fit_resample(X_slice, y_slice)
region_SMOTE = (pd.concat([pd.DataFrame(X_slice_smote, columns=X_slice.columns), pd.Series(y_slice_smote, name='Target')], axis=1))

y_SMOTE_data = region_SMOTE['Target']
X_SMOTE_data = region_SMOTE.drop(columns='Target')

start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)


y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

print("Size of new dataset:", len(X_SMOTE_data))
print("Time to train the model:", execution_time_model, "s")

print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Size of new dataset: 63874
Time to train the model: 16.66109037399292 s

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.93 (+/- 0.00)
Recall: 0.96 (+/- 0.00)
F1 Score: 0.94 (+/- 0.00)
Log-loss: 0.18 (+/- 0.01)

Results on the validation set:
Accuracy: 0.89
Log-loss: 0.28

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.95      0.92      0.93      7985
           1       0.51      0.65      0.57      1058

    accuracy                           0.89      9043
   macro avg       0.73      0.78      0.75      9043
weighted avg       0.90      0.89      0.89      9043



### Data augmentation guided by problem regions

In [10]:
bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])
region = bank_train.query('poutcome==2')
region.groupby('Target').describe()

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,balance,balance,...,poutcome,poutcome,previous,previous,previous,previous,previous,previous,previous,previous
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,426.0,41.607981,13.199602,19.0,32.0,38.0,50.0,86.0,426.0,1752.183099,...,2.0,2.0,426.0,3.039906,2.849281,1.0,1.0,2.0,4.0,28.0
1,779.0,43.616175,14.731628,18.0,32.0,40.0,55.0,93.0,779.0,2082.992298,...,2.0,2.0,779.0,3.105263,2.546071,1.0,1.0,2.0,4.0,21.0


In [11]:
# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region
region = 'poutcome==2'

# Apply SMOTE in the region of interest
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)  # Select only samples from the region
y_region = y_train[X_region.index]

# Apply SMOTE to the selected region
X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

# Combine the original dataset with the balanced one to form the new training set
X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

# Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the new dataset:", len(X_SMOTE_data))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 9.834976196289062 seconds
Size of the new dataset: 36521

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.91 (+/- 0.00)
Recall: 0.40 (+/- 0.02)
F1 Score: 0.50 (+/- 0.03)
Log-loss: 0.22 (+/- 0.02)

Results on the validation set:
Accuracy: 0.91
Log-loss: 0.23

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.66      0.40      0.50      1058

    accuracy                           0.91      9043
   macro avg       0.79      0.69      0.72      9043
weighted avg       0.89      0.91      0.90      9043



In [12]:
bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])
region = bank_train.query('month==10')
region.groupby('Target').describe()

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,balance,balance,...,poutcome,poutcome,previous,previous,previous,previous,previous,previous,previous,previous
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,333.0,46.315315,16.330721,19.0,33.0,42.0,60.0,95.0,333.0,2553.033033,...,3.0,3.0,333.0,1.612613,3.035447,0.0,0.0,0.0,2.0,20.0
1,259.0,45.455598,15.907709,21.0,33.0,43.0,57.0,87.0,259.0,2060.895753,...,3.0,3.0,259.0,1.552124,2.314469,0.0,0.0,1.0,2.0,14.0


In [13]:
# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region
region = 'month==10'

# Apply SMOTE in the region of interest
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)  # Select only samples from the region
y_region = y_train[X_region.index]

# Apply SMOTE to the selected region
X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

# Combine the original dataset with the balanced one to form the new training set
X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

# Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the new dataset:", len(X_SMOTE_data))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 10.859057903289795 seconds
Size of the new dataset: 36242

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.91 (+/- 0.00)
Recall: 0.44 (+/- 0.02)
F1 Score: 0.53 (+/- 0.02)
Log-loss: 0.22 (+/- 0.02)

Results on the validation set:
Accuracy: 0.91
Log-loss: 0.23

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7985
           1       0.66      0.42      0.51      1058

    accuracy                           0.91      9043
   macro avg       0.79      0.69      0.73      9043
weighted avg       0.89      0.91      0.90      9043



In [14]:
bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])
region = bank_train.query('month==11')
region.groupby('Target').describe()

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,balance,balance,...,poutcome,poutcome,previous,previous,previous,previous,previous,previous,previous,previous
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,247.0,43.287449,17.206847,19.0,31.0,37.0,55.0,89.0,247.0,1841.064777,...,3.0,3.0,247.0,1.753036,3.59196,0.0,0.0,0.0,2.0,41.0
1,204.0,43.916667,15.830939,18.0,31.0,40.0,55.0,84.0,204.0,2065.617647,...,3.0,3.0,204.0,2.151961,2.902597,0.0,0.0,1.0,3.0,14.0


In [15]:
# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region
region = 'month==11'

# Apply SMOTE in the region of interest
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)  # Select only samples from the region
y_region = y_train[X_region.index]

# Apply SMOTE to the selected region
X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

# Combine the original dataset with the balanced one to form the new training set
X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

# Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the new dataset:", len(X_SMOTE_data))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 10.506592988967896 seconds
Size of the new dataset: 36211

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.90 (+/- 0.00)
Recall: 0.42 (+/- 0.02)
F1 Score: 0.51 (+/- 0.02)
Log-loss: 0.22 (+/- 0.02)

Results on the validation set:
Accuracy: 0.91
Log-loss: 0.24

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7985
           1       0.65      0.42      0.51      1058

    accuracy                           0.91      9043
   macro avg       0.79      0.69      0.73      9043
weighted avg       0.89      0.91      0.90      9043



In [16]:
bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])
region = bank_train.query('age==61')
region.groupby('Target').describe()

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,balance,balance,...,poutcome,poutcome,previous,previous,previous,previous,previous,previous,previous,previous
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,75.0,61.0,0.0,61.0,61.0,61.0,61.0,61.0,75.0,1595.04,...,3.0,3.0,75.0,0.866667,2.238082,0.0,0.0,0.0,0.0,13.0
1,50.0,61.0,0.0,61.0,61.0,61.0,61.0,61.0,50.0,4788.52,...,3.0,3.0,50.0,1.06,1.658743,0.0,0.0,0.0,1.75,7.0


In [17]:
# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region
region = 'age==61'

# Apply SMOTE in the region of interest
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)  # Select only samples from the region
y_region = y_train[X_region.index]

# Apply SMOTE to the selected region
X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

# Combine the original dataset with the balanced one to form the new training set
X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

# Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the new dataset:", len(X_SMOTE_data))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 10.352769374847412 seconds
Size of the new dataset: 36193

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.91 (+/- 0.00)
Recall: 0.42 (+/- 0.02)
F1 Score: 0.51 (+/- 0.02)
Log-loss: 0.22 (+/- 0.01)

Results on the validation set:
Accuracy: 0.90
Log-loss: 0.23

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.65      0.40      0.50      1058

    accuracy                           0.90      9043
   macro avg       0.79      0.69      0.72      9043
weighted avg       0.89      0.90      0.89      9043



In [18]:
bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])
region = bank_train.query('pdays==92')
region.groupby('Target').describe()

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,balance,balance,...,poutcome,poutcome,previous,previous,previous,previous,previous,previous,previous,previous
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,40.0,43.75,16.276064,23.0,29.75,38.0,56.0,86.0,40.0,2242.5,...,2.0,2.0,40.0,3.75,3.513709,1.0,2.0,3.0,4.25,20.0
1,74.0,44.716216,15.412608,21.0,32.25,42.0,56.75,84.0,74.0,2102.0,...,2.0,2.0,74.0,3.851351,2.855816,1.0,2.0,3.0,5.0,14.0


In [19]:
# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region
region = 'pdays==92'

# Apply SMOTE in the region of interest
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)  # Select only samples from the region
y_region = y_train[X_region.index]

# Apply SMOTE to the selected region
X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

# Combine the original dataset with the balanced one to form the new training set
X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

# Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the new dataset:", len(X_SMOTE_data))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 12.718912124633789 seconds
Size of the new dataset: 36202

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.90 (+/- 0.00)
Recall: 0.41 (+/- 0.02)
F1 Score: 0.50 (+/- 0.02)
Log-loss: 0.22 (+/- 0.01)

Results on the validation set:
Accuracy: 0.91
Log-loss: 0.23

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.66      0.40      0.50      1058

    accuracy                           0.91      9043
   macro avg       0.79      0.69      0.72      9043
weighted avg       0.89      0.91      0.90      9043



In [20]:
bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])
region = bank_train.query('pdays==91')
region.groupby('Target').describe()

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,balance,balance,...,poutcome,poutcome,previous,previous,previous,previous,previous,previous,previous,previous
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,40.0,41.5,13.641941,20.0,31.75,37.0,50.5,75.0,40.0,1330.3,...,1.0,2.0,40.0,4.2,3.524275,1.0,1.0,3.5,6.0,13.0
1,60.0,46.983333,16.644327,22.0,32.75,47.0,59.25,80.0,60.0,2301.85,...,2.0,2.0,60.0,3.75,3.078768,1.0,1.75,3.0,5.0,14.0


In [21]:
# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region
region = 'pdays==91'

# Apply SMOTE in the region of interest
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)  # Select only samples from the region
y_region = y_train[X_region.index]

# Apply SMOTE to the selected region
X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

# Combine the original dataset with the balanced one to form the new training set
X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

# Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the new dataset:", len(X_SMOTE_data))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 8.763309001922607 seconds
Size of the new dataset: 36188

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.91 (+/- 0.00)
Recall: 0.42 (+/- 0.02)
F1 Score: 0.51 (+/- 0.02)
Log-loss: 0.22 (+/- 0.02)

Results on the validation set:
Accuracy: 0.91
Log-loss: 0.22

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7985
           1       0.66      0.41      0.50      1058

    accuracy                           0.91      9043
   macro avg       0.79      0.69      0.73      9043
weighted avg       0.89      0.91      0.90      9043



### removal of the problematic region

In [22]:
def import_bank():
    df_bank = pd.read_csv(
    "sets/bank/bank-full.csv",
        names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"],
        sep=r';',
        engine='python',
        na_values="?")

    df_bank = df_bank.dropna()

    # Encode categorical features
    encoders = {}
    for column in df_bank.columns:
        if df_bank.dtypes[column] == np.object_:
            le = LabelEncoder()
            df_bank[column] = le.fit_transform(df_bank[column])
            encoders[column] = le
            #print(column, le.classes_, le.transform(le.classes_))

    X, y = df_bank[df_bank.columns.difference(["Target"])], df_bank["Target"]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    return df_bank, X_train, X_val, y_train, y_val

In [23]:
df_bank, X_train, X_val, y_train, y_val = import_bank()

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Defines the region for removal
region = 'poutcome==2'

# Identifies and removes samples from the specified region
region_remove_idx = bank_train.query(region).index
bank_train.drop(region_remove_idx, errors='ignore', inplace=True)

# Ensures that training set columns remain consistent
X_slice = bank_train.drop(columns=['Target'])
y_slice = bank_train['Target']

# Reindex the validation set to ensure consistency with the training set
X_val = X_val.reindex(columns=X_slice.columns, fill_value=0)

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_slice, y_slice)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_slice, y_slice, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

#Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the dataset after removal:", len(X_slice))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 15.496399641036987 seconds
Size of the dataset after removal: 34963

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.91 (+/- 0.00)
Recall: 0.31 (+/- 0.03)
F1 Score: 0.40 (+/- 0.03)
Log-loss: 0.21 (+/- 0.01)

Results on the validation set:
Accuracy: 0.90
Log-loss: 0.25

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      7985
           1       0.63      0.28      0.39      1058

    accuracy                           0.90      9043
   macro avg       0.77      0.63      0.67      9043
weighted avg       0.88      0.90      0.88      9043



In [24]:
df_bank, X_train, X_val, y_train, y_val = import_bank()

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Defines the region for removal
region = 'month==10'

# Identifies and removes samples from the specified region
region_remove_idx = bank_train.query(region).index
bank_train.drop(region_remove_idx, errors='ignore', inplace=True)

# Ensures that training set columns remain consistent
X_slice = bank_train.drop(columns=['Target'])
y_slice = bank_train['Target']

# Reindex the validation set to ensure consistency with the training set
X_val = X_val.reindex(columns=X_slice.columns, fill_value=0)

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_slice, y_slice)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_slice, y_slice, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

#Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the dataset after removal:", len(X_slice))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 9.343470335006714 seconds
Size of the dataset after removal: 35576

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.91 (+/- 0.00)
Recall: 0.40 (+/- 0.02)
F1 Score: 0.50 (+/- 0.02)
Log-loss: 0.22 (+/- 0.02)

Results on the validation set:
Accuracy: 0.90
Log-loss: 0.25

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.64      0.39      0.48      1058

    accuracy                           0.90      9043
   macro avg       0.78      0.68      0.71      9043
weighted avg       0.89      0.90      0.89      9043



In [25]:
df_bank, X_train, X_val, y_train, y_val = import_bank()

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Defines the region for removal
region = 'month==11'

# Identifies and removes samples from the specified region
region_remove_idx = bank_train.query(region).index
bank_train.drop(region_remove_idx, errors='ignore', inplace=True)

# Ensures that training set columns remain consistent
X_slice = bank_train.drop(columns=['Target'])
y_slice = bank_train['Target']

# Reindex the validation set to ensure consistency with the training set
X_val = X_val.reindex(columns=X_slice.columns, fill_value=0)

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_slice, y_slice)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_slice, y_slice, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

#Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the dataset after removal:", len(X_slice))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 9.720123529434204 seconds
Size of the dataset after removal: 35717

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.91 (+/- 0.00)
Recall: 0.40 (+/- 0.03)
F1 Score: 0.49 (+/- 0.02)
Log-loss: 0.22 (+/- 0.01)

Results on the validation set:
Accuracy: 0.90
Log-loss: 0.23

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.64      0.40      0.49      1058

    accuracy                           0.90      9043
   macro avg       0.78      0.68      0.72      9043
weighted avg       0.89      0.90      0.89      9043



In [26]:
df_bank, X_train, X_val, y_train, y_val = import_bank()

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Defines the region for removal
region = 'age==61'

# Identifies and removes samples from the specified region
region_remove_idx = bank_train.query(region).index
bank_train.drop(region_remove_idx, errors='ignore', inplace=True)

# Ensures that training set columns remain consistent
X_slice = bank_train.drop(columns=['Target'])
y_slice = bank_train['Target']

# Reindex the validation set to ensure consistency with the training set
X_val = X_val.reindex(columns=X_slice.columns, fill_value=0)

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_slice, y_slice)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_slice, y_slice, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

#Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the dataset after removal:", len(X_slice))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 9.151746034622192 seconds
Size of the dataset after removal: 36043

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.91 (+/- 0.00)
Recall: 0.41 (+/- 0.02)
F1 Score: 0.50 (+/- 0.02)
Log-loss: 0.22 (+/- 0.01)

Results on the validation set:
Accuracy: 0.91
Log-loss: 0.22

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7985
           1       0.65      0.41      0.50      1058

    accuracy                           0.91      9043
   macro avg       0.79      0.69      0.73      9043
weighted avg       0.89      0.91      0.90      9043



In [27]:
df_bank, X_train, X_val, y_train, y_val = import_bank()

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Defines the region for removal
region = 'pdays==91'

# Identifies and removes samples from the specified region
region_remove_idx = bank_train.query(region).index
bank_train.drop(region_remove_idx, errors='ignore', inplace=True)

# Ensures that training set columns remain consistent
X_slice = bank_train.drop(columns=['Target'])
y_slice = bank_train['Target']

# Reindex the validation set to ensure consistency with the training set
X_val = X_val.reindex(columns=X_slice.columns, fill_value=0)

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_slice, y_slice)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_slice, y_slice, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

#Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the dataset after removal:", len(X_slice))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 13.17482328414917 seconds
Size of the dataset after removal: 36068

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.91 (+/- 0.00)
Recall: 0.41 (+/- 0.02)
F1 Score: 0.50 (+/- 0.02)
Log-loss: 0.22 (+/- 0.01)

Results on the validation set:
Accuracy: 0.91
Log-loss: 0.23

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.66      0.40      0.50      1058

    accuracy                           0.91      9043
   macro avg       0.79      0.69      0.72      9043
weighted avg       0.89      0.91      0.89      9043



In [28]:
df_bank, X_train, X_val, y_train, y_val = import_bank()

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Defines the region for removal
region = 'pdays==92'

# Identifies and removes samples from the specified region
region_remove_idx = bank_train.query(region).index
bank_train.drop(region_remove_idx, errors='ignore', inplace=True)

# Ensures that training set columns remain consistent
X_slice = bank_train.drop(columns=['Target'])
y_slice = bank_train['Target']

# Reindex the validation set to ensure consistency with the training set
X_val = X_val.reindex(columns=X_slice.columns, fill_value=0)

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_slice, y_slice)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_slice, y_slice, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

#Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the dataset after removal:", len(X_slice))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 11.851913213729858 seconds
Size of the dataset after removal: 36054

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.91 (+/- 0.00)
Recall: 0.42 (+/- 0.02)
F1 Score: 0.51 (+/- 0.02)
Log-loss: 0.22 (+/- 0.02)

Results on the validation set:
Accuracy: 0.90
Log-loss: 0.24

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7985
           1       0.64      0.41      0.50      1058

    accuracy                           0.90      9043
   macro avg       0.78      0.69      0.72      9043
weighted avg       0.89      0.90      0.89      9043



### random region removal

In [36]:
df_bank, X_train, X_val, y_train, y_val = import_bank()

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Randomly remove a defined amount of rows
remove_rows = bank_train.sample(n=len(df_bank.query('poutcome==2').index), random_state=42).index
bank_train.drop(remove_rows, inplace=True)

y_slice = bank_train['Target']
X_slice = bank_train.drop(columns=['Target'])

X_val = X_val.reindex(columns=X_slice.columns, fill_value=0)

start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_slice, y_slice)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_slice, y_slice, cv=skf, scoring=scoring)

y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the dataset after removal:", len(X_slice))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 8.90275239944458 seconds
Size of the dataset after removal: 34657

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.91 (+/- 0.00)
Recall: 0.41 (+/- 0.02)
F1 Score: 0.50 (+/- 0.01)
Log-loss: 0.22 (+/- 0.01)

Results on the validation set:
Accuracy: 0.91
Log-loss: 0.24

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.66      0.40      0.50      1058

    accuracy                           0.91      9043
   macro avg       0.79      0.69      0.72      9043
weighted avg       0.89      0.91      0.90      9043



In [37]:
df_bank, X_train, X_val, y_train, y_val = import_bank()

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Randomly remove a defined amount of rows
remove_rows = bank_train.sample(n=len(df_bank.query('month==10').index), random_state=42).index
bank_train.drop(remove_rows, inplace=True)

y_slice = bank_train['Target']
X_slice = bank_train.drop(columns=['Target'])

X_val = X_val.reindex(columns=X_slice.columns, fill_value=0)

start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_slice, y_slice)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_slice, y_slice, cv=skf, scoring=scoring)

y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the dataset after removal:", len(X_slice))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 9.130471229553223 seconds
Size of the dataset after removal: 35430

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.91 (+/- 0.00)
Recall: 0.42 (+/- 0.03)
F1 Score: 0.51 (+/- 0.02)
Log-loss: 0.22 (+/- 0.01)

Results on the validation set:
Accuracy: 0.91
Log-loss: 0.23

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.66      0.40      0.50      1058

    accuracy                           0.91      9043
   macro avg       0.79      0.69      0.72      9043
weighted avg       0.89      0.91      0.90      9043



In [38]:
df_bank, X_train, X_val, y_train, y_val = import_bank()

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Randomly remove a defined amount of rows
remove_rows = bank_train.sample(n=len(df_bank.query('month==11').index), random_state=42).index
bank_train.drop(remove_rows, inplace=True)

y_slice = bank_train['Target']
X_slice = bank_train.drop(columns=['Target'])

X_val = X_val.reindex(columns=X_slice.columns, fill_value=0)

start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_slice, y_slice)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_slice, y_slice, cv=skf, scoring=scoring)

y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the dataset after removal:", len(X_slice))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 10.921363115310669 seconds
Size of the dataset after removal: 35589

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.91 (+/- 0.00)
Recall: 0.42 (+/- 0.02)
F1 Score: 0.51 (+/- 0.01)
Log-loss: 0.22 (+/- 0.02)

Results on the validation set:
Accuracy: 0.91
Log-loss: 0.24

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7985
           1       0.66      0.41      0.51      1058

    accuracy                           0.91      9043
   macro avg       0.80      0.69      0.73      9043
weighted avg       0.90      0.91      0.90      9043



In [39]:
df_bank, X_train, X_val, y_train, y_val = import_bank()

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Randomly remove a defined amount of rows
remove_rows = bank_train.sample(n=len(df_bank.query('age==61').index), random_state=42).index
bank_train.drop(remove_rows, inplace=True)

y_slice = bank_train['Target']
X_slice = bank_train.drop(columns=['Target'])

X_val = X_val.reindex(columns=X_slice.columns, fill_value=0)

start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_slice, y_slice)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_slice, y_slice, cv=skf, scoring=scoring)

y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the dataset after removal:", len(X_slice))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 9.368620872497559 seconds
Size of the dataset after removal: 36021

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.90 (+/- 0.00)
Recall: 0.42 (+/- 0.02)
F1 Score: 0.51 (+/- 0.02)
Log-loss: 0.22 (+/- 0.01)

Results on the validation set:
Accuracy: 0.91
Log-loss: 0.23

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.66      0.40      0.50      1058

    accuracy                           0.91      9043
   macro avg       0.79      0.69      0.72      9043
weighted avg       0.89      0.91      0.90      9043



In [40]:
df_bank, X_train, X_val, y_train, y_val = import_bank()

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Randomly remove a defined amount of rows
remove_rows = bank_train.sample(n=len(df_bank.query('pdays==91').index), random_state=42).index
bank_train.drop(remove_rows, inplace=True)

y_slice = bank_train['Target']
X_slice = bank_train.drop(columns=['Target'])

X_val = X_val.reindex(columns=X_slice.columns, fill_value=0)

start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_slice, y_slice)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_slice, y_slice, cv=skf, scoring=scoring)

y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the dataset after removal:", len(X_slice))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 8.22896933555603 seconds
Size of the dataset after removal: 36042

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.91 (+/- 0.00)
Recall: 0.42 (+/- 0.02)
F1 Score: 0.51 (+/- 0.02)
Log-loss: 0.22 (+/- 0.01)

Results on the validation set:
Accuracy: 0.90
Log-loss: 0.23

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.65      0.40      0.50      1058

    accuracy                           0.90      9043
   macro avg       0.79      0.69      0.72      9043
weighted avg       0.89      0.90      0.89      9043



In [41]:
df_bank, X_train, X_val, y_train, y_val = import_bank()

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Randomly remove a defined amount of rows
remove_rows = bank_train.sample(n=len(df_bank.query('pdays==92').index), random_state=42).index
bank_train.drop(remove_rows, inplace=True)

y_slice = bank_train['Target']
X_slice = bank_train.drop(columns=['Target'])

X_val = X_val.reindex(columns=X_slice.columns, fill_value=0)

start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_slice, y_slice)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_slice, y_slice, cv=skf, scoring=scoring)

y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the dataset after removal:", len(X_slice))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 10.400350332260132 seconds
Size of the dataset after removal: 36021

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.90 (+/- 0.00)
Recall: 0.42 (+/- 0.02)
F1 Score: 0.51 (+/- 0.02)
Log-loss: 0.22 (+/- 0.01)

Results on the validation set:
Accuracy: 0.91
Log-loss: 0.24

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7985
           1       0.66      0.41      0.51      1058

    accuracy                           0.91      9043
   macro avg       0.79      0.69      0.73      9043
weighted avg       0.89      0.91      0.90      9043



# Case study

In [42]:
with open('Slices/Bank.p', 'rb') as file:
    data = pickle.load(file)
    
    name = []
    aux_1 = []
    aux_2 = []
    
    data = sorted(data, key=lambda d: d.size, reverse=True)
    for d in data:
        print ('\n=====================\nSlice description:')
        for k, v in list(d.filters.items()):
            values = ''
            if k in encoders:
                le = encoders[k]
                for v_ in v:
                    values += '%s '%(le.inverse_transform(v_)[0])
            else:
                for v_ in sorted(v, key=lambda x: x[0]):
                    if len(v_) > 1:
                        values += '%s ~ %s'%(v_[0], v_[1])
                    else:
                        values += '%s '%(v_[0])
            print ('%s:%s'%(k, values))
        #print(f'Description of the region:  {d.filters}')
        print(f'Region instances: {d.data_idx}')
        print(f'Region size:    {d.size}')
        print(f'Efect size: {d.effect_size}')
        print(f'Log Loss:    {d.metric}')
        print('\n')
        
        if d.size > 10:
            aux_1.append(d.size)
            aux_2.append(d.effect_size)


Slice description:
poutcome:success 
Region instances: Index([41052, 40554, 40959, 45150, 43014, 24435, 37186, 43618, 44462, 28545,
       ...
       42739, 43252, 42197, 26865, 34202, 41160, 43016, 43703, 40438, 41939],
      dtype='int64', length=1205)
Region size:    1205
Efect size: 0.6040684598146447
Log Loss:    0.5973693648026027



Slice description:
month:oct 
Region instances: Index([44955, 42033, 45071, 42032, 24062, 41685, 45034, 24089, 41879, 41773,
       ...
       45121, 41672, 41872, 41850, 41661, 41997, 41863, 42067, 42011, 41939],
      dtype='int64', length=592)
Region size:    592
Efect size: 0.5693645558685498
Log Loss:    0.6505727996402865



Slice description:
month:sep 
Region instances: Index([41440, 41422, 41515, 41634, 44687, 41377, 41420, 41563, 44812, 41561,
       ...
       41456, 41626, 41592, 44743, 41379, 44653, 41427, 41623, 44890, 44885],
      dtype='int64', length=451)
Region size:    451
Efect size: 0.5199535546983456
Log Loss:    0.60863908410

In [43]:
from collections import Counter
import ast

dicionario = {}

for d in data:
    if d.size > 10:
        dicionario[f'{d.filters}'] = []
        for i in d.data_idx:
            dicionario[f'{d.filters}'].append(i)

feature_cont = Counter()

for feature in dicionario.keys():
    key_dict = ast.literal_eval(feature)
    for feature in key_dict.keys():
        feature_cont[feature] += 1
        
print(feature_cont)

Counter({'duration': 79, 'pdays': 25, 'age': 23, 'balance': 13, 'month': 4, 'poutcome': 1, 'previous': 1})


In [None]:
df_bank, X_train, X_val, y_train, y_val = import_bank()

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

bank_train.groupby('balance').describe()

In [45]:
df_bank, X_train, X_val, y_train, y_val = import_bank()

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

bank_train.groupby('duration').describe()

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,balance,balance,...,previous,previous,Target,Target,Target,Target,Target,Target,Target,Target
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
duration,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,2.0,44.000000,12.727922,35.0,39.50,44.0,48.50,53.0,2.0,2943.000000,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,33.500000,7.778175,28.0,30.75,33.5,36.25,39.0,2.0,9518.000000,...,3.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,36.000000,,36.0,36.00,36.0,36.00,36.0,1.0,871.000000,...,0.0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
3,4.0,34.000000,7.393691,25.0,31.00,34.0,37.00,43.0,4.0,83.750000,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7.0,33.285714,10.339476,22.0,26.50,35.0,35.00,53.0,7.0,307.428571,...,1.0,2.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3366,1.0,45.000000,,45.0,45.00,45.0,45.00,45.0,1.0,66.000000,...,0.0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
3422,1.0,37.000000,,37.0,37.00,37.0,37.00,37.0,1.0,359.000000,...,0.0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
3785,1.0,45.000000,,45.0,45.00,45.0,45.00,45.0,1.0,955.000000,...,0.0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
3881,1.0,59.000000,,59.0,59.00,59.0,59.00,59.0,1.0,1321.000000,...,0.0,0.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0


In [44]:
df_bank, X_train, X_val, y_train, y_val = import_bank()

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])
bank_val = pd.concat([X_val, y_val], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

bank_train['duration_interval'] = pd.cut(bank_train['duration'], bins=15, labels=False)

bank_train['balance_interval'] = pd.cut(bank_train['balance'], bins=15, labels=False)

bank_train['pdays_interval'] = pd.cut(bank_train['pdays'], bins=2, labels=False)
bank_train.drop(columns=["pdays"], inplace=True)

bank_val['duration_interval'] = pd.cut(bank_val['duration'], bins=15, labels=False)

bank_val['balance_interval'] = pd.cut(bank_val['balance'], bins=15, labels=False)

bank_val['pdays_interval'] = pd.cut(bank_val['pdays'], bins=2, labels=False)
bank_val.drop(columns=["pdays"], inplace=True)

y_train = bank_train['Target']
X_train = bank_train.drop(columns='Target')

y_val = bank_val['Target']
X_val = bank_val.drop(columns='Target')

scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_train, y_train)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_train, y_train, cv=skf, scoring=scoring)


y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

print("Time to train the model:", execution_time_model, "s")

print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 9.365793943405151 s

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.90 (+/- 0.00)
Recall: 0.42 (+/- 0.02)
F1 Score: 0.50 (+/- 0.02)
Log-loss: 0.22 (+/- 0.01)

Results on the validation set:
Accuracy: 0.90
Log-loss: 0.23

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.93      0.96      0.95      7985
           1       0.62      0.44      0.52      1058

    accuracy                           0.90      9043
   macro avg       0.78      0.70      0.73      9043
weighted avg       0.89      0.90      0.90      9043



In [46]:
df_bank = pd.read_csv(
    "sets/bank/bank-full.csv",
        names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"],
        sep=r';',
        engine='python',
        na_values="?")

df_bank = df_bank.dropna()

# Encode categorical features
encoders = {}
for column in df_bank.columns:
    if df_bank.dtypes[column] == np.object_:
        le = LabelEncoder()
        df_bank[column] = le.fit_transform(df_bank[column])
        encoders[column] = le
        #print(column, le.classes_, le.transform(le.classes_))

X, y = df_bank[df_bank.columns.difference(["Target"])], df_bank["Target"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])
bank_val = pd.concat([X_val, y_val], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

bank_train['duration_interval'] = pd.cut(bank_train['duration'], bins=15, labels=False)

bank_train['balance_interval'] = pd.cut(bank_train['balance'], bins=15, labels=False)

bank_train['pdays_interval'] = pd.cut(bank_train['pdays'], bins=2, labels=False)
bank_train.drop(columns=["pdays"], inplace=True)

bank_val['duration_interval'] = pd.cut(bank_val['duration'], bins=15, labels=False)

bank_val['balance_interval'] = pd.cut(bank_val['balance'], bins=15, labels=False)

bank_val['pdays_interval'] = pd.cut(bank_val['pdays'], bins=2, labels=False)
bank_val.drop(columns=["pdays"], inplace=True)

y_train = bank_train['Target']
X_train = bank_train.drop(columns='Target')

y_val = bank_val['Target']
X_val = bank_val.drop(columns='Target')

In [None]:
lr = RandomForestClassifier(max_depth=5, n_estimators=10)
lr.fit(X_train, y_train)

sf = SliceFinder(lr, (X_train, y_train))
metrics_all = sf.evaluate_model((X_train,y_train))
reference = (np.mean(metrics_all), np.std(metrics_all), len(metrics_all))

name = 'Bank_corrigido.p'
recommendations = sf.find_slice(k=100, epsilon=0.4, degree=2, max_workers=4, name=name)

for s in recommendations:
    print ('\n=====================\nSlice description:')
    for k, v in list(s.filters.items()):
        values = ''
        if k in encoders:
            le = encoders[k]
            for v_ in v:
                values += '%s '%(le.inverse_transform(v_)[0])
        else:
            for v_ in sorted(v, key=lambda x: x[0]):
                if len(v_) > 1:
                    values += '%s ~ %s'%(v_[0], v_[1])
                else:
                    values += '%s '%(v_[0])
        print ('%s:%s'%(k, values))
    print ('---------------------\neffect_size: %s'%(s.effect_size))
    print ('---------------------\nmetric: %s'%(s.metric))
    print ('size: %s'%(s.size))

In [47]:
with open('Slices/Bank_corrigido.p', 'rb') as file:
    data = pickle.load(file)
    
    name = []
    aux_1 = []
    aux_2 = []
    
    data = sorted(data, key=lambda d: d.size, reverse=True)
    for d in data:
        print ('\n=====================\nSlice description:')
        for k, v in list(d.filters.items()):
            values = ''
            if k in encoders:
                le = encoders[k]
                for v_ in v:
                    values += '%s '%(le.inverse_transform(v_)[0])
            else:
                for v_ in sorted(v, key=lambda x: x[0]):
                    if len(v_) > 1:
                        values += '%s ~ %s'%(v_[0], v_[1])
                    else:
                        values += '%s '%(v_[0])
            print ('%s:%s'%(k, values))
        #print(f'Description of the region:  {d.filters}')
        print(f'Region instances: {d.data_idx}')
        print(f'Region size:    {d.size}')
        print(f'Efect size: {d.effect_size}')
        print(f'Log Loss:    {d.metric}')
        print('\n')
        
        if d.size > 10:
            aux_1.append(d.size)
            aux_2.append(d.effect_size)


Slice description:
duration_interval:2 
Region instances: Index([43409, 31752, 18161,  6748, 43128,  4069, 39459,  7457,  7543, 43173,
       ...
       29697, 14870, 15713, 29866, 41863, 37451, 15968, 37668, 16712,  3851],
      dtype='int64', length=1591)
Region size:    1591
Efect size: 0.7489604964494344
Log Loss:    0.6625241942155328



Slice description:
poutcome:success 
Region instances: Index([41052, 40554, 40959, 45150, 43014, 24435, 37186, 43618, 44462, 28545,
       ...
       42739, 43252, 42197, 26865, 34202, 41160, 43016, 43703, 40438, 41939],
      dtype='int64', length=1205)
Region size:    1205
Efect size: 0.756020969473333
Log Loss:    0.731487072887041



Slice description:
month:oct 
Region instances: Index([44955, 42033, 45071, 42032, 24062, 41685, 45034, 24089, 41879, 41773,
       ...
       45121, 41672, 41872, 41850, 41661, 41997, 41863, 42067, 42011, 41939],
      dtype='int64', length=592)
Region size:    592
Efect size: 0.5817356696789396
Log Loss:    0.6

In [48]:
from collections import Counter
import ast

dicionario = {}

for d in data:
    if d.size > 10:
        dicionario[f'{d.filters}'] = []
        for i in d.data_idx:
            dicionario[f'{d.filters}'].append(i)

feature_cont = Counter()

for feature in dicionario.keys():
    key_dict = ast.literal_eval(feature)
    for feature in key_dict.keys():
        feature_cont[feature] += 1
        
print(feature_cont)

Counter({'duration': 86, 'age': 24, 'balance': 20, 'duration_interval': 5, 'month': 4, 'poutcome': 1, 'pdays_interval': 1, 'previous': 1})


In [49]:
bank_val = pd.concat([X_val, y_val], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])
region = bank_val.query('duration_interval==2')
region.groupby('Target').describe()

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,balance,balance,...,balance_interval,balance_interval,pdays_interval,pdays_interval,pdays_interval,pdays_interval,pdays_interval,pdays_interval,pdays_interval,pdays_interval
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,527.0,40.11575,10.450493,19.0,32.0,38.0,47.0,80.0,527.0,1387.358634,...,0.0,4.0,527.0,0.003795,0.061546,0.0,0.0,0.0,0.0,1.0
1,215.0,42.488372,13.345117,18.0,33.0,39.0,51.0,93.0,215.0,1774.795349,...,0.0,3.0,215.0,0.018605,0.135439,0.0,0.0,0.0,0.0,1.0


In [50]:
# Define as métricas de avaliação
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

region = 'duration_interval==2'

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)
y_region = y_train[X_region.index]

X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)

rf.fit(X_SMOTE_data, y_SMOTE_data) 

y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

print("Size of new dataset:", len(X_SMOTE_data))
print("Time to train the model:", execution_time_model, "s")

print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Size of new dataset: 36261
Time to train the model: 9.068004608154297 s

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.90 (+/- 0.00)
Recall: 0.43 (+/- 0.02)
F1 Score: 0.52 (+/- 0.02)
Log-loss: 0.23 (+/- 0.01)

Results on the validation set:
Accuracy: 0.90
Log-loss: 0.24

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7985
           1       0.63      0.44      0.52      1058

    accuracy                           0.90      9043
   macro avg       0.78      0.70      0.73      9043
weighted avg       0.89      0.90      0.90      9043



In [51]:
bank_val = pd.concat([X_val, y_val], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])
region = bank_val.query('duration_interval==1')
region.groupby('Target').describe()

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,balance,balance,...,balance_interval,balance_interval,pdays_interval,pdays_interval,pdays_interval,pdays_interval,pdays_interval,pdays_interval,pdays_interval,pdays_interval
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,2016.0,40.556052,10.167424,18.0,32.0,38.0,48.0,86.0,2016.0,1283.688988,...,0.0,5.0,2016.0,0.002976,0.054487,0.0,0.0,0.0,0.0,1.0
1,326.0,42.113497,14.194127,19.0,31.0,38.0,52.0,92.0,326.0,1742.242331,...,0.0,4.0,326.0,0.030675,0.1727,0.0,0.0,0.0,0.0,1.0


In [52]:
df_bank = pd.read_csv(
    "sets/bank/bank-full.csv",
        names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"],
        sep=r';',
        engine='python',
        na_values="?")

df_bank = df_bank.dropna()

# Encode categorical features
encoders = {}
for column in df_bank.columns:
    if df_bank.dtypes[column] == np.object_:
        le = LabelEncoder()
        df_bank[column] = le.fit_transform(df_bank[column])
        encoders[column] = le
        #print(column, le.classes_, le.transform(le.classes_))

X, y = df_bank[df_bank.columns.difference(["Target"])], df_bank["Target"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

bank_train = pd.concat([X_train, y_train], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])
bank_val = pd.concat([X_val, y_val], axis=1, names=["age","job","marital","education","default","balance","housing","loan","contact","day",
        "month","duration","campaign","pdays","previous","poutcome","Target"])

bank_train['duration_interval'] = pd.cut(bank_train['duration'], bins=15, labels=False)

bank_train['balance_interval'] = pd.cut(bank_train['balance'], bins=15, labels=False)

bank_train['pdays_interval'] = pd.cut(bank_train['pdays'], bins=2, labels=False)
bank_train.drop(columns=["pdays"], inplace=True)

bank_val['duration_interval'] = pd.cut(bank_val['duration'], bins=15, labels=False)

bank_val['balance_interval'] = pd.cut(bank_val['balance'], bins=15, labels=False)

bank_val['pdays_interval'] = pd.cut(bank_val['pdays'], bins=2, labels=False)
bank_val.drop(columns=["pdays"], inplace=True)

y_train = bank_train['Target']
X_train = bank_train.drop(columns='Target')

y_val = bank_val['Target']
X_val = bank_val.drop(columns='Target')

scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

region = 'duration_interval==1'

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)  
y_region = y_train[X_region.index]

X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)

rf.fit(X_SMOTE_data, y_SMOTE_data) 

y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

print("Size of new dataset:", len(X_SMOTE_data))
print("Time to train the model:", execution_time_model, "s")

print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Size of new dataset: 39706
Time to train the model: 10.687808752059937 s

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.91 (+/- 0.00)
Recall: 0.71 (+/- 0.01)
F1 Score: 0.75 (+/- 0.01)
Log-loss: 0.21 (+/- 0.01)

Results on the validation set:
Accuracy: 0.90
Log-loss: 0.25

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      7985
           1       0.58      0.50      0.54      1058

    accuracy                           0.90      9043
   macro avg       0.76      0.73      0.74      9043
weighted avg       0.89      0.90      0.90      9043

