In [1]:
import pandas as pd
import numpy as np
import pickle
from utils import *
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, log_loss
from sklearn.model_selection import StratifiedKFold, cross_validate
from slice_finder import SliceFinder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_churn = pd.read_csv(
    "sets/churn/Churn_Modelling.csv",
        names=['RowNumber','CustomerId','Surname','CreditScore','Geography','Gender','Age',
        'Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember',
        'EstimatedSalary','Target'],
        sep=r'\s*,\s*',
        engine='python',
        na_values="?")

df_churn = df_churn.dropna()

# Encode categorical features
encoders = {}
for column in df_churn.columns:
    if df_churn.dtypes[column] == np.object_:
        le = LabelEncoder()
        df_churn[column] = le.fit_transform(df_churn[column])
        encoders[column] = le
        print(column, le.classes_, le.transform(le.classes_))

X, y = df_churn[df_churn.columns.difference(["Target"])], df_churn["Target"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

Surname ['Abazu' 'Abbie' 'Abbott' ... 'Zuev' 'Zuyev' 'Zuyeva'] [   0    1    2 ... 2929 2930 2931]
Geography ['France' 'Germany' 'Spain'] [0 1 2]
Gender ['Female' 'Male'] [0 1]


In [6]:
lr = RandomForestClassifier(max_depth=5, n_estimators=10)
lr.fit(X_train, y_train)

sf = SliceFinder(lr, (X_train, y_train))
metrics_all = sf.evaluate_model((X_train,y_train))
reference = (np.mean(metrics_all), np.std(metrics_all), len(metrics_all))

In [7]:
name = 'Churn.p'
recommendations = sf.find_slice(k=100, epsilon=0.4, degree=2, max_workers=4, name=name)

for s in recommendations:
    print ('\n=====================\nSlice description:')
    for k, v in list(s.filters.items()):
        values = ''
        if k in encoders:
            le = encoders[k]
            for v_ in v:
                values += '%s '%(le.inverse_transform(v_)[0])
        else:
            for v_ in sorted(v, key=lambda x: x[0]):
                if len(v_) > 1:
                    values += '%s ~ %s'%(v_[0], v_[1])
                else:
                    values += '%s '%(v_[0])
        print ('%s:%s'%(k, values))
    print ('---------------------\neffect_size: %s'%(s.effect_size))
    print ('---------------------\nmetric: %s'%(s.metric))
    print ('size: %s'%(s.size))

degree 1
crossing
effect size filtering
sorting

Slice description:
Age:50 
---------------------
effect_size: 0.41990960303341307
---------------------
metric: 0.6423518090685357
size: 105

Slice description:
Age:52 
---------------------
effect_size: 0.43080999572390316
---------------------
metric: 0.6459659994167303
size: 80

Slice description:
Surname:Meng 
---------------------
effect_size: 0.4371266748534072
---------------------
metric: 0.7163756670148079
size: 11

Slice description:
Surname:Sal 
---------------------
effect_size: 0.49263382241740744
---------------------
metric: 0.7490105282791089
size: 11

Slice description:
CreditScore:791 
---------------------
effect_size: 0.6727226167143298
---------------------
metric: 0.9179052514102425
size: 10

Slice description:
Surname:Shao 
---------------------
effect_size: 0.41120433179470733
---------------------
metric: 0.7147139927928258
size: 10

Slice description:
CreditScore:566 
---------------------
effect_size: 0.4125641

## Viewing regions identified for the Churn set

In [8]:
with open('./Slices/Churn.p', 'rb') as file:
    data = pickle.load(file)
    
    data = sorted(data, key=lambda d: d.size, reverse=True)
    for d in data:
        print(f'Description of the region:  {d.filters}')
        print(f'Region instances: {d.data_idx}')
        print(f'Region size:    {d.size}')
        print(f'Efect size: {d.effect_size}')
        print(f'Log Loss:    {d.metric}')
        print('\n')

Description of the region:  {'Age': [[np.int64(50)]]}
Region instances: Index([8850, 6295, 8833, 6621, 7662, 6968, 7018, 8299, 3838, 6982,
       ...
       7595, 2586, 9323, 1053, 2509, 4447, 4304, 3240, 6511, 4919],
      dtype='int64', length=105)
Region size:    105
Efect size: 0.41990960303341307
Log Loss:    0.6423518090685357


Description of the region:  {'Age': [[np.int64(52)]]}
Region instances: Index([2434, 5369, 1962, 9454, 1354, 8801, 5259, 4203, 6081, 2813, 9395, 2879,
       1947,  126, 6021, 1071, 5606, 2087, 6242, 7565, 1551, 6052, 4827, 6972,
       4182, 3930, 4127, 9207, 9106,  800, 6298, 9577, 1612, 7800, 9534, 4311,
       8755, 4523, 9078, 2524, 2898, 7949, 8520, 1066, 2510, 9498, 1338, 1520,
       8958, 9206, 4037,  936, 9692, 2872, 1005, 6308, 3637, 8422, 2915, 1664,
       1755, 9466, 7190,  400, 5659, 2745, 1494, 7803, 4579, 2414, 4477, 3683,
       8918, 6274, 6857, 1851, 2668, 9262, 6099, 1683],
      dtype='int64')
Region size:    80
Efect size: 0.4308099

In [9]:
churn_train = pd.concat([X_train, y_train], axis=1)
churn_train.groupby('Target').describe()

Unnamed: 0_level_0,Age,Age,Age,Age,Age,Age,Age,Age,Balance,Balance,...,Surname,Surname,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,6370.0,37.453689,10.175943,18.0,31.0,36.0,41.0,92.0,6370.0,72554.143826,...,2251.0,2931.0,6370.0,5.020094,2.879902,0.0,3.0,5.0,7.75,10.0
1,1630.0,44.780982,9.824076,18.0,38.0,45.0,51.0,84.0,1630.0,91337.278417,...,2195.75,2930.0,1630.0,5.003067,2.96273,0.0,2.0,5.0,8.0,10.0


# Evaluating performance

In [4]:
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

start_time_model = time.time()
rf_optimized = model_train(X_train, y_train)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf_optimized, X_train, y_train, cv=skf, scoring=scoring)

rf_optimized.fit(X_train, y_train)

y_val_pred = rf_optimized.predict(X_val)
y_val_proba = rf_optimized.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)


print("Time to train the model:", execution_time_model, "s")

print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))


Time to train the model: 213.29904055595398 s

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.86 (+/- 0.01)
Recall: 0.43 (+/- 0.04)
F1 Score: 0.55 (+/- 0.04)
Log-loss: 0.35 (+/- 0.01)

Results on the validation set:
Accuracy: 0.86
Log-loss: 0.35

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.79      0.45      0.57       407

    accuracy                           0.86      2000
   macro avg       0.83      0.71      0.75      2000
weighted avg       0.86      0.86      0.85      2000



## Applying SMOTE to the entire set

In [11]:
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

smote = SMOTE(sampling_strategy='auto', random_state=42)

y_slice = y_train
X_slice = X_train

X_slice_smote, y_slice_smote = smote.fit_resample(X_slice, y_slice)
region_SMOTE = (pd.concat([pd.DataFrame(X_slice_smote, columns=X_slice.columns), pd.Series(y_slice_smote, name='Target')], axis=1))

y_SMOTE_data = region_SMOTE['Target']
X_SMOTE_data = region_SMOTE.drop(columns='Target')

start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)


y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

print("Size of new dataset:", len(X_SMOTE_data))
print("Time to train the model:", execution_time_model, "s")

print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Size of new dataset: 12740
Time to train the model: 3.3344204425811768 s

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.87 (+/- 0.01)
Recall: 0.86 (+/- 0.01)
F1 Score: 0.87 (+/- 0.01)
Log-loss: 0.34 (+/- 0.02)

Results on the validation set:
Accuracy: 0.83
Log-loss: 0.43

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      1593
           1       0.57      0.61      0.59       407

    accuracy                           0.83      2000
   macro avg       0.73      0.75      0.74      2000
weighted avg       0.83      0.83      0.83      2000



### Data augmentation guided by problem regions

In [12]:
churn_train = pd.concat([X_train, y_train], axis=1)
region = churn_train.query('Age==50')
region.groupby('Target').describe()

Unnamed: 0_level_0,Age,Age,Age,Age,Age,Age,Age,Age,Balance,Balance,...,Surname,Surname,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,47.0,50.0,0.0,50.0,50.0,50.0,50.0,50.0,47.0,87437.011277,...,2030.5,2907.0,47.0,5.319149,2.695458,0.0,3.0,6.0,7.0,10.0
1,58.0,50.0,0.0,50.0,50.0,50.0,50.0,50.0,58.0,94798.973966,...,2017.25,2845.0,58.0,4.741379,2.929345,0.0,2.25,4.5,7.0,10.0


In [13]:
# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region
region = 'Age==50'

# Apply SMOTE in the region of interest
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)  # Select only samples from the region
y_region = y_train[X_region.index]

# Apply SMOTE to the selected region
X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

# Combine the original dataset with the balanced one to form the new training set
X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

# Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the new dataset:", len(X_SMOTE_data))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 1.7827081680297852 seconds
Size of the new dataset: 8011

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.86 (+/- 0.01)
Recall: 0.42 (+/- 0.04)
F1 Score: 0.54 (+/- 0.04)
Log-loss: 0.37 (+/- 0.02)

Results on the validation set:
Accuracy: 0.87
Log-loss: 0.35

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.81      0.45      0.58       407

    accuracy                           0.87      2000
   macro avg       0.84      0.71      0.75      2000
weighted avg       0.86      0.87      0.85      2000



In [14]:
churn_train = pd.concat([X_train, y_train], axis=1)
region = churn_train.query('Age==52')
region.groupby('Target').describe()

Unnamed: 0_level_0,Age,Age,Age,Age,Age,Age,Age,Age,Balance,Balance,...,Surname,Surname,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,29.0,52.0,0.0,52.0,52.0,52.0,52.0,52.0,29.0,68915.074828,...,2173.0,2907.0,29.0,5.0,2.725541,1.0,2.0,5.0,7.0,9.0
1,51.0,52.0,0.0,52.0,52.0,52.0,52.0,52.0,51.0,79001.897059,...,2229.5,2899.0,51.0,5.098039,3.061078,0.0,2.0,5.0,8.0,10.0


In [15]:
# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region
region = 'Age==52'

# Apply SMOTE in the region of interest
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)  # Select only samples from the region
y_region = y_train[X_region.index]

# Apply SMOTE to the selected region
X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

# Combine the original dataset with the balanced one to form the new training set
X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

# Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the new dataset:", len(X_SMOTE_data))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 1.910459041595459 seconds
Size of the new dataset: 8022

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.85 (+/- 0.01)
Recall: 0.41 (+/- 0.04)
F1 Score: 0.54 (+/- 0.03)
Log-loss: 0.36 (+/- 0.02)

Results on the validation set:
Accuracy: 0.87
Log-loss: 0.35

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.81      0.44      0.57       407

    accuracy                           0.87      2000
   macro avg       0.84      0.71      0.75      2000
weighted avg       0.86      0.87      0.85      2000



# Case Study

In [18]:
with open('Slices/Churn.p', 'rb') as file:
    data = pickle.load(file)
    
    name = []
    aux_1 = []
    aux_2 = []
    
    data = sorted(data, key=lambda d: d.size, reverse=True)
    for d in data:
        print ('\n=====================\nSlice description:')
        for k, v in list(d.filters.items()):
            values = ''
            if k in encoders:
                le = encoders[k]
                for v_ in v:
                    values += '%s '%(le.inverse_transform(v_)[0])
            else:
                for v_ in sorted(v, key=lambda x: x[0]):
                    if len(v_) > 1:
                        values += '%s ~ %s'%(v_[0], v_[1])
                    else:
                        values += '%s '%(v_[0])
            print ('%s:%s'%(k, values))
        #print(f'Description of the region:  {d.filters}')
        print(f'Region instances: {d.data_idx}')
        print(f'Region size:    {d.size}')
        print(f'Efect size: {d.effect_size}')
        print(f'Log Loss:    {d.metric}')
        print('\n')
        
        if d.size > 10:
            aux_1.append(d.size)
            aux_2.append(d.effect_size)


Slice description:
Age:50 
Region instances: Index([8850, 6295, 8833, 6621, 7662, 6968, 7018, 8299, 3838, 6982,
       ...
       7595, 2586, 9323, 1053, 2509, 4447, 4304, 3240, 6511, 4919],
      dtype='int64', length=105)
Region size:    105
Efect size: 0.41990960303341307
Log Loss:    0.6423518090685357



Slice description:
Age:52 
Region instances: Index([2434, 5369, 1962, 9454, 1354, 8801, 5259, 4203, 6081, 2813, 9395, 2879,
       1947,  126, 6021, 1071, 5606, 2087, 6242, 7565, 1551, 6052, 4827, 6972,
       4182, 3930, 4127, 9207, 9106,  800, 6298, 9577, 1612, 7800, 9534, 4311,
       8755, 4523, 9078, 2524, 2898, 7949, 8520, 1066, 2510, 9498, 1338, 1520,
       8958, 9206, 4037,  936, 9692, 2872, 1005, 6308, 3637, 8422, 2915, 1664,
       1755, 9466, 7190,  400, 5659, 2745, 1494, 7803, 4579, 2414, 4477, 3683,
       8918, 6274, 6857, 1851, 2668, 9262, 6099, 1683],
      dtype='int64')
Region size:    80
Efect size: 0.43080999572390316
Log Loss:    0.6459659994167303



Slice 

In [None]:
import json
from collections import Counter

# Função para converter tipos NumPy para tipos Python nativos
def convert_types(obj):
    if isinstance(obj, dict):
        return {k: convert_types(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_types(i) for i in obj]
    elif isinstance(obj, tuple):
        return tuple(convert_types(i) for i in obj)
    elif hasattr(obj, 'item'):  # np.int64, np.float64 etc.
        return obj.item()
    else:
        return obj

dicionario = {}

for d in data:
    if d.size > 1:
        filters_clean = convert_types(d.filters)
        filters_str = json.dumps(filters_clean, sort_keys=True)
        dicionario[filters_str] = []
        for i in d.data_idx:
            dicionario[filters_str].append(i)

feature_cont = Counter()

for feature in dicionario.keys():
    key_dict = json.loads(feature)
    for feature in key_dict.keys():
        feature_cont[feature] += 1

print(feature_cont)



Counter({'Surname': 159, 'CreditScore': 18, 'Age': 2})


In [26]:
df_churn = pd.read_csv(
    "sets/churn/Churn_Modelling.csv",
        names=['RowNumber','CustomerId','Surname','CreditScore','Geography','Gender','Age',
        'Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember',
        'EstimatedSalary','Target'],
        sep=r'\s*,\s*',
        engine='python',
        na_values="?")

df_churn = df_churn.dropna()
df_churn.drop(columns=["Surname"], inplace=True)

# Encode categorical features
encoders = {}
for column in df_churn.columns:
    if df_churn.dtypes[column] == np.object_:
        le = LabelEncoder()
        df_churn[column] = le.fit_transform(df_churn[column])
        encoders[column] = le
        print(column, le.classes_, le.transform(le.classes_))

X, y = df_churn[df_churn.columns.difference(["Target"])], df_churn["Target"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train

Geography ['France' 'Germany' 'Spain'] [0 1 2]
Gender ['Female' 'Male'] [0 1]


Unnamed: 0,Age,Balance,CreditScore,CustomerId,EstimatedSalary,Gender,Geography,HasCrCard,IsActiveMember,NumOfProducts,RowNumber,Tenure
2151,57,0.00,753,15588614,159475.08,1,0,1,0,1,2152,7
8392,32,102128.27,739,15641158,63981.37,1,1,1,0,1,8393,3
5006,37,113865.23,755,15750014,117396.25,0,1,1,1,2,5007,0
4117,37,0.00,561,15663964,83093.25,1,0,1,0,2,4118,5
7182,49,110540.43,692,15573520,107472.99,1,1,0,1,2,7183,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4555,35,0.00,688,15733261,25488.43,0,2,1,0,1,4556,6
4644,74,0.00,712,15714240,151425.82,1,2,0,0,2,4645,5
8942,37,71786.90,667,15709780,67734.79,0,0,1,1,2,8943,9
2935,35,100988.39,687,15689425,22247.27,1,2,1,0,2,2936,8


In [27]:
lr = RandomForestClassifier(max_depth=5, n_estimators=10)
lr.fit(X_train, y_train)

sf = SliceFinder(lr, (X_train, y_train))
metrics_all = sf.evaluate_model((X_train,y_train))
reference = (np.mean(metrics_all), np.std(metrics_all), len(metrics_all))

name = 'Churn_corrigido.p'
recommendations = sf.find_slice(k=100, epsilon=0.4, degree=2, max_workers=4, name=name)

for s in recommendations:
    print ('\n=====================\nSlice description:')
    for k, v in list(s.filters.items()):
        values = ''
        if k in encoders:
            le = encoders[k]
            for v_ in v:
                values += '%s '%(le.inverse_transform(v_)[0])
        else:
            for v_ in sorted(v, key=lambda x: x[0]):
                if len(v_) > 1:
                    values += '%s ~ %s'%(v_[0], v_[1])
                else:
                    values += '%s '%(v_[0])
        print ('%s:%s'%(k, values))
    print ('---------------------\neffect_size: %s'%(s.effect_size))
    print ('---------------------\nmetric: %s'%(s.metric))
    print ('size: %s'%(s.size))

degree 1
crossing
effect size filtering
degree 2
crossing
effect size filtering
sorting

Slice description:
Geography:Germany 
NumOfProducts:1 
---------------------
effect_size: 0.40166146287754223
---------------------
metric: 0.6272282147937341
size: 1086

Slice description:
Age:43 
NumOfProducts:1 
---------------------
effect_size: 0.40604350217433166
---------------------
metric: 0.6636631932970292
size: 117

Slice description:
Age:50 
---------------------
effect_size: 0.43674035683484497
---------------------
metric: 0.6533869423934303
size: 105

Slice description:
Age:46 
NumOfProducts:1 
---------------------
effect_size: 0.4813318448898465
---------------------
metric: 0.6791167049903122
size: 98

Slice description:
Age:45 
NumOfProducts:1 
---------------------
effect_size: 0.4885181094514034
---------------------
metric: 0.6855781407164018
size: 91

Slice description:
Age:47 
NumOfProducts:1 
---------------------
effect_size: 0.5402965818983517
---------------------
metri

In [28]:
with open('Slices/Churn_corrigido.p', 'rb') as file:
    data = pickle.load(file)
    
    name = []
    aux_1 = []
    aux_2 = []
    
    data = sorted(data, key=lambda d: d.size, reverse=True)
    for d in data:
        print ('\n=====================\nSlice description:')
        for k, v in list(d.filters.items()):
            values = ''
            if k in encoders:
                le = encoders[k]
                for v_ in v:
                    values += '%s '%(le.inverse_transform(v_)[0])
            else:
                for v_ in sorted(v, key=lambda x: x[0]):
                    if len(v_) > 1:
                        values += '%s ~ %s'%(v_[0], v_[1])
                    else:
                        values += '%s '%(v_[0])
            print ('%s:%s'%(k, values))
        #print(f'Description of the region:  {d.filters}')
        print(f'Region instances: {d.data_idx}')
        print(f'Region size:    {d.size}')
        print(f'Efect size: {d.effect_size}')
        print(f'Log Loss:    {d.metric}')
        print('\n')
        
        if d.size > 10:
            aux_1.append(d.size)
            aux_2.append(d.effect_size)


Slice description:
Geography:Germany 
NumOfProducts:1 
Region instances: Index([8392, 8513, 4001, 2476, 8715, 3121, 8260, 7633, 4886, 4473,
       ...
       6650, 9017, 3595, 4274, 6038, 8577, 4771, 5704,  401,  127],
      dtype='int64', length=1086)
Region size:    1086
Efect size: 0.40166146287754223
Log Loss:    0.6272282147937341



Slice description:
Age:43 
NumOfProducts:1 
Region instances: Index([1507, 4075, 5944, 1442,   28, 7577, 6190, 8726, 2007, 8986,
       ...
          4, 4525, 5519, 3463, 3117, 7809, 8853, 3540, 1998, 3915],
      dtype='int64', length=117)
Region size:    117
Efect size: 0.40604350217433166
Log Loss:    0.6636631932970292



Slice description:
Age:50 
Region instances: Index([8850, 6295, 8833, 6621, 7662, 6968, 7018, 8299, 3838, 6982,
       ...
       7595, 2586, 9323, 1053, 2509, 4447, 4304, 3240, 6511, 4919],
      dtype='int64', length=105)
Region size:    105
Efect size: 0.43674035683484497
Log Loss:    0.6533869423934303



Slice description:


In [31]:
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_train, y_train)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_train, y_train, cv=skf, scoring=scoring)


y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)


print("Time to train the model:", execution_time_model, "s")

print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 1.9301912784576416 s

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.85 (+/- 0.01)
Recall: 0.42 (+/- 0.03)
F1 Score: 0.54 (+/- 0.03)
Log-loss: 0.36 (+/- 0.03)

Results on the validation set:
Accuracy: 0.87
Log-loss: 0.36

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1593
           1       0.80      0.46      0.58       407

    accuracy                           0.87      2000
   macro avg       0.84      0.71      0.75      2000
weighted avg       0.86      0.87      0.85      2000



84, 71, 75
Geography ['France' 'Germany' 'Spain'] [0 1 2]
Gender ['Female' 'Male'] [0 1]

In [30]:
churn_train = pd.concat([X_train, y_train], axis=1)
region = churn_train.query('Geography==1 and NumOfProducts==1')
region.groupby('Target').describe()

Unnamed: 0_level_0,Age,Age,Age,Age,Age,Age,Age,Age,Balance,Balance,...,RowNumber,RowNumber,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,616.0,37.589286,10.969958,18.0,30.75,36.0,41.0,84.0,616.0,119328.733539,...,7497.0,9991.0,616.0,4.975649,2.870833,0.0,3.0,5.0,7.0,10.0
1,470.0,44.795745,9.577246,20.0,38.0,45.0,51.0,72.0,470.0,120489.548085,...,7491.25,9982.0,470.0,4.987234,3.032139,0.0,2.0,5.0,8.0,10.0


In [32]:
# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region
region = 'Geography==1 and NumOfProducts==1'

# Apply SMOTE in the region of interest
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)  # Select only samples from the region
y_region = y_train[X_region.index]

# Apply SMOTE to the selected region
X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

# Combine the original dataset with the balanced one to form the new training set
X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

# Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the new dataset:", len(X_SMOTE_data))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 1.8108258247375488 seconds
Size of the new dataset: 8146

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.86 (+/- 0.01)
Recall: 0.49 (+/- 0.02)
F1 Score: 0.60 (+/- 0.02)
Log-loss: 0.36 (+/- 0.03)

Results on the validation set:
Accuracy: 0.87
Log-loss: 0.35

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1593
           1       0.79      0.47      0.59       407

    accuracy                           0.87      2000
   macro avg       0.83      0.72      0.75      2000
weighted avg       0.86      0.87      0.85      2000



In [33]:
churn_train = pd.concat([X_train, y_train], axis=1)
region = churn_train.query('Age==43 and NumOfProducts==1')
region.groupby('Target').describe()

Unnamed: 0_level_0,Age,Age,Age,Age,Age,Age,Age,Age,Balance,Balance,...,RowNumber,RowNumber,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,72.0,43.0,0.0,43.0,43.0,43.0,43.0,43.0,72.0,110929.760972,...,7685.5,9879.0,72.0,4.583333,2.910641,0.0,2.0,4.5,7.0,10.0
1,45.0,43.0,0.0,43.0,43.0,43.0,43.0,43.0,45.0,104394.175556,...,8536.0,9935.0,45.0,5.177778,3.518063,0.0,2.0,6.0,8.0,10.0


In [34]:
# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region
region = 'Age==43 and NumOfProducts==1'

# Apply SMOTE in the region of interest
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)  # Select only samples from the region
y_region = y_train[X_region.index]

# Apply SMOTE to the selected region
X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

# Combine the original dataset with the balanced one to form the new training set
X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

# Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the new dataset:", len(X_SMOTE_data))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 1.9604246616363525 seconds
Size of the new dataset: 8027

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.86 (+/- 0.01)
Recall: 0.45 (+/- 0.04)
F1 Score: 0.57 (+/- 0.04)
Log-loss: 0.37 (+/- 0.04)

Results on the validation set:
Accuracy: 0.86
Log-loss: 0.38

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.78      0.44      0.56       407

    accuracy                           0.86      2000
   macro avg       0.83      0.70      0.74      2000
weighted avg       0.85      0.86      0.85      2000



In [36]:
import json
from collections import Counter

# Função para converter tipos NumPy para tipos Python nativos
def convert_types(obj):
    if isinstance(obj, dict):
        return {k: convert_types(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_types(i) for i in obj]
    elif isinstance(obj, tuple):
        return tuple(convert_types(i) for i in obj)
    elif hasattr(obj, 'item'):  # np.int64, np.float64 etc.
        return obj.item()
    else:
        return obj

dicionario = {}

for d in data:
    if d.size > 1:
        filters_clean = convert_types(d.filters)
        filters_str = json.dumps(filters_clean, sort_keys=True)
        dicionario[filters_str] = []
        for i in d.data_idx:
            dicionario[filters_str].append(i)

feature_cont = Counter()

for feature in dicionario.keys():
    key_dict = json.loads(feature)
    for feature in key_dict.keys():
        feature_cont[feature] += 1

print(feature_cont)

Counter({'CreditScore': 1615, 'Age': 600, 'EstimatedSalary': 409, 'Tenure': 402, 'Balance': 312, 'Geography': 144, 'NumOfProducts': 104, 'IsActiveMember': 60, 'Gender': 53, 'HasCrCard': 44})


In [37]:
churn_train = pd.concat([X_train, y_train], axis=1)

churn_train.groupby('EstimatedSalary').describe()

Unnamed: 0_level_0,Age,Age,Age,Age,Age,Age,Age,Age,Balance,Balance,...,Tenure,Tenure,Target,Target,Target,Target,Target,Target,Target,Target
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
EstimatedSalary,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
11.58,1.0,45.0,,45.0,45.0,45.0,45.0,45.0,1.0,122917.71,...,4.0,4.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
90.07,1.0,31.0,,31.0,31.0,31.0,31.0,31.0,1.0,0.00,...,5.0,5.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
91.75,1.0,59.0,,59.0,59.0,59.0,59.0,59.0,1.0,121669.93,...,8.0,8.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
106.67,1.0,40.0,,40.0,40.0,40.0,40.0,40.0,1.0,0.00,...,9.0,9.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
123.07,1.0,52.0,,52.0,52.0,52.0,52.0,52.0,1.0,105355.81,...,10.0,10.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199862.75,1.0,40.0,,40.0,40.0,40.0,40.0,40.0,1.0,165298.12,...,3.0,3.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
199909.32,1.0,28.0,,28.0,28.0,28.0,28.0,28.0,1.0,0.00,...,8.0,8.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
199953.33,1.0,27.0,,27.0,27.0,27.0,27.0,27.0,1.0,153325.10,...,4.0,4.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
199970.74,1.0,41.0,,41.0,41.0,41.0,41.0,41.0,1.0,98635.77,...,5.0,5.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0


In [16]:
df_churn = pd.read_csv(
    "sets/churn/Churn_Modelling.csv",
        names=['RowNumber','CustomerId','Surname','CreditScore','Geography','Gender','Age',
        'Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember',
        'EstimatedSalary','Target'],
        sep=r'\s*,\s*',
        engine='python',
        na_values="?")

df_churn = df_churn.dropna()
df_churn.drop(columns=["Surname"], inplace=True)
df_churn['Age_Binned'] = pd.cut(df_churn['Age'], 
                                 bins=[0, 20, 30, 40, 50, 60, 100], labels=['0-20', '21-30', '31-40', '41-50', '51-60', '60+'])
df_churn.drop(columns=["Age"], inplace=True)
df_churn['EstimatedSalary_interval'] = pd.cut(df_churn['EstimatedSalary'], bins=10, labels=False)
df_churn.drop(columns=["EstimatedSalary"], inplace=True)
df_churn['Balance_interval'] = pd.cut(df_churn['Balance'], bins=10, labels=False)
df_churn.drop(columns=["Balance"], inplace=True)

# Encode categorical features
encoders = {}
for column in df_churn.columns:
    if df_churn.dtypes[column] == np.object_ or str(df_churn.dtypes[column]) == 'category':
        le = LabelEncoder()
        df_churn[column] = le.fit_transform(df_churn[column])
        encoders[column] = le
        print(column, le.classes_, le.transform(le.classes_))

X, y = df_churn[df_churn.columns.difference(["Target"])], df_churn["Target"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train

Geography ['France' 'Germany' 'Spain'] [0 1 2]
Gender ['Female' 'Male'] [0 1]
Age_Binned ['0-20' '21-30' '31-40' '41-50' '51-60' '60+'] [0 1 2 3 4 5]


Unnamed: 0,Age_Binned,Balance_interval,CreditScore,CustomerId,EstimatedSalary_interval,Gender,Geography,HasCrCard,IsActiveMember,NumOfProducts,RowNumber,Tenure
2151,4,0,753,15588614,7,1,0,1,0,1,2152,7
8392,2,4,739,15641158,3,1,1,1,0,1,8393,3
5006,2,4,755,15750014,5,0,1,1,1,2,5007,0
4117,2,0,561,15663964,4,1,0,1,0,2,4118,5
7182,3,4,692,15573520,5,1,1,0,1,2,7183,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4555,2,0,688,15733261,1,0,2,1,0,1,4556,6
4644,5,0,712,15714240,7,1,2,0,0,2,4645,5
8942,2,2,667,15709780,3,0,0,1,1,2,8943,9
2935,2,4,687,15689425,1,1,2,1,0,2,2936,8


In [17]:
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_train, y_train)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_train, y_train, cv=skf, scoring=scoring)


y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)


print("Time to train the model:", execution_time_model, "s")

print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 1.3745496273040771 s

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.85 (+/- 0.01)
Recall: 0.42 (+/- 0.04)
F1 Score: 0.53 (+/- 0.04)
Log-loss: 0.37 (+/- 0.02)

Results on the validation set:
Accuracy: 0.86
Log-loss: 0.37

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.80      0.42      0.55       407

    accuracy                           0.86      2000
   macro avg       0.83      0.70      0.74      2000
weighted avg       0.85      0.86      0.84      2000



In [18]:
lr = RandomForestClassifier(max_depth=5, n_estimators=10)
lr.fit(X_train, y_train)

sf = SliceFinder(lr, (X_train, y_train))
metrics_all = sf.evaluate_model((X_train,y_train))
reference = (np.mean(metrics_all), np.std(metrics_all), len(metrics_all))

name = 'Churn_corrigido_2.p'
recommendations = sf.find_slice(k=100, epsilon=0.4, degree=2, max_workers=4, name=name)

for s in recommendations:
    print ('\n=====================\nSlice description:')
    for k, v in list(s.filters.items()):
        values = ''
        if k in encoders:
            le = encoders[k]
            for v_ in v:
                values += '%s '%(le.inverse_transform(v_)[0])
        else:
            for v_ in sorted(v, key=lambda x: x[0]):
                if len(v_) > 1:
                    values += '%s ~ %s'%(v_[0], v_[1])
                else:
                    values += '%s '%(v_[0])
        print ('%s:%s'%(k, values))
    print ('---------------------\neffect_size: %s'%(s.effect_size))
    print ('---------------------\nmetric: %s'%(s.metric))
    print ('size: %s'%(s.size))

degree 1
crossing
effect size filtering
degree 2
crossing
effect size filtering
sorting

Slice description:
Geography:Germany 
NumOfProducts:1 
---------------------
effect_size: 0.41227970759973154
---------------------
metric: 0.6301986961514234
size: 1086

Slice description:
Age_Binned:41-50 
NumOfProducts:1 
---------------------
effect_size: 0.5096285505635453
---------------------
metric: 0.6504469429192972
size: 976

Slice description:
Age_Binned:51-60 
NumOfProducts:1 
---------------------
effect_size: 0.46495218874724414
---------------------
metric: 0.6457507168988317
size: 415

Slice description:
Age_Binned:51-60 
IsActiveMember:0 
---------------------
effect_size: 0.4197745924967703
---------------------
metric: 0.6287588138918091
size: 278

Slice description:
Age_Binned:60+ 
IsActiveMember:0 
---------------------
effect_size: 0.47291572857241887
---------------------
metric: 0.6846097713203962
size: 75

Slice description:
Balance_interval:2 
Geography:Spain 
-----------

Geography ['France' 'Germany' 'Spain'] [0 1 2]
Gender ['Female' 'Male'] [0 1]
Age_Binned ['0-20' '21-30' '31-40' '41-50' '51-60' '60+'] [0 1 2 3 4 5]

In [19]:
churn_train = pd.concat([X_train, y_train], axis=1)
region = churn_train.query('Geography==1 and NumOfProducts==1')
region.groupby('Target').describe()

Unnamed: 0_level_0,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Balance_interval,Balance_interval,...,RowNumber,RowNumber,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,616.0,2.186688,1.055786,0.0,1.75,2.0,3.0,5.0,616.0,4.269481,...,7497.0,9991.0,616.0,4.975649,2.870833,0.0,3.0,5.0,7.0,10.0
1,470.0,2.908511,0.997936,0.0,2.0,3.0,4.0,5.0,470.0,4.306383,...,7491.25,9982.0,470.0,4.987234,3.032139,0.0,2.0,5.0,8.0,10.0


In [20]:
churn_train = pd.concat([X_train, y_train], axis=1)
region = churn_train.query('Age_Binned==3 and NumOfProducts==1')
region.groupby('Target').describe()

Unnamed: 0_level_0,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Balance_interval,Balance_interval,...,RowNumber,RowNumber,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,550.0,3.0,0.0,3.0,3.0,3.0,3.0,3.0,550.0,3.649091,...,7144.0,9970.0,550.0,5.085455,2.882958,0.0,3.0,5.0,7.0,10.0
1,426.0,3.0,0.0,3.0,3.0,3.0,3.0,3.0,426.0,3.20892,...,7625.5,9982.0,426.0,4.941315,2.975007,0.0,2.0,5.0,8.0,10.0


In [21]:
churn_train = pd.concat([X_train, y_train], axis=1)
region = churn_train.query('Age_Binned==4 and NumOfProducts==1')
region.groupby('Target').describe()

Unnamed: 0_level_0,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Balance_interval,Balance_interval,...,RowNumber,RowNumber,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure,Tenure
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,153.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,153.0,3.477124,...,7040.0,9910.0,153.0,4.934641,2.937223,0.0,2.0,5.0,8.0,10.0
1,262.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,262.0,3.251908,...,7156.25,9992.0,262.0,4.854962,2.942937,0.0,2.0,5.0,7.0,10.0


In [22]:
# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region
region = 'Geography==1 and NumOfProducts==1'

# Apply SMOTE in the region of interest
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)  # Select only samples from the region
y_region = y_train[X_region.index]

# Apply SMOTE to the selected region
X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

# Combine the original dataset with the balanced one to form the new training set
X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

# Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the new dataset:", len(X_SMOTE_data))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 1.4456136226654053 seconds
Size of the new dataset: 8146

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.85 (+/- 0.01)
Recall: 0.47 (+/- 0.03)
F1 Score: 0.58 (+/- 0.02)
Log-loss: 0.38 (+/- 0.03)

Results on the validation set:
Accuracy: 0.86
Log-loss: 0.39

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.77      0.44      0.56       407

    accuracy                           0.86      2000
   macro avg       0.82      0.71      0.74      2000
weighted avg       0.85      0.86      0.84      2000



In [23]:
# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region
region = 'Age_Binned==3 and NumOfProducts==1'

# Apply SMOTE in the region of interest
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)  # Select only samples from the region
y_region = y_train[X_region.index]

# Apply SMOTE to the selected region
X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

# Combine the original dataset with the balanced one to form the new training set
X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

# Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the new dataset:", len(X_SMOTE_data))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 1.4005446434020996 seconds
Size of the new dataset: 8124

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.85 (+/- 0.01)
Recall: 0.48 (+/- 0.04)
F1 Score: 0.58 (+/- 0.03)
Log-loss: 0.38 (+/- 0.03)

Results on the validation set:
Accuracy: 0.86
Log-loss: 0.36

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.77      0.45      0.57       407

    accuracy                           0.86      2000
   macro avg       0.82      0.71      0.74      2000
weighted avg       0.85      0.86      0.85      2000



In [26]:
# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region
region = 'Age_Binned==4 and NumOfProducts==1'

# Apply SMOTE in the region of interest
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)  # Select only samples from the region
y_region = y_train[X_region.index]

# Apply SMOTE to the selected region
X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

# Combine the original dataset with the balanced one to form the new training set
X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

start_time_model = time.time()
rf_optimized = model_train(X_train, y_train)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf_optimized, X_train, y_train, cv=skf, scoring=scoring)

rf_optimized.fit(X_train, y_train)

y_val_pred = rf_optimized.predict(X_val)
y_val_proba = rf_optimized.predict_proba(X_val)

val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the new dataset:", len(X_SMOTE_data))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 169.03510689735413 seconds
Size of the new dataset: 8109

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.85 (+/- 0.01)
Recall: 0.42 (+/- 0.04)
F1 Score: 0.54 (+/- 0.04)
Log-loss: 0.37 (+/- 0.02)

Results on the validation set:
Accuracy: 0.86
Log-loss: 0.37

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.77      0.43      0.55       407

    accuracy                           0.86      2000
   macro avg       0.82      0.70      0.73      2000
weighted avg       0.85      0.86      0.84      2000



Time to train the model: 1.471707820892334 seconds
Size of the new dataset: 8109

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.86 (+/- 0.01)
Recall: 0.41 (+/- 0.03)
F1 Score: 0.54 (+/- 0.03)
Log-loss: 0.37 (+/- 0.02)

Results on the validation set:
Accuracy: 0.86
Log-loss: 0.41

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.81      0.42      0.55       407

    accuracy                           0.86      2000
   macro avg       0.84      0.70      0.74      2000
weighted avg       0.86      0.86      0.84      2000

# Combined regions

region = '(Geography==1 and NumOfProducts==1) & (Age_Binned==4 and NumOfProducts==1)'

In [12]:
# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region
region = '(Geography==1 and NumOfProducts==1) & (Age_Binned==4 and NumOfProducts==1)'

# Apply SMOTE in the region of interest
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_region = X_train.query(region)  # Select only samples from the region
y_region = y_train[X_region.index]

# Apply SMOTE to the selected region
X_region_smote, y_region_smote = smote.fit_resample(X_region, y_region)

# Combine the original dataset with the balanced one to form the new training set
X_SMOTE_data = pd.concat([X_train.drop(X_region.index), pd.DataFrame(X_region_smote, columns=X_train.columns)])
y_SMOTE_data = pd.concat([y_train.drop(X_region.index), pd.Series(y_region_smote)])

# Train the model on the training set
start_time_model = time.time()
rf = RandomForestClassifier(max_depth=None, n_estimators=100)
rf.fit(X_SMOTE_data, y_SMOTE_data)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf, X_SMOTE_data, y_SMOTE_data, cv=skf, scoring=scoring)

# Evaluation on the validation set
y_val_pred = rf.predict(X_val)
y_val_proba = rf.predict_proba(X_val)

# Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the new dataset:", len(X_SMOTE_data))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 264.65030097961426 seconds
Size of the new dataset: 8146

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.86 (+/- 0.01)
Recall: 0.44 (+/- 0.04)
F1 Score: 0.55 (+/- 0.03)
Log-loss: 0.36 (+/- 0.02)

Results on the validation set:
Accuracy: 0.86
Log-loss: 0.35

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.77      0.45      0.57       407

    accuracy                           0.86      2000
   macro avg       0.82      0.71      0.74      2000
weighted avg       0.85      0.86      0.85      2000



In [13]:
with open('Slices/Churn_corrigido_2.p', 'rb') as file:
    data = pickle.load(file)
    
    name = []
    aux_1 = []
    aux_2 = []
    
    data = sorted(data, key=lambda d: d.size, reverse=True)
    for d in data:
        print ('\n=====================\nSlice description:')
        for k, v in list(d.filters.items()):
            values = ''
            if k in encoders:
                le = encoders[k]
                for v_ in v:
                    values += '%s '%(le.inverse_transform(v_)[0])
            else:
                for v_ in sorted(v, key=lambda x: x[0]):
                    if len(v_) > 1:
                        values += '%s ~ %s'%(v_[0], v_[1])
                    else:
                        values += '%s '%(v_[0])
            print ('%s:%s'%(k, values))
        #print(f'Description of the region:  {d.filters}')
        print(f'Region instances: {d.data_idx}')
        print(f'Region size:    {d.size}')
        print(f'Efect size: {d.effect_size}')
        print(f'Log Loss:    {d.metric}')
        print('\n')
        
        if d.size > 10:
            aux_1.append(d.size)
            aux_2.append(d.effect_size)


Slice description:
Geography:Germany 
NumOfProducts:1 
Region instances: Index([8392, 8513, 4001, 2476, 8715, 3121, 8260, 7633, 4886, 4473,
       ...
       6650, 9017, 3595, 4274, 6038, 8577, 4771, 5704,  401,  127],
      dtype='int64', length=1086)
Region size:    1086
Efect size: 0.4240448689762856
Log Loss:    0.6419464616159829



Slice description:
Age_Binned:41-50 
NumOfProducts:1 
Region instances: Index([7387, 1222, 1572, 8948,  578, 6696, 6422, 9884, 6775, 8715,
       ...
       9969, 2416,  685, 6650, 8585, 4274, 7536, 5704, 8007, 6423],
      dtype='int64', length=976)
Region size:    976
Efect size: 0.4919182508602711
Log Loss:    0.660466223457229



Slice description:
Age_Binned:41-50 
Geography:Germany 
Region instances: Index([7182, 1319, 7650, 5085, 8630, 8715, 7107, 2222, 7633,  409,
       ...
       4520, 6232, 1714, 6228, 9494, 3915, 7765, 6650, 4274, 5704],
      dtype='int64', length=533)
Region size:    533
Efect size: 0.44305391548245127
Log Loss:    0.649

In [14]:
import json
from collections import Counter

# Função para converter tipos NumPy para tipos Python nativos
def convert_types(obj):
    if isinstance(obj, dict):
        return {k: convert_types(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_types(i) for i in obj]
    elif isinstance(obj, tuple):
        return tuple(convert_types(i) for i in obj)
    elif hasattr(obj, 'item'):  # np.int64, np.float64 etc.
        return obj.item()
    else:
        return obj

dicionario = {}

for d in data:
    if d.size > 1:
        filters_clean = convert_types(d.filters)
        filters_str = json.dumps(filters_clean, sort_keys=True)
        dicionario[filters_str] = []
        for i in d.data_idx:
            dicionario[filters_str].append(i)

feature_cont = Counter()

for feature in dicionario.keys():
    key_dict = json.loads(feature)
    for feature in key_dict.keys():
        feature_cont[feature] += 1

print(feature_cont)

Counter({'CreditScore': 1489, 'EstimatedSalary_interval': 370, 'Tenure': 334, 'Balance_interval': 280, 'Age_Binned': 203, 'Geography': 121, 'NumOfProducts': 83, 'Gender': 42, 'IsActiveMember': 42, 'HasCrCard': 38})


In [15]:
churn_train = pd.concat([X_train, y_train], axis=1)

churn_train.groupby('EstimatedSalary_interval').describe()

Unnamed: 0_level_0,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Age_Binned,Balance_interval,Balance_interval,...,Tenure,Tenure,Target,Target,Target,Target,Target,Target,Target,Target
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
EstimatedSalary_interval,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,538.0,2.273234,1.036426,0.0,2.0,2.0,3.0,5.0,538.0,3.875465,...,7.0,10.0,538.0,0.204461,0.403683,0.0,0.0,0.0,0.0,1.0
1,515.0,2.330097,1.045542,0.0,2.0,2.0,3.0,5.0,515.0,4.359223,...,8.0,10.0,515.0,0.217476,0.41293,0.0,0.0,0.0,0.0,1.0
2,524.0,2.364504,1.091756,0.0,2.0,2.0,3.0,5.0,524.0,4.089695,...,7.25,10.0,524.0,0.187023,0.390303,0.0,0.0,0.0,0.0,1.0
3,522.0,2.302682,1.115392,0.0,2.0,2.0,3.0,5.0,522.0,4.283525,...,7.0,10.0,522.0,0.180077,0.38462,0.0,0.0,0.0,0.0,1.0
4,547.0,2.330896,1.028092,0.0,2.0,2.0,3.0,5.0,547.0,4.316271,...,7.0,10.0,547.0,0.191956,0.394199,0.0,0.0,0.0,0.0,1.0
5,548.0,2.324818,0.969538,0.0,2.0,2.0,3.0,5.0,548.0,4.385036,...,8.0,10.0,548.0,0.187956,0.391034,0.0,0.0,0.0,0.0,1.0
6,565.0,2.410619,1.009473,0.0,2.0,2.0,3.0,5.0,565.0,4.099115,...,7.0,10.0,565.0,0.187611,0.390747,0.0,0.0,0.0,0.0,1.0
7,539.0,2.309833,1.012246,0.0,2.0,2.0,3.0,5.0,539.0,4.25974,...,8.0,10.0,539.0,0.183673,0.387577,0.0,0.0,0.0,0.0,1.0
8,545.0,2.385321,1.070221,0.0,2.0,2.0,3.0,5.0,545.0,4.385321,...,7.0,10.0,545.0,0.227523,0.419618,0.0,0.0,0.0,0.0,1.0
9,545.0,2.293578,1.065008,0.0,2.0,2.0,3.0,5.0,545.0,4.251376,...,8.0,10.0,545.0,0.212844,0.409694,0.0,0.0,0.0,0.0,1.0


# Region Remove

In [3]:
churn_train = pd.concat([X_train, y_train], axis=1)

# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region for removal
region = 'Age==50'

# Identify and remove samples from the specified region
region_remove_idx = churn_train.query(region).index
churn_train.drop(region_remove_idx, errors='ignore', inplace=True)

# Ensure that the training set columns remain consistent
X_slice = churn_train.drop(columns=['Target'])
y_slice = churn_train['Target']

# Reindex the validation set to ensure consistency with the training set
X_val = X_val.reindex(columns=X_slice.columns, fill_value=0)

# Train the model on the training set
start_time_model = time.time()
rf_optimized = model_train(X_slice, y_slice)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf_optimized, X_slice, y_slice, cv=skf, scoring=scoring)

# Train the model on the complete training data
rf_optimized.fit(X_slice, y_slice)

# Evaluation on the validation set
y_val_pred = rf_optimized.predict(X_val)
y_val_proba = rf_optimized.predict_proba(X_val)

# Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the dataset after removal:", len(X_slice))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 215.258282661438 seconds
Size of the dataset after removal: 7895

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.86 (+/- 0.01)
Recall: 0.41 (+/- 0.04)
F1 Score: 0.54 (+/- 0.04)
Log-loss: 0.35 (+/- 0.02)

Results on the validation set:
Accuracy: 0.86
Log-loss: 0.35

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.81      0.42      0.56       407

    accuracy                           0.86      2000
   macro avg       0.84      0.70      0.74      2000
weighted avg       0.86      0.86      0.84      2000



In [4]:
churn_train = pd.concat([X_train, y_train], axis=1)

# Define evaluation metrics
scoring = {
    'accuracy': 'accuracy',  
    'recall': 'recall',      
    'f1': 'f1',              
    'log_loss': make_scorer(log_loss, response_method='predict_proba')
}

# Define the region for removal
region = 'Age==52'

# Identify and remove samples from the specified region
region_remove_idx = churn_train.query(region).index
churn_train.drop(region_remove_idx, errors='ignore', inplace=True)

# Ensure that the training set columns remain consistent
X_slice = churn_train.drop(columns=['Target'])
y_slice = churn_train['Target']

# Reindex the validation set to ensure consistency with the training set
X_val = X_val.reindex(columns=X_slice.columns, fill_value=0)

# Train the model on the training set
start_time_model = time.time()
rf_optimized = model_train(X_slice, y_slice)
end_time_model = time.time()
execution_time_model = end_time_model - start_time_model

# Evaluation on the training set using cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_train = cross_validate(rf_optimized, X_slice, y_slice, cv=skf, scoring=scoring)

# Train the model on the complete training data
rf_optimized.fit(X_slice, y_slice)

# Evaluation on the validation set
y_val_pred = rf_optimized.predict(X_val)
y_val_proba = rf_optimized.predict_proba(X_val)

# Results on the validation set
val_accuracy = np.mean(y_val_pred == y_val)
val_log_loss = log_loss(y_val, y_val_proba)

# Display results
print("Time to train the model:", execution_time_model, "seconds")
print("Size of the dataset after removal:", len(X_slice))
# Training results (cross-validation)
print("\nMean and standard deviation of metrics in the training set (cross-validation):")
print(f"Accuracy: {np.mean(results_train['test_accuracy']):.2f} (+/- {np.std(results_train['test_accuracy']):.2f})")
print(f"Recall: {np.mean(results_train['test_recall']):.2f} (+/- {np.std(results_train['test_recall']):.2f})")
print(f"F1 Score: {np.mean(results_train['test_f1']):.2f} (+/- {np.std(results_train['test_f1']):.2f})")
print(f"Log-loss: {np.mean(results_train['test_log_loss']):.2f} (+/- {np.std(results_train['test_log_loss']):.2f})")

# Results on the validation set
print("\nResults on the validation set:")
print(f"Accuracy: {val_accuracy:.2f}")
print(f"Log-loss: {val_log_loss:.2f}")
print("\nClassification report on the validation set:")
print(classification_report(y_val, y_val_pred))

Time to train the model: 163.11366438865662 seconds
Size of the dataset after removal: 7920

Mean and standard deviation of metrics in the training set (cross-validation):
Accuracy: 0.86 (+/- 0.01)
Recall: 0.41 (+/- 0.03)
F1 Score: 0.54 (+/- 0.03)
Log-loss: 0.35 (+/- 0.01)

Results on the validation set:
Accuracy: 0.86
Log-loss: 0.35

Classification report on the validation set:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.79      0.43      0.56       407

    accuracy                           0.86      2000
   macro avg       0.83      0.70      0.74      2000
weighted avg       0.85      0.86      0.84      2000

