In [124]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

PARAM = {}
PARAM['semilla'] = 700027


# Set the working directory (you should replace the path with your local directory)
import os
os.chdir("/Users/ailicic/Documents/itba_dm")

# Load the dataset
dataset = pd.read_csv("./datasets/dataset_pequeno.csv")

# Filter dataset to work only with rows where clase_ternaria is not empty
dataset = dataset[dataset['clase_ternaria'].notna()]


  dataset = pd.read_csv("./datasets/dataset_pequeno.csv")


In [125]:
# active_quarter
# cliente_vip
# internet
# ccaja_seguridad
# tcallcenter
# thomebanking
# tmobile_app
# Master_delinquency
# Master_status
# Visa_delinquency
# Visa_status


#Transform categorical variables with one hot encoding
dataset = pd.get_dummies(dataset, columns=['active_quarter', 'cliente_vip', 'internet', 'ccaja_seguridad', 'tcallcenter', 'thomebanking', 'tmobile_app', 'Master_delinquency', 'Master_status', 'Visa_delinquency', 'Visa_status'])



In [135]:

# Stratified partition of the dataset
x_train, x_test, y_train, y_test = train_test_split(dataset.drop(columns=['clase_ternaria']),
                                                    dataset['clase_ternaria'],
                                                    test_size=0.3,
                                                    random_state=PARAM['semilla'],
                                                    stratify=dataset['clase_ternaria'])

#add fold column to dataset, it's 1 for training and 2 for testing
dataset['fold'] = np.where(dataset.index.isin(x_train.index), 1, 2)
# Define your basic parameters


modelo = DecisionTreeClassifier(criterion="gini", 
                                splitter="best", 
                                max_depth=40, 
                                min_samples_split=800, 
                                min_samples_leaf=20,
                                random_state = PARAM['semilla'],
                                ccp_alpha=0.0001,
                                class_weight="balanced")
modelo.fit(x_train, y_train)

# Predict probabilities for each class
prediccion = modelo.predict_proba(x_test)
#print the order of the classes
# Convert the predicted probabilities to a DataFrame
prediccion_df = pd.DataFrame(prediccion, columns=modelo.classes_)

# Add the "ganancia" column based on the "clase_ternaria" values
dataset['ganancia'] = np.where(dataset['clase_ternaria'] == 'BAJA+2', 117000, -3000)

#para testing agrego la probabilidad
dataset.loc[dataset['fold'] == 2, 'prob_baja2'] = prediccion_df['BAJA+2'].values

# calculo la ganancia en testing que es fold==2
ganancia_test = dataset.loc[(dataset['fold'] == 2) & (dataset['prob_baja2'] > 0.025), 'ganancia'].sum()

# escalo la ganancia como si fuera todo el dataset
ganancia_test_normalizada = ganancia_test / 0.3

estimulos = dataset.loc[(dataset['fold'] == 2) & (dataset['prob_baja2'] > 0.025), 'prob_baja2'].count()
aciertos = dataset.loc[(dataset['fold'] == 2) & (dataset['prob_baja2'] > 0.025) & (dataset['clase_ternaria'] == "BAJA+2"), 'clase_ternaria'].count()

print("Testing total: ", dataset.loc[dataset['fold'] == 2, 'fold'].count())
print("Testing BAJA+2: ", dataset.loc[(dataset['fold'] == 2) & (dataset['clase_ternaria'] == "BAJA+2"), 'clase_ternaria'].count())
print("Estimulos: ", estimulos)
print("Aciertos (BAJA+2): ", aciertos)
print("Ganancia en testing: ", ganancia_test)
print("Ganancia en testing (normalizada): ", ganancia_test_normalizada)

Testing total:  49405
Testing BAJA+2:  381
Estimulos:  19267
Aciertos (BAJA+2):  152
Ganancia en testing:  -39561000
Ganancia en testing (normalizada):  -131870000.0


In [110]:
dataset

Unnamed: 0,numero_de_cliente,foto_mes,active_quarter,cliente_vip,internet,cliente_edad,cliente_antiguedad,mrentabilidad,mrentabilidad_annual,mcomisiones,...,Visa_mpagosdolares,Visa_fechaalta,Visa_mconsumototal,Visa_cconsumos,Visa_cadelantosefectivo,Visa_mpagominimo,clase_ternaria,fold,ganancia,prob_baja2
0,31116053,202107,1,0,0,50,201,13073.36,161035.99,3163.52,...,0.00,6060.0,121585.39,75.0,0.0,77617.41,CONTINUA,1,-3000,
1,31116803,202107,1,0,0,59,326,2640.48,56751.51,5232.22,...,17.59,4211.0,21596.40,7.0,0.0,4938.33,CONTINUA,1,-3000,
2,31117730,202107,1,0,0,68,272,1343.51,20201.31,582.94,...,0.00,7951.0,1529.02,2.0,0.0,1407.60,CONTINUA,1,-3000,
3,31117908,202107,1,0,0,80,326,5523.19,55827.28,2712.26,...,0.00,7702.0,12815.99,8.0,0.0,856.29,CONTINUA,1,-3000,
4,31117977,202107,1,0,0,60,261,4819.35,37386.40,1878.40,...,12.89,7949.0,157186.12,27.0,0.0,9548.22,CONTINUA,2,-3000,0.002821
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164677,214033382,202107,1,0,0,33,1,307.11,307.11,353.00,...,0.00,19.0,0.00,0.0,0.0,668.61,CONTINUA,2,-3000,0.021752
164678,214034971,202107,0,0,0,32,1,161.22,161.22,0.00,...,,11.0,,,,0.00,CONTINUA,1,-3000,
164679,214037429,202107,0,0,0,42,1,0.00,0.00,0.00,...,,16.0,,,,0.00,CONTINUA,1,-3000,
164680,214042186,202107,0,0,0,24,1,0.01,0.01,0.00,...,,3.0,,,,0.00,CONTINUA,1,-3000,


In [91]:
#add gridsearch to the code in the upper cell
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Define the parameters to evaluate, use known well-performing values
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [8, 10, 12, 14, 16],
    'min_samples_leaf': [10, 20, 30, 40, 50],
    'min_samples_split': [400, 500, 600, 700, 800],
    'min_impurity_decrease': [0.00001, 0.0001, 0.001, 0.01, 0.1]
    }

# Define the metric to optimize (in this case, ganancia_test)
scoring = {'ganancia_test': make_scorer(lambda y_true, y_pred: (y_true == 'BAJA+2').sum() * 117000 - 3000 * ((y_pred == 'BAJA+2') & (y_true != y_pred)).sum())}

# Create the grid search model
grid_search = GridSearchCV(modelo, param_grid, scoring=scoring, refit='ganancia_test',
                            cv=5, return_train_score=True, n_jobs=-1)

# Fit grid search to the data
grid_search.fit(x_train, y_train)

# View the results
print("Mejores parámetros: " + str(grid_search.best_params_))
print("Mejor ganancia_test: " + str(grid_search.best_score_))
print("Mejor modelo: " + str(grid_search.best_estimator_))



KeyboardInterrupt: 

In [None]:
#Above function in Python
def ArbolEstimarGanancia(semilla, param_basicos):

    prediccion = modelo.predict_proba(x_test)
    prediccion_df = pd.DataFrame(prediccion, columns=modelo.classes_)

    # calculo la ganancia en testing  qu es fold==2
    ganancia_test = dataset.loc[(dataset['fold'] == 2) & (dataset['prob_baja2'] > 0.025), 'ganancia'].sum()

    # escalo la ganancia como si fuera todo el dataset
    ganancia_test_normalizada = ganancia_test / 0.3

    return(ganancia_test_normalizada)
