In [1]:
import duckdb
import pandas as pd
import numpy as np
from src.model.utils import train_model
from src.preprocess.etl import get_dataframe
from src.model.inference import predict_month
from src.constants import PATH_DATABASE, MONTHS_BASELINE, MONTHS_INFERENCE, PARAMS, PATH_FINAL


import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides library warnings
# (e.g. from pandas) that could point at real problems in later cells.
warnings.filterwarnings("ignore")

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Open the DuckDB database located at PATH_DATABASE in read-only mode.
con = duckdb.connect(database=PATH_DATABASE, read_only=True)

In [4]:
# Baseline + inference months, rendered as strings and joined with ", " so the
# result can be dropped straight into a SQL `IN (...)` list.
all_months = list(map(str, MONTHS_BASELINE + MONTHS_INFERENCE))
where_clause = ", ".join(all_months)
where_clause

'202007, 202008, 202009, 202010, 202011, 202012, 202101, 202102, 202103, 202104, 202105, 202106, 202107, 202108, 202109'

In [5]:
# Read every column of the parquet dataset at PATH_FINAL, keeping only the
# rows whose `foto_mes` is one of the months listed in `where_clause`, and
# materialise the result as a pandas DataFrame.
query = f"""
    SELECT
        *
    FROM read_parquet('{PATH_FINAL}')
    WHERE foto_mes IN ({where_clause})
    """
df = con.sql(query).to_df()

In [6]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,numero_de_cliente,foto_mes,active_quarter,cliente_vip,internet,cliente_edad,cliente_antiguedad,mrentabilidad,mrentabilidad_annual,mcomisiones,...,Visa_mconsumototal,Visa_cconsumos,Visa_cadelantosefectivo,Visa_mpagominimo,clase_ternaria_202104,clase_ternaria_202105,clase_ternaria_202106,clase_ternaria_202107,clase_ternaria_202108,clase_ternaria_202109
0,86787796,202109,1,0,0,62,170,398.33,13229.56,453.73,...,9944.15,5.0,0.0,950.13,,,,,,BAJA+1
1,86788155,202109,1,0,0,40,170,1889.5,10379.18,2001.33,...,5177.03,3.0,0.0,598.23,,,,,,BAJA+1
2,86790136,202109,1,0,0,61,170,965.88,7587.93,1396.01,...,11131.17,10.0,0.0,1853.34,,,,,,BAJA+1
3,86791400,202109,1,0,0,35,170,5114.99,82662.36,1044.59,...,26033.78,8.0,0.0,4316.64,,,,,,BAJA+1
4,86791878,202109,1,0,0,42,170,-6128.26,-125656.61,3352.77,...,81208.53,4.0,0.0,18709.35,,,,,,BAJA+1


In [7]:
# Distinct months actually present in the loaded data, in ascending order
# (np.unique returns the unique values already sorted).
np.unique(df["foto_mes"])

array([202007, 202008, 202009, 202010, 202011, 202012, 202101, 202102,
       202103, 202104, 202105, 202106, 202107, 202108, 202109])

In [8]:
# Columns removed from the feature matrices in the training loops: one
# `clase_ternaria_<month>` label column per inference month, plus the derived
# `clase_binaria` target.
drop_cols = [f"clase_ternaria_{month}" for month in MONTHS_INFERENCE] + ["clase_binaria"]
drop_cols

['clase_ternaria_202104',
 'clase_ternaria_202105',
 'clase_ternaria_202106',
 'clase_ternaria_202107',
 'clase_ternaria_202108',
 'clase_ternaria_202109',
 'clase_binaria']

In [9]:
# Reference labels used to score the predictions: client id, month and the
# `clase_ternaria_202109` label column.
ground_truth = df[["numero_de_cliente", "foto_mes", "clase_ternaria_202109"]]

# Row filtering needs a boolean mask. Indexing with the raw integer `foto_mes`
# Series is interpreted by pandas as a column-label lookup and raises KeyError.
# NOTE(review): restricting to MONTHS_INFERENCE (the only months merged against
# later on) is an assumption -- confirm the intended set of months.
# `.copy()` makes the result an independent frame so later cells can add
# columns to it safely.
ground_truth = ground_truth[ground_truth["foto_mes"].isin(MONTHS_INFERENCE)].copy()
ground_truth

Unnamed: 0,numero_de_cliente,foto_mes,clase_ternaria_202109
46435,73139652,202105,CONTINUA
46436,73569089,202105,CONTINUA
46437,73785768,202105,CONTINUA
46438,73804974,202105,CONTINUA
46439,73918262,202105,CONTINUA
...,...,...,...
2425602,83503713,202104,CONTINUA
2425603,83505182,202104,CONTINUA
2425604,83505370,202104,CONTINUA
2425605,83506856,202104,CONTINUA


In [10]:
# Gain attached to each row by its label: BAJA+2 -> 270000,
# BAJA+1 / CONTINUA -> -7000. Any value outside the mapping becomes NaN.
# `assign` returns a new frame instead of writing a column into a frame that
# was produced by filtering, so the cell can be re-run safely and does not
# rely on the blanket warning filter to hide a SettingWithCopyWarning.
ground_truth = ground_truth.assign(
    ganancia=ground_truth["clase_ternaria_202109"].map(
        {"BAJA+2": 270000, "BAJA+1": -7000, "CONTINUA": -7000}
    )
)
ground_truth.head()

Unnamed: 0,numero_de_cliente,foto_mes,clase_ternaria_202109,ganancia
46435,73139652,202105,CONTINUA,-7000
46436,73569089,202105,CONTINUA,-7000
46437,73785768,202105,CONTINUA,-7000
46438,73804974,202105,CONTINUA,-7000
46439,73918262,202105,CONTINUA,-7000


In [11]:
# Walk-forward run: for every inference month, train on all months up to two
# calendar months earlier, predict that month, report the gain against
# `ground_truth` and write the predictions to CSV.
for month in MONTHS_INFERENCE:
    print("Pred month:", month)
    tags_monthly = {
        'stage': 'monthly',
        'last_month': month,
    }

    # Binary training target taken from this month's label column:
    # BAJA+2 and BAJA+1 -> 1, CONTINUA -> 0. Stored on `df` so that the
    # training slice below carries it (it is listed in `drop_cols`).
    df["clase_binaria"] = df[f"clase_ternaria_{month}"].map({"BAJA+2":1, "BAJA+1":1, "CONTINUA":0})

    # Last training period = `month` minus two calendar months. Going through a
    # month count keeps this correct across a year boundary (202101 -> 202011);
    # plain `month - 2` on a YYYYMM integer would yield 202099 there and let
    # 202012 leak into the training set.
    month_index = (int(month) // 100) * 12 + (int(month) % 100 - 1) - 2
    last_train_month = (month_index // 12) * 100 + month_index % 12 + 1

    df_test = df[df["foto_mes"] == month]
    df_train = df[df["foto_mes"] <= last_train_month]

    print("Training months:", df_train["foto_mes"].unique())

    df_test = df_test.reset_index(drop=True)
    df_train = df_train.reset_index(drop=True)

    # `drop` already returns a new frame; `columns=` needs no extra `axis`.
    X_train = df_train.drop(columns=drop_cols)
    X_test = df_test.drop(columns=drop_cols)

    y_train_binaria = df_train["clase_binaria"].copy()

    # Evaluation targets come from the `clase_ternaria_202109` column:
    # "ternaria" flags only BAJA+2, "binaria" flags BAJA+2 and BAJA+1.
    y_real_ternaria = df_test["clase_ternaria_202109"].map({"BAJA+2":1, "BAJA+1":0, "CONTINUA":0})
    y_real_binaria = df_test["clase_ternaria_202109"].map({"BAJA+2":1, "BAJA+1":1, "CONTINUA":0})

    # NOTE(review): from 202108 onwards train_model is called without test
    # data -- presumably because no reliable evaluation labels exist for those
    # months; confirm and consider a named constant for the threshold.
    if month < 202108:
        model = train_model(X_train, y_train_binaria, X_test, y_real_ternaria, y_real_binaria, "monthly", PARAMS, tags_monthly)
    else:
        model = train_model(X_train, y_train_binaria, None, None, None, "monthly", PARAMS, tags_monthly)

    preds = X_test[["numero_de_cliente", "foto_mes"]].copy()
    preds["prediction"] = predict_month(model, X_test)
    # Inner join: rows without a matching (client, month) in `ground_truth`
    # are dropped, which yields a gain of 0.0 when nothing matches.
    preds = preds.merge(ground_truth, on=["numero_de_cliente", "foto_mes"])

    preds = preds.sort_values(["prediction"], ascending=False)

    # NOTE(review): the gain is weighted by the raw `prediction` value; confirm
    # whether predict_month returns a 0/1 decision or a score.
    gan_mes = preds["prediction"] * preds["ganancia"]
    print("Ganancia", month, gan_mes.sum())

    preds.to_csv(f"~/buckets/b1/exp_colab/datasets/processed/predictions/monthly/preds_{month}.csv", index=False)

Pred month: 202104
Training months: [202010 202007 202011 202012 202102 202009 202008 202101]


2023/11/16 03:01:41 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:01:41 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:01:43 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:01:43 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/16 03:01:50 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:01:50 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:01:51 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:01:51 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


Ganancia 202104 -26682829.987446956
Pred month: 202105
Training months: [202010 202103 202007 202011 202012 202102 202009 202008 202101]


2023/11/16 03:04:17 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:04:17 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:04:19 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:04:19 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/16 03:04:26 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:04:26 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:04:27 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:04:28 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


Ganancia 202105 -68116833.9529853
Pred month: 202106
Training months: [202104 202010 202103 202007 202011 202012 202102 202009 202008 202101]


2023/11/16 03:07:15 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:07:16 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:07:17 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:07:17 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/16 03:07:24 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:07:24 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:07:26 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:07:26 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


Ganancia 202106 -33036604.44739762
Pred month: 202107
Training months: [202105 202104 202010 202103 202007 202011 202012 202102 202009 202008
 202101]


2023/11/16 03:10:25 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:10:25 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:10:27 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:10:27 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/16 03:10:34 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:10:34 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:10:36 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:10:36 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


Ganancia 202107 8358806.407094183
Pred month: 202108
Training months: [202105 202104 202010 202103 202007 202011 202012 202106 202102 202009
 202008 202101]




Ganancia 202108 0.0
Pred month: 202109
Training months: [202105 202104 202010 202103 202007 202011 202012 202106 202102 202107
 202009 202008 202101]




Ganancia 202109 0.0


<Figure size 1050x700 with 0 Axes>

## Sin `foto_mes`

In [12]:
# Also exclude `foto_mes` from the feature matrices for the runs below.
# The membership check keeps the cell idempotent: re-running it does not
# append a duplicate entry.
if "foto_mes" not in drop_cols:
    drop_cols.append("foto_mes")

In [13]:
# Same walk-forward run as the monthly loop, using whatever `drop_cols`
# currently holds (with `foto_mes` added, the models do not see the month as a
# feature). Runs are tagged 'monthly_sin_foto_mes' and predictions are written
# to `*_sin_foto_mes.csv`.
for month in MONTHS_INFERENCE:
    print("Pred month:", month)
    tags_monthly = {
        'stage': 'monthly_sin_foto_mes',
        'last_month': month,
    }

    # Binary training target taken from this month's label column:
    # BAJA+2 and BAJA+1 -> 1, CONTINUA -> 0. Stored on `df` so that the
    # training slice below carries it (it is listed in `drop_cols`).
    df["clase_binaria"] = df[f"clase_ternaria_{month}"].map({"BAJA+2":1, "BAJA+1":1, "CONTINUA":0})

    # Last training period = `month` minus two calendar months. Going through a
    # month count keeps this correct across a year boundary (202101 -> 202011);
    # plain `month - 2` on a YYYYMM integer would yield 202099 there and let
    # 202012 leak into the training set.
    month_index = (int(month) // 100) * 12 + (int(month) % 100 - 1) - 2
    last_train_month = (month_index // 12) * 100 + month_index % 12 + 1

    df_test = df[df["foto_mes"] == month]
    df_train = df[df["foto_mes"] <= last_train_month]

    print("Training months:", df_train["foto_mes"].unique())

    df_test = df_test.reset_index(drop=True)
    df_train = df_train.reset_index(drop=True)

    # `drop` already returns a new frame; `columns=` needs no extra `axis`.
    X_train = df_train.drop(columns=drop_cols)
    X_test = df_test.drop(columns=drop_cols)

    y_train_binaria = df_train["clase_binaria"].copy()

    # Evaluation targets come from the `clase_ternaria_202109` column:
    # "ternaria" flags only BAJA+2, "binaria" flags BAJA+2 and BAJA+1.
    y_real_ternaria = df_test["clase_ternaria_202109"].map({"BAJA+2":1, "BAJA+1":0, "CONTINUA":0})
    y_real_binaria = df_test["clase_ternaria_202109"].map({"BAJA+2":1, "BAJA+1":1, "CONTINUA":0})

    # NOTE(review): from 202108 onwards train_model is called without test
    # data -- presumably because no reliable evaluation labels exist for those
    # months; confirm. The sixth argument is still "monthly" although the tags
    # say 'monthly_sin_foto_mes' -- verify that this is intended.
    if month < 202108:
        model = train_model(X_train, y_train_binaria, X_test, y_real_ternaria, y_real_binaria, "monthly", PARAMS, tags_monthly)
    else:
        model = train_model(X_train, y_train_binaria, None, None, None, "monthly", PARAMS, tags_monthly)

    # Take the join keys from `df_test`, which still holds `foto_mes` (every
    # row equals `month`), instead of copying the whole feature matrix and
    # re-creating the column by hand.
    preds = df_test[["numero_de_cliente", "foto_mes"]].copy()
    preds["prediction"] = predict_month(model, X_test)
    # Inner join: rows without a matching (client, month) in `ground_truth`
    # are dropped, which yields a gain of 0.0 when nothing matches.
    preds = preds.merge(ground_truth, on=["numero_de_cliente", "foto_mes"])

    preds = preds.sort_values(["prediction"], ascending=False)

    # NOTE(review): the gain is weighted by the raw `prediction` value; confirm
    # whether predict_month returns a 0/1 decision or a score.
    gan_mes = preds["prediction"] * preds["ganancia"]
    print("Ganancia", month, gan_mes.sum())

    preds.to_csv(f"~/buckets/b1/exp_colab/datasets/processed/predictions/monthly/preds_{month}_sin_foto_mes.csv", index=False)

Pred month: 202104
Training months: [202010 202007 202011 202012 202102 202009 202008 202101]


2023/11/16 03:19:15 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:19:15 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:19:16 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:19:16 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/16 03:19:23 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:19:23 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:19:25 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:19:25 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


Ganancia 202104 3839658.91075169
Pred month: 202105
Training months: [202010 202103 202007 202011 202012 202102 202009 202008 202101]


2023/11/16 03:21:47 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:21:48 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:21:49 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:21:49 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/16 03:21:56 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:21:56 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:21:58 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:21:58 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


Ganancia 202105 -53408418.306852356
Pred month: 202106
Training months: [202104 202010 202103 202007 202011 202012 202102 202009 202008 202101]


2023/11/16 03:24:49 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:24:49 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:24:50 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:24:50 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/16 03:24:58 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:24:58 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:25:00 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:25:00 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


Ganancia 202106 -23667872.512992013
Pred month: 202107
Training months: [202105 202104 202010 202103 202007 202011 202012 202102 202009 202008
 202101]


2023/11/16 03:28:09 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:28:09 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:28:11 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:28:11 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/16 03:28:19 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 03:28:19 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 03:28:21 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 03:28:21 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


Ganancia 202107 21080420.805758704
Pred month: 202108
Training months: [202105 202104 202010 202103 202007 202011 202012 202106 202102 202009
 202008 202101]




Ganancia 202108 0.0
Pred month: 202109
Training months: [202105 202104 202010 202103 202007 202011 202012 202106 202102 202107
 202009 202008 202101]




Ganancia 202109 0.0


<Figure size 1050x700 with 0 Axes>