In [1]:
import duckdb
import pandas as pd
import numpy as np
from src.model.utils import train_model
from src.preprocess.etl import get_dataframe
from src.model.inference import predict_month
from src.constants import PATH_DATABASE, MONTHS_BASELINE, MONTHS_INFERENCE, PARAMS


import warnings

warnings.filterwarnings("ignore")



In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
con = duckdb.connect(database=PATH_DATABASE, read_only=True)

In [4]:
all_months = MONTHS_BASELINE + MONTHS_INFERENCE
all_months = [str(month) for month in all_months]
where_clause = ", ".join(all_months)
where_clause

'202007, 202008, 202009, 202010, 202011, 202012, 202101, 202102, 202103, 202104, 202105, 202106, 202107, 202108, 202109'

In [5]:
df = con.sql(
    f"""
    SELECT
        *
    FROM competencia_03
    WHERE foto_mes IN ({where_clause})
    """
).to_df()

In [6]:
df.head()

Unnamed: 0,numero_de_cliente,foto_mes,active_quarter,cliente_vip,internet,cliente_edad,cliente_antiguedad,mrentabilidad,mrentabilidad_annual,mcomisiones,...,Visa_mconsumototal,Visa_cconsumos,Visa_cadelantosefectivo,Visa_mpagominimo,clase_ternaria_202104,clase_ternaria_202105,clase_ternaria_202106,clase_ternaria_202107,clase_ternaria_202108,clase_ternaria_202109
0,142625636,202010,1,0,0,47,47,252.69,-9620.93,848.93,...,122798.76,15.0,0.0,35588.82,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA
1,142625782,202010,1,0,0,39,47,414.63,9078.79,703.87,...,12469.69,3.0,0.0,1642.2,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA
2,142628454,202010,1,0,0,30,47,-203.57,-6326.79,215.09,...,77609.86,17.0,0.0,48386.25,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA
3,142628779,202010,1,0,0,38,47,2520.37,34297.42,460.78,...,1299.29,2.0,0.0,7131.84,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA
4,142631486,202010,1,0,0,33,47,-208.88,-11204.17,-503.2,...,9100.74,17.0,0.0,2463.3,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA


In [7]:
np.sort(df["foto_mes"].unique())

array([202007, 202008, 202009, 202010, 202011, 202012, 202101, 202102,
       202103, 202104, 202105, 202106, 202107, 202108, 202109])

In [8]:
drop_cols = [f"clase_ternaria_{i}" for i in MONTHS_INFERENCE]
drop_cols.append("clase_binaria")
drop_cols

['clase_ternaria_202104',
 'clase_ternaria_202105',
 'clase_ternaria_202106',
 'clase_ternaria_202107',
 'clase_ternaria_202108',
 'clase_ternaria_202109',
 'clase_binaria']

In [9]:
ground_truth = df[["numero_de_cliente","foto_mes", "clase_ternaria_202109"]]
ground_truth = ground_truth[ground_truth["foto_mes"]<=202107]
ground_truth

Unnamed: 0,numero_de_cliente,foto_mes,clase_ternaria_202109
0,142625636,202010,CONTINUA
1,142625782,202010,CONTINUA
2,142628454,202010,CONTINUA
3,142628779,202010,CONTINUA
4,142631486,202010,CONTINUA
...,...,...,...
2425602,80867796,202106,CONTINUA
2425603,80868026,202106,CONTINUA
2425604,80868240,202106,CONTINUA
2425605,80869734,202106,CONTINUA


In [10]:
ground_truth["ganancia"] = ground_truth["clase_ternaria_202109"].map({"BAJA+2":270000, "BAJA+1":-7000, "CONTINUA":-7000})
ground_truth.head()

Unnamed: 0,numero_de_cliente,foto_mes,clase_ternaria_202109,ganancia
0,142625636,202010,CONTINUA,-7000
1,142625782,202010,CONTINUA,-7000
2,142628454,202010,CONTINUA,-7000
3,142628779,202010,CONTINUA,-7000
4,142631486,202010,CONTINUA,-7000


In [11]:
training_months = sorted(MONTHS_BASELINE)
training_months = training_months[0:-1]

In [12]:
df["clase_binaria"] = df[f"clase_ternaria_202104"].map({"BAJA+2":1, "BAJA+1":1, "CONTINUA":0})

df_train = df[df["foto_mes"].isin(training_months)].copy()

print("Training months:", df_train["foto_mes"].unique())

df_train = df_train.reset_index(drop=True)

X_train = df_train.drop(columns=drop_cols, axis=1).copy()
y_train_binaria = df_train["clase_binaria"].copy()

for month in MONTHS_INFERENCE:
    print("Pred month:", month)
    tags_monthly = {
        'stage': 'baseline',
        'last_month': month,
    }
    df_test = df[df["foto_mes"].isin([month])].copy()
    df_test = df_test.reset_index(drop=True)
    X_test = df_test.drop(columns=drop_cols, axis=1).copy()
    y_real_ternaria = df_test["clase_ternaria_202109"].copy().map({"BAJA+2":1, "BAJA+1":0, "CONTINUA":0})
    y_real_binaria = df_test["clase_ternaria_202109"].copy().map({"BAJA+2":1, "BAJA+1":1, "CONTINUA":0})
    if month < 202108:
        model = train_model(X_train, y_train_binaria, X_test, y_real_ternaria, y_real_binaria, "baseline", PARAMS, tags_monthly)
    else:
        model = train_model(X_train, y_train_binaria, None, None, None, "baseline", PARAMS, tags_monthly)
    
    preds = X_test[["numero_de_cliente", "foto_mes"]].copy()
    preds["prediction"] = predict_month(model, X_test)
    preds = preds.merge(ground_truth, on=["numero_de_cliente", "foto_mes"])
    
    preds = preds.sort_values(["prediction"], ascending=False)
    
    gan_mes = preds["prediction"] * preds["ganancia"]
    print("Ganancia", month, gan_mes.sum())
    
    preds.to_csv(f"~/buckets/b1/exp_colab/datasets/processed/predictions/baseline/preds_{month}.csv", index=False)

Training months: [202010 202011 202101 202102 202012 202007 202009 202008]
Pred month: 202104


2023/11/15 21:50:35 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/15 21:50:35 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/15 21:50:36 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/15 21:50:36 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/15 21:50:43 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/15 21:50:43 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/15 21:50:44 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/15 21:50:45 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


Ganancia 202104 -19717176.126806542
Pred month: 202105


2023/11/15 21:52:29 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/15 21:52:29 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/15 21:52:30 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/15 21:52:31 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/15 21:52:37 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/15 21:52:38 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/15 21:52:39 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/15 21:52:39 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


Ganancia 202105 -86448886.31952368
Pred month: 202106


2023/11/15 21:54:21 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/15 21:54:22 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/15 21:54:23 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/15 21:54:23 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/15 21:54:30 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/15 21:54:30 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/15 21:54:32 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/15 21:54:32 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


Ganancia 202106 -46078446.61384146
Pred month: 202107


2023/11/15 21:56:16 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/15 21:56:16 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/15 21:56:18 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/15 21:56:18 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/15 21:56:25 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/15 21:56:25 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/15 21:56:26 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/15 21:56:27 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


Ganancia 202107 385674.7992496299
Pred month: 202108




Ganancia 202108 0.0
Pred month: 202109




Ganancia 202109 0.0


<Figure size 1050x700 with 0 Axes>