In [1]:
import duckdb
import pandas as pd
import numpy as np
from src.model.utils import train_model
from src.preprocess.etl import get_dataframe
from src.model.inference import predict_month
from src.monitoring.adversarial import train_adversarial
from src.constants import PATH_DATABASE, MONTHS_BASELINE, MONTHS_INFERENCE, PARAMS


import warnings

warnings.filterwarnings("ignore")

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
con = duckdb.connect(database=PATH_DATABASE, read_only=True)

In [4]:
all_months = MONTHS_BASELINE + MONTHS_INFERENCE
all_months = [str(month) for month in all_months]
where_clause = ", ".join(all_months)
where_clause

'202007, 202008, 202009, 202010, 202011, 202012, 202101, 202102, 202103, 202104, 202105, 202106, 202107, 202108, 202109'

In [5]:
df = con.sql(
    f"""
    SELECT
        *
    FROM competencia_03
    WHERE foto_mes IN ({where_clause})
    """
).to_df()

In [6]:
df.head()

Unnamed: 0,numero_de_cliente,foto_mes,active_quarter,cliente_vip,internet,cliente_edad,cliente_antiguedad,mrentabilidad,mrentabilidad_annual,mcomisiones,...,Visa_mconsumototal,Visa_cconsumos,Visa_cadelantosefectivo,Visa_mpagominimo,clase_ternaria_202104,clase_ternaria_202105,clase_ternaria_202106,clase_ternaria_202107,clase_ternaria_202108,clase_ternaria_202109
0,70761293,202009,1,0,1,44,26,507.2,25042.65,257.97,...,,,,0.0,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA
1,70764581,202009,1,0,1,44,170,643.54,-2348.36,610.74,...,5297.83,8.0,0.0,0.0,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA
2,70764666,202009,1,0,1,44,188,-373.53,18174.3,606.58,...,10670.59,9.0,0.0,1771.23,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA
3,70765905,202009,1,0,1,43,102,-2445.17,1088.7,306.72,...,15238.34,11.0,0.0,1008.78,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA
4,70767758,202009,0,0,1,43,119,229.91,6141.71,62.63,...,,,,0.0,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA,CONTINUA


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2425607 entries, 0 to 2425606
Columns: 160 entries, numero_de_cliente to clase_ternaria_202109
dtypes: float64(91), int64(63), object(6)
memory usage: 2.9+ GB


In [8]:
np.sort(df["foto_mes"].unique())

array([202007, 202008, 202009, 202010, 202011, 202012, 202101, 202102,
       202103, 202104, 202105, 202106, 202107, 202108, 202109])

In [9]:
drop_cols = [f"clase_ternaria_{i}" for i in MONTHS_INFERENCE]
drop_cols.append("clase_binaria")
drop_cols

['clase_ternaria_202104',
 'clase_ternaria_202105',
 'clase_ternaria_202106',
 'clase_ternaria_202107',
 'clase_ternaria_202108',
 'clase_ternaria_202109',
 'clase_binaria']

In [10]:
ground_truth = df[["numero_de_cliente","foto_mes", "clase_ternaria_202109"]]
ground_truth = ground_truth[ground_truth["foto_mes"]<=202107]
ground_truth

Unnamed: 0,numero_de_cliente,foto_mes,clase_ternaria_202109
0,70761293,202009,CONTINUA
1,70764581,202009,CONTINUA
2,70764666,202009,CONTINUA
3,70765905,202009,CONTINUA
4,70767758,202009,CONTINUA
...,...,...,...
2425602,68284360,202010,CONTINUA
2425603,69736551,202010,CONTINUA
2425604,69184218,202010,CONTINUA
2425605,69341118,202010,CONTINUA


In [11]:
ground_truth["ganancia"] = ground_truth["clase_ternaria_202109"].map({"BAJA+2":270000, "BAJA+1":-7000, "CONTINUA":-7000})
ground_truth.head()

Unnamed: 0,numero_de_cliente,foto_mes,clase_ternaria_202109,ganancia
0,70761293,202009,CONTINUA,-7000
1,70764581,202009,CONTINUA,-7000
2,70764666,202009,CONTINUA,-7000
3,70765905,202009,CONTINUA,-7000
4,70767758,202009,CONTINUA,-7000


In [12]:
training_months = sorted(MONTHS_BASELINE)
training_months = training_months[0:-1]

In [13]:
# df["clase_binaria"] = df[f"clase_ternaria_202104"].map({"BAJA+2":1, "BAJA+1":1, "CONTINUA":0})

# df_train = df[df["foto_mes"].isin(training_months)].copy()

# print("Training months:", df_train["foto_mes"].unique())

# df_train = df_train.reset_index(drop=True)

# X_train = df_train.drop(columns=drop_cols, axis=1).copy()
# y_train_binaria = df_train["clase_binaria"].copy()

# for month in MONTHS_INFERENCE:
#     print("Pred month:", month)
#     tags_monthly = {
#         'stage': 'baseline',
#         'last_month': month,
#     }
#     df_test = df[df["foto_mes"].isin([month])].copy()
#     df_test = df_test.reset_index(drop=True)
#     X_test = df_test.drop(columns=drop_cols, axis=1).copy()
#     y_real_ternaria = df_test["clase_ternaria_202109"].copy().map({"BAJA+2":1, "BAJA+1":0, "CONTINUA":0})
#     y_real_binaria = df_test["clase_ternaria_202109"].copy().map({"BAJA+2":1, "BAJA+1":1, "CONTINUA":0})

#     tags_adv = {
#         'stage': 'adversarial_only_last',
#         'last_month': month,
#     }
#     model = train_adversarial(X_train, X_test, "adversarial", tags_adv)
    
#     if month < 202108:
#         model = train_model(X_train, y_train_binaria, X_test, y_real_ternaria, y_real_binaria, "baseline", PARAMS, tags_monthly)
#     else:
#         model = train_model(X_train, y_train_binaria, None, None, None, "baseline", PARAMS, tags_monthly)
    
#     preds = X_test[["numero_de_cliente", "foto_mes"]].copy()
#     preds["prediction"] = predict_month(model, X_test)
#     preds = preds.merge(ground_truth, on=["numero_de_cliente", "foto_mes"])
    
#     preds = preds.sort_values(["prediction"], ascending=False)
    
#     gan_mes = preds["prediction"] * preds["ganancia"]
#     print("Ganancia", month, gan_mes.sum())
    
#     preds.to_csv(f"~/buckets/b1/exp_colab/datasets/processed/predictions/baseline/preds_{month}.csv", index=False)

## Sin `foto_mes`

In [14]:
if True:
    drop_cols.append("foto_mes")

In [16]:
df["clase_binaria"] = df[f"clase_ternaria_202104"].map({"BAJA+2":1, "BAJA+1":1, "CONTINUA":0})

df_train = df[df["foto_mes"].isin(training_months)].copy()

print("Training months:", df_train["foto_mes"].unique())

df_train = df_train.reset_index(drop=True)

X_train = df_train.drop(columns=drop_cols, axis=1).copy()
y_train_binaria = df_train["clase_binaria"].copy()

for month in MONTHS_INFERENCE:
    print("Pred month:", month)
    tags_monthly = {
        'stage': 'baseline_sin_foto_mes',
        'last_month': month,
    }
    df_test = df[df["foto_mes"].isin([month])].copy()
    df_test = df_test.reset_index(drop=True)
    X_test = df_test.drop(columns=drop_cols, axis=1).copy()
    y_real_ternaria = df_test["clase_ternaria_202109"].copy().map({"BAJA+2":1, "BAJA+1":0, "CONTINUA":0})
    y_real_binaria = df_test["clase_ternaria_202109"].copy().map({"BAJA+2":1, "BAJA+1":1, "CONTINUA":0})

    tags_adv = {
        'stage': 'adversarial_only_last_sin_foto_mes',
        'last_month': month,
    }
    model = train_adversarial(X_train, X_test, "adversarial", tags_adv)
    
    if month < 202108:
        model = train_model(X_train, y_train_binaria, X_test, y_real_ternaria, y_real_binaria, "baseline", PARAMS, tags_monthly)
    else:
        model = train_model(X_train, y_train_binaria, None, None, None, "baseline", PARAMS, tags_monthly)

    preds = X_test.copy()
    preds["foto_mes"] = month
    preds = preds[["numero_de_cliente", "foto_mes"]]
    preds["prediction"] = predict_month(model, X_test)
    preds = preds.merge(ground_truth, on=["numero_de_cliente", "foto_mes"])
    
    preds = preds.sort_values(["prediction"], ascending=False)
    
    gan_mes = preds["prediction"] * preds["ganancia"]
    print("Ganancia", month, gan_mes.sum())
    
    preds.to_csv(f"~/buckets/b1/exp_colab/datasets/processed/predictions/baseline/preds_{month}_sin_foto_mes.csv", index=False)

Training months: [202009 202010 202007 202008 202102 202011 202012 202101]
Pred month: 202104


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2023/11/16 02:38:22 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 02:38:22 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 02:38:24 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 02:38:24 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/16 02:38:30 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 02:38:30 INFO mlflow.models.evaluation.d

Ganancia 202104 6244336.8123177625
Pred month: 202105


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2023/11/16 02:41:48 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 02:41:48 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 02:41:49 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 02:41:50 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/16 02:41:56 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 02:41:56 INFO mlflow.models.evaluation.d

Ganancia 202105 -54223336.64267073
Pred month: 202106


2023/11/16 02:45:04 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 02:45:04 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 02:45:05 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 02:45:05 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/16 02:45:12 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 02:45:12 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 02:45:13 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 02:45:13 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


Ganancia 202106 -25977515.195275493
Pred month: 202107


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2023/11/16 02:48:22 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 02:48:22 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/11/16 02:48:23 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/11/16 02:48:23 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2023/11/16 02:48:30 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/11/16 02:48:30 INFO mlflow.models.evaluation.d

Ganancia 202107 14550665.211737614
Pred month: 202108




Ganancia 202108 0.0
Pred month: 202109


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Ganancia 202109 0.0


<Figure size 1050x700 with 0 Axes>