In [1]:
import duckdb
import pandas as pd
import numpy as np
from src.preprocess.etl import get_dataframe
from src.monitoring.adversarial import train_adversarial
from src.constants import PATH_DATABASE, MONTHS_BASELINE, MONTHS_INFERENCE, PARAMS


import warnings

# NOTE(review): this silences every warning category for the whole session,
# including pandas SettingWithCopyWarning and library deprecation notices.
# Consider narrowing it to the specific categories/modules that are noisy.
warnings.filterwarnings("ignore")

In [2]:
# Enable IPython's autoreload extension so edits to imported project modules
# (e.g. src.*) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Open the project DuckDB file in read-only mode; this notebook only queries it.
con = duckdb.connect(PATH_DATABASE, read_only=True)

In [4]:
# Baseline months followed by inference months, rendered as strings and joined
# into the comma-separated list used inside the SQL `IN (...)` filter below.
all_months = list(map(str, MONTHS_BASELINE + MONTHS_INFERENCE))
where_clause = ", ".join(all_months)
where_clause

'202007, 202008, 202009, 202010, 202011, 202012, 202101, 202102, 202103, 202104, 202105, 202106, 202107, 202108, 202109'

In [5]:
# Pull every column of competencia_03 for the selected months into pandas.
# where_clause is built from project constants in the previous cell.
query = f"""
    SELECT
        *
    FROM competencia_03
    WHERE foto_mes IN ({where_clause})
    """
df = con.sql(query).to_df()

In [6]:
# Summarise the loaded frame: row count, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2425607 entries, 0 to 2425606
Columns: 160 entries, numero_de_cliente to clase_ternaria_202109
dtypes: float64(91), int64(63), object(6)
memory usage: 2.9+ GB


In [7]:
# Preview the first rows to eyeball the columns that were loaded.
df.head()

Unnamed: 0,numero_de_cliente,foto_mes,active_quarter,cliente_vip,internet,cliente_edad,cliente_antiguedad,mrentabilidad,mrentabilidad_annual,mcomisiones,...,Visa_mconsumototal,Visa_cconsumos,Visa_cadelantosefectivo,Visa_mpagominimo,clase_ternaria_202104,clase_ternaria_202105,clase_ternaria_202106,clase_ternaria_202107,clase_ternaria_202108,clase_ternaria_202109
0,170787102,202104,1,0,0,37,20,-2288.11,-10051.97,185.33,...,18379.95,3.0,0.0,3155.37,BAJA+1,BAJA+2,CONTINUA,CONTINUA,CONTINUA,CONTINUA
1,170791577,202104,1,0,0,72,20,996.44,17159.47,1.58,...,,,,0.0,BAJA+1,BAJA+2,CONTINUA,CONTINUA,CONTINUA,CONTINUA
2,170792123,202104,1,0,0,59,17,-3416.25,-3341.44,1726.09,...,16051.84,2.0,0.0,914.94,BAJA+1,BAJA+2,CONTINUA,CONTINUA,CONTINUA,CONTINUA
3,170794566,202104,1,0,0,49,19,650.32,6127.41,271.77,...,0.0,0.0,0.0,1372.41,BAJA+1,BAJA+2,CONTINUA,CONTINUA,CONTINUA,CONTINUA
4,170794754,202104,1,0,3,32,19,-367.77,-2522.09,455.45,...,38210.47,6.0,3.0,21735.69,BAJA+1,BAJA+2,CONTINUA,CONTINUA,CONTINUA,CONTINUA


In [8]:
# Show the distinct foto_mes values present in df, in ascending order, to
# compare against the months requested in the query.
np.sort(df["foto_mes"].unique())

array([202007, 202008, 202009, 202010, 202011, 202012, 202101, 202102,
       202103, 202104, 202105, 202106, 202107, 202108, 202109])

In [9]:
# One clase_ternaria_<month> column per inference month; these are removed
# from the feature matrices before adversarial training.
drop_cols = ["clase_ternaria_" + str(month) for month in MONTHS_INFERENCE]
drop_cols

['clase_ternaria_202104',
 'clase_ternaria_202105',
 'clase_ternaria_202106',
 'clase_ternaria_202107',
 'clase_ternaria_202108',
 'clase_ternaria_202109']

In [10]:
# Client id, month and the clase_ternaria_202109 label for every row up to
# and including foto_mes 202107.
# A single .loc selection followed by an explicit .copy() makes ground_truth
# an independent frame, so adding columns to it later neither touches df nor
# triggers pandas' SettingWithCopyWarning.
ground_truth = df.loc[
    df["foto_mes"] <= 202107,
    ["numero_de_cliente", "foto_mes", "clase_ternaria_202109"],
].copy()
ground_truth

Unnamed: 0,numero_de_cliente,foto_mes,clase_ternaria_202109
0,170787102,202104,CONTINUA
1,170791577,202104,CONTINUA
2,170792123,202104,CONTINUA
3,170794566,202104,CONTINUA
4,170794754,202104,CONTINUA
...,...,...,...
2425602,68284360,202010,CONTINUA
2425603,69736551,202010,CONTINUA
2425604,69184218,202010,CONTINUA
2425605,69341118,202010,CONTINUA


In [11]:
# Payoff assigned to each class label: 270000 for BAJA+2, -7000 otherwise.
# Labels that are not keys of this mapping become NaN (Series.map behaviour).
ganancia_por_clase = {"BAJA+2": 270000, "BAJA+1": -7000, "CONTINUA": -7000}

# .assign returns a new frame instead of writing into a frame that was derived
# by slicing df, which avoids SettingWithCopyWarning and keeps the cell
# idempotent on re-run.
ground_truth = ground_truth.assign(
    ganancia=ground_truth["clase_ternaria_202109"].map(ganancia_por_clase)
)
ground_truth.head()

Unnamed: 0,numero_de_cliente,foto_mes,clase_ternaria_202109,ganancia
0,170787102,202104,CONTINUA,-7000
1,170791577,202104,CONTINUA,-7000
2,170792123,202104,CONTINUA,-7000
3,170794566,202104,CONTINUA,-7000
4,170794754,202104,CONTINUA,-7000


In [12]:
# Test window starts empty and is grown by the loops below.
test_months = []
# Train on every baseline month except the most recent one.
training_months = sorted(MONTHS_BASELINE)[:-1]

In [13]:
# Adversarial validation with an expanding test window: each iteration adds one
# more inference month to the test set and trains a model on train vs. test.

# Start from an empty window so that re-running this cell always produces the
# same sequence of test sets instead of accumulating months across runs.
test_months = []

# The training window is the same for every iteration, so filter it once
# rather than re-copying it from the full frame on each pass.
df_train = df[df["foto_mes"].isin(training_months)].reset_index(drop=True)

for month in MONTHS_INFERENCE:
    print("Last month:", month)
    tags_adv = {
        'stage': 'adversarial',
        'last_month': month,
    }
    test_months.append(month)

    df_test = df[df["foto_mes"].isin(test_months)].reset_index(drop=True)

    print("Training months:", df_train["foto_mes"].unique())
    print("Testing months:", df_test["foto_mes"].unique())

    # drop() returns new frames, so each call hands the trainer fresh objects
    # and df_train / df_test themselves are left untouched.
    X_train = df_train.drop(columns=drop_cols)
    X_test = df_test.drop(columns=drop_cols)

    model = train_adversarial(X_train, X_test, "adversarial", tags_adv)
    

`foto_mes` es la variable que siempre queda en primer lugar. ¿Y si la eliminamos?

In [14]:
# Also exclude foto_mes from the features for the next experiment.
# The membership check keeps the cell idempotent: re-running it does not
# append the column name a second time.
if "foto_mes" not in drop_cols:
    drop_cols.append("foto_mes")

In [15]:
# Same expanding-window adversarial validation as before, tagged
# 'adversarial_sin_foto_mes'; drop_cols decides which columns are excluded.

# Reset the window: an earlier loop over MONTHS_INFERENCE leaves test_months
# holding every inference month, which would make every iteration here test
# against the full inference period instead of an expanding window.
test_months = []

# The training window is the same for every iteration, so filter it once
# rather than re-copying it from the full frame on each pass.
df_train = df[df["foto_mes"].isin(training_months)].reset_index(drop=True)

for month in MONTHS_INFERENCE:
    print("Last month:", month)
    tags_adv = {
        'stage': 'adversarial_sin_foto_mes',
        'last_month': month,
    }
    test_months.append(month)

    df_test = df[df["foto_mes"].isin(test_months)].reset_index(drop=True)

    # Printed before the drop below because foto_mes may be one of the
    # dropped columns.
    print("Training months:", df_train["foto_mes"].unique())
    print("Testing months:", df_test["foto_mes"].unique())

    # drop() returns new frames, so each call hands the trainer fresh objects
    # and df_train / df_test themselves are left untouched.
    X_train = df_train.drop(columns=drop_cols)
    X_test = df_test.drop(columns=drop_cols)

    model = train_adversarial(X_train, X_test, "adversarial", tags_adv)
    

Last month: 202104
Training months: [202007 202008 202012 202010 202102 202011 202009 202101]
Testing months: [202104]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Last month: 202105
Training months: [202007 202008 202012 202010 202102 202011 202009 202101]
Testing months: [202104 202105]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Last month: 202106
Training months: [202007 202008 202012 202010 202102 202011 202009 202101]
Testing months: [202104 202105 202106]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Last month: 202107
Training months: [202007 202008 202012 202010 202102 202011 202009 202101]
Testing months: [202104 202105 202106 202107]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Last month: 202108
Training months: [202007 202008 202012 202010 202102 202011 202009 202101]
Testing months: [202104 202105 202106 202107 202108]




Last month: 202109
Training months: [202007 202008 202012 202010 202102 202011 202009 202101]
Testing months: [202104 202105 202106 202107 202108 202109]


