## Libraries

In [3]:
import os
import re

import duckdb
import mlflow
import pandas as pd
from sklearn.model_selection import train_test_split

from src.constants import (
    PATH_CLASE_BINARIA,
    QUERY_DF_TEST,
    QUERY_DF_TRAIN,
    RANDOM_STATE,
)
from src.model.inference import predictions_per_seed
from src.preprocess.etl import extract, get_dataframe
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
%reload_ext autoreload

In [5]:
# Configure MLflow through environment variables: a local SQLite file as the
# tracking store and a GCS location for artifacts.
# NOTE(review): MLFLOW_TRACKING_URI is a standard MLflow variable;
# MLFLOW_ARTIFACT_ROOT is presumably read by project code — confirm.
os.environ.update(
    {
        'MLFLOW_ARTIFACT_ROOT': 'gs://mlflow-artifacts-uribe/mlruns',
        'MLFLOW_TRACKING_URI': 'sqlite:///database/mlruns.db',
    }
)

## Data

In [6]:
# Open an in-memory DuckDB database; read-write is the default mode.
con = duckdb.connect(":memory:")

In [7]:
# Run the project's ETL extract step against the open DuckDB connection.
# NOTE(review): presumably loads the dataset at PATH_CLASE_BINARIA into a table
# that the QUERY_DF_* statements read from — confirm in src.preprocess.etl.
extract(con, PATH_CLASE_BINARIA)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [8]:
# Fetch the training and test frames from DuckDB, one query each, in that order.
df_full, df_test = (
    get_dataframe(con, query) for query in (QUERY_DF_TRAIN, QUERY_DF_TEST)
)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [9]:
# Release the DuckDB connection now that both frames have been fetched;
# no later cell in this notebook uses `con`.
con.close()

In [12]:
def _sanitize_column_name(name):
    """Return ``name`` with every character outside ``[A-Za-z0-9_]`` removed."""
    return re.sub("[^A-Za-z0-9_]+", "", name)


# Apply the same column-name clean-up to both frames so their schemas stay aligned.
# NOTE(review): presumably required because LightGBM rejects feature names
# containing special JSON characters — confirm.
df_full = df_full.rename(columns=_sanitize_column_name)
df_test = df_test.rename(columns=_sanitize_column_name)

In [13]:
# RANDOM_STATE and train_test_split are imported in the notebook's top import
# cell, so every dependency is declared in one place.

# Build a composite stratification key (class label + month) so the 5%
# validation hold-out preserves the class distribution within each foto_mes.
# The helper column is added to df_full in place and is therefore carried into
# both partitions returned by the split.
df_full["stratify"] = df_full["clase_ternaria"].astype(str) + df_full["foto_mes"].astype(str)
df_train, df_valid = train_test_split(
    df_full, test_size=0.05, random_state=RANDOM_STATE, stratify=df_full["stratify"]
)

In [14]:
# The stratification key was only needed for the split; remove it from both
# partitions. `columns=` already selects the axis, so no `axis` argument is needed.
df_train = df_train.drop(columns="stratify")
df_valid = df_valid.drop(columns="stratify")

In [15]:
# Select the MLflow run to reuse. `run_name` is also used further down to
# locate the predictions file; `logged_model` is the URI of the model artifact.
# NOTE(review): the run name and run ID are hard-coded independently — confirm
# they refer to the same run.
run_name = "thundering-kite-46"
logged_model = "runs:/a0819ce0fa5747b3b96dbda2d19c7f8e/model"

# Load the model through MLflow's LightGBM flavor.
loaded_model = mlflow.lightgbm.load_model(model_uri=logged_model)

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

In [16]:
# Inspect the hyperparameters stored with the loaded model.
loaded_model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.2663348498414332,
 'max_depth': 253,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'num_leaves': 193,
 'objective': None,
 'random_state': 42,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'lambda_l1': 0.029252283499044344,
 'lambda_l2': 0.002858569462782785,
 'min_data_in_leaf': 6600,
 'min_gain_to_split': 2.030549544977002,
 'bagging_freq': 5,
 'bagging_fraction': 0.7000000000000001,
 'feature_fraction': 0.8,
 'verbosity': -1}

In [17]:
# Use all available cores. Go through the estimator's parameter API rather than
# assigning the attribute directly: for LightGBM estimators, parameters passed
# as extra keyword arguments (e.g. lambda_l1, min_data_in_leaf) are only updated
# reliably via set_params, so using it everywhere avoids silently ignored edits.
# The trailing semicolon suppresses the estimator repr that set_params returns.
loaded_model.set_params(n_jobs=-1);

In [18]:
# Re-inspect the parameters to confirm that n_jobs is now -1.
loaded_model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.2663348498414332,
 'max_depth': 253,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 193,
 'objective': None,
 'random_state': 42,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'lambda_l1': 0.029252283499044344,
 'lambda_l2': 0.002858569462782785,
 'min_data_in_leaf': 6600,
 'min_gain_to_split': 2.030549544977002,
 'bagging_freq': 5,
 'bagging_fraction': 0.7000000000000001,
 'feature_fraction': 0.8,
 'verbosity': -1}

In [None]:
# Hand the train/validation/test frames and the loaded estimator to the
# project's per-seed inference routine, labelled with this run's name.
# NOTE(review): presumably refits the model per seed (with early stopping on
# the validation frame) and writes the predictions CSV read in the next cell —
# confirm in src.model.inference.
predictions_per_seed(df_train, df_valid, df_test, loaded_model, run_name)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[54]	train's auc: 0.976748	valid's auc: 0.975564


In [None]:
# Load the predictions file stored under this run's name.
predictions_path = f"../buckets/b1/datasets/processed/predictions/{run_name}/predictions.csv"
df_preds = pd.read_csv(predictions_path)

In [None]:
# Preview the first rows of the loaded predictions.
df_preds.head()

In [None]:
# Absolute count of each distinct value in the "Predicted" column.
df_preds["Predicted"].value_counts()

In [None]:
# Same breakdown expressed as relative frequencies instead of counts.
df_preds["Predicted"].value_counts(normalize=True)