In [17]:
import pandas as pd
from sklearn.model_selection import (
    train_test_split,
    cross_validate,
)

# Prepare input data
df = pd.read_csv("../assets/master_table.csv", sep=";", encoding="latin1")
df.drop(columns=["id"], inplace=True)
df['data'] = pd.to_datetime(df['data'], format='%Y-%m-%d')

# @TODO hash df input, to reproduce it later on 

# Split the data into train, holdout, and calibration sets
train_set, holdout_set = train_test_split(
    df,
    stratify=df['is_revenge_spending'],
    shuffle=True,
    test_size=0.20,
    random_state=42,
)

holdout_set, calibration_set = train_test_split(
    holdout_set,
    stratify=None,
    shuffle=True,
    test_size=0.10,
    random_state=42,
)

In [18]:
features = ['safra_abertura', 'cidade', 'estado', 'idade', 'sexo', 'limite_total',
       'limite_disp', 'data', 'valor', 'grupo_estabelecimento',
       'cidade_estabelecimento', 'pais_estabelecimento']

In [19]:
X_train = train_set[features].copy()
y_train = train_set['is_revenge_spending'].copy()

X_holdout = holdout_set[features].copy()
y_holdout = holdout_set['is_revenge_spending'].copy()

X_calibration = calibration_set[features].copy()
y_calibration = calibration_set['is_revenge_spending'].copy()

In [20]:
import numpy as np
from dirty_cat import GapEncoder, SuperVectorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import (
    DropConstantFeatures,
    DropDuplicateFeatures,
) 
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

DATETIME_FEATURES = [
    "month",
    "quarter",
    "week",
    "day_of_week",
    "day_of_month",
    "weekend",
    "quarter_start",
    "quarter_end",
    "year_start",
    "year_end",
]

def get_vectorizer(
        datetime_pipeline = make_pipeline(
            DropConstantFeatures(tol=0.998, missing_values="ignore"),
            DatetimeFeatures(
                missing_values="ignore", features_to_extract=DATETIME_FEATURES
            ),
        ),
        low_card_transformer = make_pipeline(
            DropConstantFeatures(tol=0.998, missing_values="ignore"),
            SimpleImputer(
                missing_values=np.nan,
                add_indicator=True,
                strategy="constant",
                fill_value="missing",
            ),
            OrdinalEncoder(
                handle_unknown="use_encoded_value", unknown_value=-1
            ),
        ),
        high_card_transformer = make_pipeline(
            DropConstantFeatures(tol=0.998, missing_values="ignore"),
            GapEncoder(hashing=True, random_state=42),
        ),
        numerical_transformer = make_pipeline(
            DropConstantFeatures(tol=0.998, missing_values="ignore"),
            DropDuplicateFeatures(),
            SimpleImputer(
                missing_values=np.nan, add_indicator=True, strategy="median"
            ),
        )
):
    return SuperVectorizer(
            auto_cast=True,
            n_jobs=2,
            low_card_cat_transformer=low_card_transformer,
            high_card_cat_transformer=high_card_transformer,
            numerical_transformer=numerical_transformer,
            datetime_transformer=datetime_pipeline,
            impute_missing="force",
            remainder="drop",
    )

vectorizer = get_vectorizer()
pipeline = make_pipeline(
    vectorizer, 
    RandomForestClassifier()
)


pipeline.fit(np.array(X_train), np.array(y_train))

  df = df.replace(STR_NA_VALUES + [None, "?", "..."], np.nan)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(value=np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(value=np.nan, inplace=True)


In [21]:
model_scores = cross_validate(
        pipeline,
        X_holdout,
        y_holdout,
        n_jobs=2,
    )

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(value=np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(value=np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always be

In [22]:
model_scores

{'fit_time': array([0.31137633, 0.30238771, 0.30260038, 0.3307538 , 0.23247051]),
 'score_time': array([0.03331971, 0.03441858, 0.035676  , 0.03293085, 0.03306746]),
 'test_score': array([0.94972067, 0.9494382 , 0.95505618, 0.96067416, 0.96067416])}

In [24]:
import joblib

model = joblib.load("pipeline.pkl")

In [28]:
import numpy as np
from pandas import Timestamp

data = np.array([201905, 'SAO PAULO             ', 'SP ', 39, 'M', 18000, 18126,
       Timestamp('2019-11-02 00:00:00'), 8, 'POSTO DE GAS', 'SAO PAULO',
       'BR'], dtype=object)

data = data.reshape(1, -1)

In [29]:
df[features].columns

Index(['safra_abertura', 'cidade', 'estado', 'idade', 'sexo', 'limite_total',
       'limite_disp', 'data', 'valor', 'grupo_estabelecimento',
       'cidade_estabelecimento', 'pais_estabelecimento'],
      dtype='object')

In [30]:
import numpy as np
from pandas import Timestamp

# Predict using the pipeline
prediction = model.predict(data)


  df = df.replace(STR_NA_VALUES + [None, "?", "..."], np.nan)


In [33]:
prediction[0]

0