In [39]:
import pandas as pd
from sklearn.model_selection import (
    train_test_split,
    cross_validate,
)

# Prepare input data
df = pd.read_csv("../assets/master_table.csv", sep=";", encoding="latin1")
df.drop(columns=["id"], inplace=True)
df['data'] = pd.to_datetime(df['data'], format='%d.%m.%Y')

# @TODO hash df input, to reproduce it later on 

# Split the data into train, holdout, and calibration sets
train_set, holdout_set = train_test_split(
    df,
    stratify=df['is_revenge_spending'],
    shuffle=True,
    test_size=0.20,
    random_state=42,
)

holdout_set, calibration_set = train_test_split(
    holdout_set,
    stratify=None,
    shuffle=True,
    test_size=0.10,
    random_state=42,
)

In [40]:
features = ['safra_abertura', 'cidade', 'estado', 'idade', 'sexo', 'limite_total',
       'limite_disp', 'data', ' valor ', 'grupo_estabelecimento',
       'cidade_estabelecimento', 'pais_estabelecimento']

In [41]:
X_train = train_set[features].copy()
y_train = train_set['is_revenge_spending'].copy()

X_holdout = holdout_set[features].copy()
y_holdout = holdout_set['is_revenge_spending'].copy()

X_calibration = calibration_set[features].copy()
y_calibration = calibration_set['is_revenge_spending'].copy()

In [42]:
import numpy as np
from dirty_cat import GapEncoder, SuperVectorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import (
    DropConstantFeatures,
    DropDuplicateFeatures,
) 
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

DATETIME_FEATURES = [
    "month",
    "quarter",
    "week",
    "day_of_week",
    "day_of_month",
    "weekend",
    "quarter_start",
    "quarter_end",
    "year_start",
    "year_end",
]

def get_vectorizer(
        datetime_pipeline = make_pipeline(
            DropConstantFeatures(tol=0.998, missing_values="ignore"),
            DatetimeFeatures(
                missing_values="ignore", features_to_extract=DATETIME_FEATURES
            ),
        ),
        low_card_transformer = make_pipeline(
            DropConstantFeatures(tol=0.998, missing_values="ignore"),
            SimpleImputer(
                missing_values=np.nan,
                add_indicator=True,
                strategy="constant",
                fill_value="missing",
            ),
            OrdinalEncoder(
                handle_unknown="use_encoded_value", unknown_value=-1
            ),
        ),
        high_card_transformer = make_pipeline(
            DropConstantFeatures(tol=0.998, missing_values="ignore"),
            GapEncoder(hashing=True, random_state=42),
        ),
        numerical_transformer = make_pipeline(
            DropConstantFeatures(tol=0.998, missing_values="ignore"),
            DropDuplicateFeatures(),
            SimpleImputer(
                missing_values=np.nan, add_indicator=True, strategy="median"
            ),
        )
):
    return SuperVectorizer(
            auto_cast=True,
            n_jobs=2,
            low_card_cat_transformer=low_card_transformer,
            high_card_cat_transformer=high_card_transformer,
            numerical_transformer=numerical_transformer,
            datetime_transformer=datetime_pipeline,
            impute_missing="force",
            remainder="drop",
    )

vectorizer = get_vectorizer()
pipeline = make_pipeline(
    vectorizer, 
    RandomForestClassifier()
)


pipeline.fit(X_train, y_train)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(value=np.nan, inplace=True)


In [43]:
pipeline

In [44]:
model_scores = cross_validate(
        pipeline,
        X_holdout,
        y_holdout,
        n_jobs=2,
    )

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(value=np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(value=np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always be

In [45]:
model_scores

{'fit_time': array([0.75090361, 0.59892106, 0.74548435, 0.63713503, 0.56815004]),
 'score_time': array([0.03444409, 0.04476118, 0.03496361, 0.034199  , 0.03517985]),
 'test_score': array([0.94413408, 0.95505618, 0.94382022, 0.94382022, 0.96067416])}

In [54]:
first_row = df.iloc[0].values


In [55]:
first_row

array([201405, 'CAMPO LIMPO PAULISTA  ', 'SP ', 37, 'F', 4700, 5605,
       Timestamp('2019-12-04 00:00:00'), '31', 'SERVI\x82O', 'SAO PAULO',
       'BR', 1, 119, 0], dtype=object)

In [59]:
import joblib

model = joblib.load("pipeline.pkl")



In [66]:
model.feature_names_in_

array(['safra_abertura', 'cidade', 'estado', 'idade', 'sexo',
       'limite_total', 'limite_disp', 'data', ' valor ',
       'grupo_estabelecimento', 'cidade_estabelecimento',
       'pais_estabelecimento'], dtype=object)

In [None]:
if not X_holdout.empty:
    model.predict(X_holdout.iloc[0])
