In [None]:
import pandas as pd
import gc
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import make_pipeline

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score

In [42]:
# Load the competition data
comp_data = pd.read_csv("data/competition_data.csv")

# Split into training and evaluation samples

train_data = comp_data[comp_data["ROW_ID"].isna()]
eval_data = comp_data[comp_data["ROW_ID"].notna()]
del comp_data
gc.collect()

train_data, validation_data = train_test_split(train_data, test_size=0.33, train_size=0.67, random_state=42)

y_train = train_data["conversion"]
X_train = train_data.drop(columns=["conversion", "ROW_ID"])
X_train = X_train.select_dtypes(include='number')

y_validation = validation_data["conversion"]
X_validation = validation_data.drop(columns=["conversion", "ROW_ID"])
X_validation = X_validation.select_dtypes(include='number')

del train_data
del validation_data

gc.collect()


0

In [43]:
dtc = make_pipeline(
    SimpleImputer(),
    RandomizedSearchCV(
        DecisionTreeClassifier(random_state=2345),
        param_distributions={
            "max_depth": [2, 4, 8, 16, 32, 64, 128, 256, 512, None],
            "min_samples_split": [2, 4, 8, 16, 32, 64, 128, 256, 512],
            "min_samples_leaf": [2, 4, 8, 16, 32, 64, 128, 256, 512],
            "max_features": [2, 4, 8, 16, 32, 64, 128, 256, 512, None],
        },
        n_iter=100,
        cv=3,
        random_state=2345,
        n_jobs=-1,
        verbose=2,
    )
)
dtc.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=16, max_features=2, min_samples_leaf=256, min_samples_split=4; total time=   0.4s
[CV] END max_depth=16, max_features=2, min_samples_leaf=256, min_samples_split=4; total time=   0.4s
[CV] END max_depth=16, max_features=2, min_samples_leaf=256, min_samples_split=4; total time=   0.4s
[CV] END max_depth=None, max_features=4, min_samples_leaf=256, min_samples_split=32; total time=   0.5s
[CV] END max_depth=None, max_features=4, min_samples_leaf=256, min_samples_split=32; total time=   0.5s
[CV] END max_depth=None, max_features=4, min_samples_leaf=256, min_samples_split=32; total time=   0.4s
[CV] END max_depth=256, max_features=4, min_samples_leaf=2, min_samples_split=512; total time=   0.5s
[CV] END max_depth=256, max_features=4, min_samples_leaf=2, min_samples_split=512; total time=   0.5s
[CV] END max_depth=256, max_features=4, min_samples_leaf=2, min_samples_split=512; total time=   0.4s
[CV] END max_dep

In [50]:
rfs = make_pipeline(
    SimpleImputer(),
    RandomizedSearchCV(
        RandomForestClassifier(random_state=2345),
        param_distributions={
            "max_depth": [2, 4, 8, 16, 32, 64, 128, 256, 512, None],
            "min_samples_split": [2, 4, 8, 16, 32, 64, 128, 256, 512],
            "min_samples_leaf": [2, 4, 8, 16, 32, 64, 128, 256, 512],
            "max_features": [2, 4, 8, 16, 32, 64, 128, 256, 512, None],
        },
        n_iter=100,
        cv=3,
        random_state=2345,
        n_jobs=-1,
        verbose=2,
    )
)

rfs.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=16, max_features=2, min_samples_leaf=256, min_samples_split=4; total time=  12.2s
[CV] END max_depth=16, max_features=2, min_samples_leaf=256, min_samples_split=4; total time=  12.4s
[CV] END max_depth=16, max_features=2, min_samples_leaf=256, min_samples_split=4; total time=  12.5s
[CV] END max_depth=None, max_features=4, min_samples_leaf=256, min_samples_split=32; total time=  21.8s
[CV] END max_depth=None, max_features=4, min_samples_leaf=256, min_samples_split=32; total time=  22.5s
[CV] END max_depth=None, max_features=4, min_samples_leaf=256, min_samples_split=32; total time=  23.0s
[CV] END max_depth=256, max_features=4, min_samples_leaf=2, min_samples_split=512; total time=  31.8s
[CV] END max_depth=256, max_features=4, min_samples_leaf=2, min_samples_split=512; total time=  33.0s
[CV] END max_depth=256, max_features=4, min_samples_leaf=2, min_samples_split=512; total time=  30.8s


In [47]:
#cls.score(X_validation, y_validation)

y_preds_cls = dtc.predict_proba(X_validation)[:, dtc.classes_ == 1].squeeze()
roc_auc_score(y_validation, y_preds_cls)

y_preds_rfs = rfs.predict_proba(X_validation)[:, rfs.classes_ == 1].squeeze()
roc_auc_score(y_validation, y_preds_rfs)


0.729350839606164

In [45]:
# Predict on the evaluation set
eval_data = eval_data.drop(columns=["conversion"])
eval_data = eval_data.select_dtypes(include='number')
y_preds = dtc.predict_proba(eval_data.drop(columns=["ROW_ID"]))[:, dtc.classes_ == 1].squeeze()

In [46]:
# Make the submission file
submission_df = pd.DataFrame({"ROW_ID": eval_data["ROW_ID"], "conversion": y_preds})
submission_df["ROW_ID"] = submission_df["ROW_ID"].astype(int)
submission_df.to_csv("los_simuladores_submission.csv", sep=",", index=False)