In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

# Load parquet data
train = pd.read_parquet("data/train.parquet")
val   = pd.read_parquet("data/val.parquet")
test  = pd.read_parquet("data/test.parquet")

train.shape, val.shape, test.shape


((307918, 23), (87384, 23), (43532, 23))

In [2]:
target = "skip"

features = [c for c in train.columns if c != target]

X_train = train[features]
y_train = train[target]

X_val   = val[features]
y_val   = val[target]


In [3]:
categorical = X_train.select_dtypes(include=["object"]).columns.tolist()
numeric     = X_train.select_dtypes(include=["number"]).columns.tolist()

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", "passthrough", numeric),
    ]
)


In [4]:
log_reg = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=200))
])

log_reg.fit(X_train, y_train)

preds_val = log_reg.predict_proba(X_val)[:, 1]

auc = roc_auc_score(y_val, preds_val)
auc


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  ret = a @ b
  ret = a @ b
  ret = a @ b


np.float64(0.6405229840256639)

In [5]:
gb = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", GradientBoostingClassifier())
])

gb.fit(X_train, y_train)
preds_val = gb.predict_proba(X_val)[:, 1]

roc_auc_score(y_val, preds_val)


np.float64(0.6509182887392206)