# Import

In [None]:
import numpy as np

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

from sklearn.metrics import f1_score, accuracy_score, auc, roc_curve

# Load Data

In [None]:
# Open the archive a single time and close it once both arrays are read;
# a bare np.load() on an .npz keeps the underlying file handle open.
with np.load('data/reshaped/imputed_pre_and_manifest_12m_5y.npz') as archive:
    cols = archive['columns']
    # Flatten to a 2-D table with one column per entry in `cols`.
    data = archive['data'].reshape((-1, len(cols)))

# Discard rows in which every value is NaN.
data = data[~np.all(np.isnan(data), axis=1)]

# Train

In [None]:
# Features are every column except 'drive'; 'drive' is the target.
X = data[:, cols != 'drive']
y = data[:, cols == 'drive'].reshape(-1)

# Inverse-frequency ("balanced") class weights keyed by the labels that are
# actually present: n_samples / (n_classes * count). Weighting a class by
# its own frequency would up-weight the majority class and make any
# imbalance worse.
labels, counts = np.unique(y, return_counts=True)
class_weight = dict(zip(labels.tolist(), counts.sum() / (len(labels) * counts)))

# 10-fold shuffled cross-validation with a fixed seed for reproducibility.
kf = KFold(n_splits=10, random_state=0, shuffle=True)
metrics = []  # one [AUC, accuracy, F1] row per fold

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    est = LGBMClassifier(class_weight=class_weight)
    est.fit(X_train, y_train)

    # Hard labels feed the threshold-dependent metrics (accuracy, F1).
    y_test_pred = est.predict(X_test)
    # ROC/AUC needs a continuous score: with hard 0/1 predictions the curve
    # collapses to a single operating point. Column 1 of predict_proba is
    # the positive class for sorted binary labels {0, 1}.
    y_test_score = est.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_test_score, pos_label=1)
    AUC = auc(fpr, tpr)
    ACC = accuracy_score(y_test, y_test_pred)
    F1 = f1_score(y_test, y_test_pred)
    metrics.append([AUC, ACC, F1])

# Mean [AUC, accuracy, F1] across the folds (displayed as the cell output).
np.array(metrics).mean(axis=0)