In [None]:
import ast
import json
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import uniform
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, RocCurveDisplay
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, PredefinedSplit
from xgboost import XGBRegressor

In [None]:
# Read the static table and the validation / test tables; the first CSV column is the index.
static_df = pd.read_csv("datasets/static_data.csv", index_col=[0])

# Columns removed from both the validation and the test table before modelling.
_excluded_columns = [
    "FBFM3", "FBFM12", "FBFM13",
    'TMIN_prev1', 'TMIN_prev2', 'TMIN_prev3',
    'TMAX_prev1', 'TMAX_prev2', 'TMAX_prev3',
]

validation_df = (
    pd.read_csv("datasets/validation_2020.csv", index_col=[0])
    .drop(columns=_excluded_columns)
)

test_df = (
    pd.read_csv("datasets/test_2021.csv", index_col=[0])
    .drop(columns=_excluded_columns)
)

In [None]:
# Training samples datasets/group/0.csv ... datasets/group/19.csv, keyed by their number.
# The same columns that are dropped from the validation/test tables are dropped here.
sample_datasets = {
    sample_id: pd.read_csv(
        f"datasets/group/{sample_id}.csv", index_col=[0]
    ).drop(
        columns=[
            "FBFM3", "FBFM12", "FBFM13",
            'TMIN_prev1', 'TMIN_prev2', 'TMIN_prev3',
            'TMAX_prev1', 'TMAX_prev2', 'TMAX_prev3',
        ]
    )
    for sample_id in range(20)
}

In [None]:
def profile_difference_score(clf, X, y, n=426327, n_periods=12):
    """Total absolute gap between actual and predicted per-window totals.

    The rows are cut into ``n_periods`` consecutive windows of ``n`` rows.
    For each window the sum of the true targets is compared with the sum of
    the predicted positive-class probabilities; the absolute differences are
    added up. Windows that reach past the end of the data are simply shorter
    (or empty, contributing 0).

    Has the ``(estimator, X, y)`` shape, so it can be passed as a
    scikit-learn ``scoring`` callable; the extra arguments have defaults.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict_proba``.
    X : features to predict on.
    y : array-like of true targets, in the same row order as ``X``.
    n : int, default 426327
        Rows per window. NOTE(review): presumably the number of rows per
        period in the validation table -- confirm against the data.
    n_periods : int, default 12
        Number of consecutive windows.

    Returns
    -------
    Sum over all windows of ``|actual total - predicted total|``.
    """
    # ndarray view of the targets so window slices are always positional,
    # whatever index a pandas Series passed in by the caller carries.
    y_true = np.asarray(y)
    y_preds = clf.predict_proba(X)[:, 1]

    windows = [slice(k * n, (k + 1) * n) for k in range(n_periods)]
    diffs = [abs(np.sum(y_true[w]) - np.sum(y_preds[w])) for w in windows]
    return np.sum(diffs)

def breach_score(clf, X, y, n=426327, n_periods=12):
    """Count the windows whose predicted total reaches or exceeds the actual total.

    The rows are cut into ``n_periods`` consecutive windows of ``n`` rows.
    A window counts as a "breach" when ``actual total - predicted total <= 0``,
    where the predicted total is the sum of positive-class probabilities.
    An empty window (past the end of the data) has both totals equal to 0 and
    therefore counts as a breach.

    Has the ``(estimator, X, y)`` shape, so it can be passed as a
    scikit-learn ``scoring`` callable; the extra arguments have defaults.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict_proba``.
    X : features to predict on.
    y : array-like of true targets, in the same row order as ``X``.
    n : int, default 426327
        Rows per window. NOTE(review): presumably the number of rows per
        period in the validation table -- confirm against the data.
    n_periods : int, default 12
        Number of consecutive windows.

    Returns
    -------
    Number of breached windows.
    """
    # ndarray view of the targets so window slices are always positional.
    y_true = np.asarray(y)
    y_preds = clf.predict_proba(X)[:, 1]

    breaches = []
    for k in range(n_periods):
        window = slice(k * n, (k + 1) * n)
        shortfall = np.sum(y_true[window]) - np.sum(y_preds[window])
        breaches.append(1 if shortfall <= 0 else 0)
    return np.sum(breaches)

def roc_auc(clf, X, y):
    """ROC AUC of the estimator's positive-class probabilities on ``(X, y)``.

    Scorer-shaped callable ``(estimator, X, y)``: takes column 1 of
    ``clf.predict_proba(X)`` and hands it, with the true targets, to
    ``roc_auc_score``.
    """
    class_probabilities = clf.predict_proba(X)
    return roc_auc_score(y, class_probabilities[:, 1])

### Logistic regression

In [None]:
# Grid search over the logistic-regression class weights, once per training sample.
# Each sample is stacked on top of the validation table and a PredefinedSplit makes
# the search fit on the sample rows only and score on the validation rows only.

# Create the output folder up front so a finished search is never lost at to_csv.
# NOTE(review): the "Best models" cells read from model_selection/logreg/ while this
# cell writes to models/hyperparam_data/logreg/ -- confirm which location is intended.
logreg_results_dir = "models/hyperparam_data/logreg"
os.makedirs(logreg_results_dir, exist_ok=True)

for i, sample_df in sample_datasets.items():
    training_data = pd.concat([sample_df, validation_df], ignore_index=True)

    X_train = training_data[[col for col in training_data.columns if col != 'TARGET']]
    Y_train = training_data["TARGET"]

    # -1 = always in the training part, 0 = the single held-out fold.
    split_index = [-1] * sample_df.shape[0] + [0] * validation_df.shape[0]
    pds = PredefinedSplit(test_fold=split_index)

    model = LogisticRegression(solver='newton-cholesky')

    # Candidate weights for class 0 (class 1 stays at 1), plus the unweighted default.
    w = [{0: class0_weight, 1: 1} for class0_weight in range(50, 2001, 50)]
    parameters = {'class_weight': [None] + w}

    # refit=False: only the cv_results_ table is needed, not a final refitted model.
    selector = GridSearchCV(model, parameters,
                            refit=False, cv=pds, scoring={'profile_diff': profile_difference_score,
                                                          'breach_score': breach_score,
                                                          'auc': roc_auc},
                            verbose=3)
    selector.fit(X_train, Y_train)

    pd.DataFrame(selector.cv_results_).to_csv(f"{logreg_results_dir}/{i}_gridsearch.csv")

## Tree-based models

In [None]:
def profile_difference_score(clf, X, y, n=426327, n_periods=12):
    """Total absolute gap between actual and predicted per-window totals.

    Variant for estimators whose ``predict`` output is summed directly
    (it replaces the earlier ``predict_proba``-based definition of the same
    name once this cell has run).

    The rows are cut into ``n_periods`` consecutive windows of ``n`` rows.
    For each window the sum of the true targets is compared with the sum of
    the predictions; the absolute differences are added up. Windows that
    reach past the end of the data are simply shorter (or empty).

    Has the ``(estimator, X, y)`` shape, so it can be passed as a
    scikit-learn ``scoring`` callable; the extra arguments have defaults.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X : features to predict on.
    y : array-like of true targets, in the same row order as ``X``.
    n : int, default 426327
        Rows per window. NOTE(review): presumably the number of rows per
        period in the validation table -- confirm against the data.
    n_periods : int, default 12
        Number of consecutive windows.

    Returns
    -------
    Sum over all windows of ``|actual total - predicted total|``.
    """
    # ndarray view of the targets so window slices are always positional,
    # whatever index a pandas Series passed in by the caller carries.
    y_true = np.asarray(y)
    y_preds = clf.predict(X)

    windows = [slice(k * n, (k + 1) * n) for k in range(n_periods)]
    diffs = [abs(np.sum(y_true[w]) - np.sum(y_preds[w])) for w in windows]
    return np.sum(diffs)

def breach_score(clf, X, y, n=426327, n_periods=12):
    """Count the windows whose predicted total reaches or exceeds the actual total.

    Variant for estimators whose ``predict`` output is summed directly
    (it replaces the earlier ``predict_proba``-based definition of the same
    name once this cell has run).

    The rows are cut into ``n_periods`` consecutive windows of ``n`` rows.
    A window counts as a "breach" when ``actual total - predicted total <= 0``.
    An empty window (past the end of the data) has both totals equal to 0 and
    therefore counts as a breach.

    Has the ``(estimator, X, y)`` shape, so it can be passed as a
    scikit-learn ``scoring`` callable; the extra arguments have defaults.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X : features to predict on.
    y : array-like of true targets, in the same row order as ``X``.
    n : int, default 426327
        Rows per window. NOTE(review): presumably the number of rows per
        period in the validation table -- confirm against the data.
    n_periods : int, default 12
        Number of consecutive windows.

    Returns
    -------
    Number of breached windows.
    """
    # ndarray view of the targets so window slices are always positional.
    y_true = np.asarray(y)
    y_preds = clf.predict(X)

    breaches = []
    for k in range(n_periods):
        window = slice(k * n, (k + 1) * n)
        shortfall = np.sum(y_true[window]) - np.sum(y_preds[window])
        breaches.append(1 if shortfall <= 0 else 0)
    return np.sum(breaches)

def roc_auc(clf, X, y):
    """ROC AUC of the estimator's ``predict`` output on ``(X, y)``.

    Scorer-shaped callable ``(estimator, X, y)`` for models whose ``predict``
    values are used directly as ranking scores for ``roc_auc_score``.
    """
    scores = clf.predict(X)
    return roc_auc_score(y, scores)

### Random Forest

In [None]:
# Randomized hyper-parameter search for the random forest, once per training sample.
# Each sample is stacked on top of the validation table and a PredefinedSplit makes
# the search fit on the sample rows only and score on the validation rows only.

# Create the output folder up front so a finished search is never lost at to_csv.
# NOTE(review): the "Best models" cells read from model_selection/randomforest/ while
# this cell writes to models/hyperparam_data/randomforest/ -- confirm which is intended.
rf_results_dir = "models/hyperparam_data/randomforest"
os.makedirs(rf_results_dir, exist_ok=True)

for i, sample_df in sample_datasets.items():
    training_data = pd.concat([sample_df, validation_df], ignore_index=True)

    X_train = training_data[[col for col in training_data.columns if col != 'TARGET']]
    Y_train = training_data["TARGET"]

    # -1 = always in the training part, 0 = the single held-out fold.
    split_index = [-1] * sample_df.shape[0] + [0] * validation_df.shape[0]
    pds = PredefinedSplit(test_fold=split_index)

    # Seeded so that repeated runs of the search give the same forests and scores.
    model = RandomForestRegressor(max_features='sqrt', random_state=0)

    parameters = {
        'n_estimators': list(range(50, 2001, 50)),
        'min_samples_leaf': list(range(250, 10001, 250)),
    }

    # random_state=0 makes every sample evaluate the same 50 parameter draws, so the
    # rows of the saved result tables line up across samples.
    selector = RandomizedSearchCV(model, parameters, n_iter=50,
                                  refit=False, cv=pds, scoring={'profile_diff': profile_difference_score,
                                                                'breach_score': breach_score,
                                                                'auc': roc_auc},
                                  verbose=3, random_state=0)

    selector.fit(X_train, Y_train)

    pd.DataFrame(selector.cv_results_).to_csv(f"{rf_results_dir}/{i}_gridsearch.csv")

### XGBRegressor

In [None]:
# Randomized hyper-parameter search for XGBRegressor, once per training sample.
# Each sample is stacked on top of the validation table and a PredefinedSplit makes
# the search fit on the sample rows only and score on the validation rows only.

# Create the output folder up front so a finished search is never lost at to_csv.
# NOTE(review): the "Best models" cells read from model_selection/xgboost/ while this
# cell writes to models/hyperparam_data/xgboost/ -- confirm which location is intended.
xgb_results_dir = "models/hyperparam_data/xgboost"
os.makedirs(xgb_results_dir, exist_ok=True)

for i, sample_df in sample_datasets.items():
    training_data = pd.concat([sample_df, validation_df], ignore_index=True)

    X_train = training_data[[col for col in training_data.columns if col != 'TARGET']]
    Y_train = training_data["TARGET"]

    # -1 = always in the training part, 0 = the single held-out fold.
    split_index = [-1] * sample_df.shape[0] + [0] * validation_df.shape[0]
    pds = PredefinedSplit(test_fold=split_index)

    model = XGBRegressor()

    # The comprehension uses its own variable name so it cannot be confused with the
    # sample number `i` that names the output file.
    parameters = {'n_estimators': list(range(50, 2001, 50)),
                  'max_depth': [2, 3, 5, 7, 9],
                  'learning_rate': [tenths / 10 for tenths in range(1, 11)]}

    # random_state=0 makes every sample evaluate the same 50 parameter draws, so the
    # rows of the saved result tables line up across samples.
    selector = RandomizedSearchCV(model, parameters, n_iter=50,
                                  refit=False, cv=pds, scoring={'profile_diff': profile_difference_score,
                                                                'breach_score': breach_score,
                                                                'auc': roc_auc},
                                  verbose=3, random_state=0)
    selector.fit(X_train, Y_train)

    pd.DataFrame(selector.cv_results_).to_csv(f"{xgb_results_dir}/{i}_gridsearch.csv")

### Best models

In [None]:
# Combine the logistic-regression search results of the 20 samples and report the
# class weight with the highest AUC averaged over samples.
# NOTE(review): the search cell writes to models/hyperparam_data/logreg/ but this cell
# reads from model_selection/logreg/ -- confirm which location is intended.
# NOTE(review): `breach_score` below rebinds the scorer function of the same name;
# re-run the scorer definitions before repeating a search in the same kernel.
auc_per_sample = []
profile_diff_per_sample = []
breach_score_per_sample = []

for i in range(20):
    df = pd.read_csv(f"model_selection/logreg/{i}_gridsearch.csv", index_col=[0])
    auc_per_sample.append(df["mean_test_auc"].values)
    profile_diff_per_sample.append(df["mean_test_profile_diff"].values)
    breach_score_per_sample.append(df["mean_test_breach_score"].values)

# Row 0 of every table is the class_weight=None candidate, which has no dict to parse;
# the remaining rows hold dict literals whose class-0 weight is extracted.
# literal_eval parses the literal without executing arbitrary code from the CSV.
params = df["param_class_weight"][1:].apply(lambda x: ast.literal_eval(x)[0]).values

# One row per candidate, one column per sample.
auc = np.vstack(auc_per_sample).T
profile_diff = np.vstack(profile_diff_per_sample).T
breach_score = np.vstack(breach_score_per_sample).T

mean_auc = np.mean(auc, axis=1)
best_row = int(np.argmax(mean_auc))
# `params` is shifted by one against the result rows (row 0 was skipped). Row 0 must be
# reported as None; indexing params[-1] would silently name the last weight instead.
best_class0_weight = None if best_row == 0 else params[best_row - 1]

print("Logisztikus regresszió")
print("\tMaximum AUC: ", np.max(mean_auc))
print("\twith parameters: ", best_class0_weight)

In [None]:
# Combine the random-forest search results of the 20 samples and report the
# parameter set with the highest AUC averaged over samples.
# NOTE(review): the search cell writes to models/hyperparam_data/randomforest/ but this
# cell reads from model_selection/randomforest/ -- confirm which location is intended.
# NOTE(review): `breach_score` below rebinds the scorer function of the same name;
# re-run the scorer definitions before repeating a search in the same kernel.
auc_per_sample = []
profile_diff_per_sample = []
breach_score_per_sample = []

for i in range(20):
    df = pd.read_csv(f"model_selection/randomforest/{i}_gridsearch.csv", index_col=[0])
    auc_per_sample.append(df["mean_test_auc"].values)
    profile_diff_per_sample.append(df["mean_test_profile_diff"].values)
    breach_score_per_sample.append(df["mean_test_breach_score"].values)

# The "params" column stores dict literals as text; literal_eval parses them without
# executing arbitrary code from the CSV.
params = df["params"].apply(ast.literal_eval).values

# One row per candidate, one column per sample.
auc = np.vstack(auc_per_sample).T
profile_diff = np.vstack(profile_diff_per_sample).T
breach_score = np.vstack(breach_score_per_sample).T

mean_auc = np.mean(auc, axis=1)

# The label used to be the copy-pasted logistic-regression heading.
print("Random Forest")
print("\tMaximum AUC: ", np.max(mean_auc))
print("\twith parameters: ", params[np.argmax(mean_auc)])

In [None]:
# Combine the XGBoost search results of the 20 samples and report the
# parameter set with the highest AUC averaged over samples.
# NOTE(review): the search cell writes to models/hyperparam_data/xgboost/ but this cell
# reads from model_selection/xgboost/ -- confirm which location is intended.
# NOTE(review): `breach_score` below rebinds the scorer function of the same name;
# re-run the scorer definitions before repeating a search in the same kernel.
auc_per_sample = []
profile_diff_per_sample = []
breach_score_per_sample = []

for i in range(20):
    df = pd.read_csv(f"model_selection/xgboost/{i}_gridsearch.csv", index_col=[0])
    auc_per_sample.append(df["mean_test_auc"].values)
    profile_diff_per_sample.append(df["mean_test_profile_diff"].values)
    breach_score_per_sample.append(df["mean_test_breach_score"].values)

# The "params" column stores dict literals as text; literal_eval parses them without
# executing arbitrary code from the CSV.
params = df["params"].apply(ast.literal_eval).values

# One row per candidate, one column per sample.
auc = np.vstack(auc_per_sample).T
profile_diff = np.vstack(profile_diff_per_sample).T
breach_score = np.vstack(breach_score_per_sample).T

mean_auc = np.mean(auc, axis=1)

print("XGBRegressor")
print("\tMaximum AUC: ", np.max(mean_auc))
print("\twith parameters: ", params[np.argmax(mean_auc)])

## Downscale optimization

In [None]:
def scores_logistic(clf, X, y, n=426327, n_periods=12):
    """Evaluate a probabilistic classifier: AUC plus per-window actual/predicted totals.

    Predictions are the positive-class probabilities (column 1 of
    ``predict_proba``). The rows are cut into ``n_periods`` consecutive
    windows of ``n`` rows; for each window the sum of the true targets and
    the sum of the predictions are recorded.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict_proba``.
    X : features to predict on.
    y : array-like of true targets, in the same row order as ``X``.
    n : int, default 426327
        Rows per window. NOTE(review): presumably the number of rows per
        period in the validation table -- confirm against the data.
    n_periods : int, default 12
        Number of consecutive windows.

    Returns
    -------
    tuple ``(roc_auc, total_abs_diff, output_pairs)`` where ``output_pairs``
    is a list of ``(actual_total, predicted_total)`` per window and
    ``total_abs_diff`` is the sum of their absolute differences.
    """
    # ndarray view of the targets so window slices are always positional.
    y_true = np.asarray(y)
    y_preds = clf.predict_proba(X)[:, 1]

    roc_auc = roc_auc_score(y_true, y_preds)

    output_pairs = []
    for k in range(n_periods):
        window = slice(k * n, (k + 1) * n)
        output_pairs.append((np.sum(y_true[window]), np.sum(y_preds[window])))

    diffs = [abs(actual - predicted) for actual, predicted in output_pairs]
    return roc_auc, np.sum(diffs), output_pairs

def scores_tree(clf, X, y, n=426327, n_periods=12):
    """Evaluate a regressor-style model: AUC plus per-window actual/predicted totals.

    Predictions are the raw ``predict`` output. The rows are cut into
    ``n_periods`` consecutive windows of ``n`` rows; for each window the sum
    of the true targets and the sum of the predictions are recorded.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X : features to predict on.
    y : array-like of true targets, in the same row order as ``X``.
    n : int, default 426327
        Rows per window. NOTE(review): presumably the number of rows per
        period in the validation table -- confirm against the data.
    n_periods : int, default 12
        Number of consecutive windows.

    Returns
    -------
    tuple ``(roc_auc, total_abs_diff, output_pairs)`` where ``output_pairs``
    is a list of ``(actual_total, predicted_total)`` per window and
    ``total_abs_diff`` is the sum of their absolute differences.
    """
    # ndarray view of the targets so window slices are always positional.
    y_true = np.asarray(y)
    y_preds = clf.predict(X)

    roc_auc = roc_auc_score(y_true, y_preds)

    output_pairs = []
    for k in range(n_periods):
        window = slice(k * n, (k + 1) * n)
        output_pairs.append((np.sum(y_true[window]), np.sum(y_preds[window])))

    diffs = [abs(actual - predicted) for actual, predicted in output_pairs]
    return roc_auc, np.sum(diffs), output_pairs

In [None]:
def _best_scale(preds, actual, candidates):
    """Pick the factor d that minimises ``sum((d * preds - actual) ** 2)``.

    The unscaled predictions (d = 1) are the baseline; a candidate replaces the
    current best only when its squared error is strictly smaller, so on ties the
    earliest winner is kept and 1 is returned when no candidate beats the baseline.
    """
    best_error = np.sum((preds - actual) ** 2)
    best_factor = 1
    for d in candidates:
        error = np.sum((d * preds - actual) ** 2)
        if error < best_error:
            best_error = error
            best_factor = d
    return best_factor


def _window_totals(output_pairs):
    """Split the ``(actual, predicted)`` pairs into two arrays: actual, predicted."""
    pairs = np.array(output_pairs)
    return pairs[:, 0], pairs[:, 1]


# For every training sample, fit the three chosen models and find the multiplicative
# factor that brings their per-window predicted totals closest (least squares) to the
# actual totals on the validation table.
y_valid = validation_df["TARGET"]
X_valid = validation_df[[col for col in validation_df if col != "TARGET"]]

optimal_scale_params = {'logistic': [], 'random_forest': [], 'xgboost': []}

for i, train_dataset in sample_datasets.items():

    y_train = train_dataset["TARGET"].copy()
    X_train = train_dataset[[col for col in train_dataset if col != "TARGET"]]

    # Only the per-window totals (third return value) are needed. The AUC is not
    # unpacked into `roc_auc`, which would overwrite the scorer function of that name.
    model = LogisticRegression(class_weight={0: 1, 1: 1}, solver="newton-cholesky")
    model.fit(X_train, y_train)
    actual, preds = _window_totals(scores_logistic(model, X_valid, y_valid)[2])
    optimal_scale_params['logistic'].append(
        _best_scale(preds, actual, np.linspace(0.0001, 2, 20000)))

    ##################################################

    # Seeded so the factor can be reproduced for an identically configured forest.
    model = RandomForestRegressor(n_estimators=250, min_samples_leaf=500, random_state=0)
    model.fit(X_train, y_train)
    actual, preds = _window_totals(scores_tree(model, X_valid, y_valid)[2])
    optimal_scale_params['random_forest'].append(
        _best_scale(preds, actual, np.linspace(0.0001, 1, 10000)))

    ########################################################

    model = XGBRegressor(objective='binary:logistic', n_estimators=100, max_depth=5, learning_rate=0.1)
    model.fit(X_train, y_train)
    actual, preds = _window_totals(scores_tree(model, X_valid, y_valid)[2])
    optimal_scale_params['xgboost'].append(
        _best_scale(preds, actual, np.linspace(0.0001, 1, 10000)))

# Make sure the target folder exists before writing the result table.
os.makedirs("models", exist_ok=True)
pd.DataFrame(optimal_scale_params).to_csv("models/optimal_scale_params.csv")

## Saving best models

In [None]:
# Fitted models per model family; each family starts with its own empty list.
best_models = {family: [] for family in ("logistic", "random_forest", "xgboost")}

In [None]:
# Fit the selected configuration of each model family on every training sample and
# persist the fitted models with joblib (imported in the notebook's import cell).

# Start from empty lists so re-running this cell does not append duplicates and then
# dump stale models from a previous run.
for fitted_models in best_models.values():
    fitted_models.clear()

for i, train_dataset in sample_datasets.items():
    y_train = train_dataset["TARGET"].copy()
    X_train = train_dataset[[col for col in train_dataset if col != "TARGET"]]

    model = LogisticRegression(class_weight={0: 1, 1: 1}, solver="newton-cholesky")
    model.fit(X_train, y_train)
    best_models["logistic"].append(model)

    #############################################

    # Seeded so the saved forest can be reproduced.
    model = RandomForestRegressor(n_estimators=250, min_samples_leaf=500, random_state=0)
    model.fit(X_train, y_train)
    best_models["random_forest"].append(model)

    ############################################

    # Same configuration as the XGBoost model used in the downscale-optimization cell
    # (objective='binary:logistic'), so the stored scale factors match the saved model.
    # NOTE(review): the hyper-parameter search itself ran XGBRegressor() with its
    # default objective -- confirm that 'binary:logistic' is the intended final choice.
    model = XGBRegressor(objective='binary:logistic', n_estimators=100, max_depth=5, learning_rate=0.1)
    model.fit(X_train, y_train)
    best_models["xgboost"].append(model)

# Make sure the target folder exists before dumping.
os.makedirs("models/best_models", exist_ok=True)

# Dump however many models were actually fitted instead of assuming exactly 20.
fitted_triples = zip(best_models['logistic'], best_models['random_forest'], best_models['xgboost'])
for i, (logreg_model, rf_model, xgboost_model) in enumerate(fitted_triples):
    joblib.dump(logreg_model, f'models/best_models/logreg_{i}.model')
    joblib.dump(rf_model, f'models/best_models/rf_{i}.model')
    joblib.dump(xgboost_model, f'models/best_models/xgboost_{i}.model')