In [None]:
import numpy as np 
import pandas as pd 
import gc
import os

from sklearn.model_selection import StratifiedKFold
import time
from tensorflow import keras
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, PowerTransformer
import joblib 

## Constants:

In [None]:
# Seed passed to seed_everything() and to the StratifiedKFold splitter;
# it is also embedded in the out-of-fold CSV file name.
RANDOM_STATE = 84 

# Value that replaces missing entries (via fillna) in both the train and test frames.
NAN_VALUE = 0

#k-folds:
# Number of cross-validation folds; one scaler and one model are saved per fold
# and all of them are averaged at test time.
FOLDS = 5

In [None]:
def seed_everything(seed):
    """Seed the random number generators this notebook relies on.

    Exports ``seed`` as the ``PYTHONHASHSEED`` environment variable and then
    hands it to the NumPy, TensorFlow and Keras seeding functions.

    Args:
        seed: Value given to every seeding call (stringified for the
            environment variable).
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    # Seed NumPy, TensorFlow and Keras in turn.
    seeders = (np.random.seed, tf.random.set_seed, keras.utils.set_random_seed)
    for apply_seed in seeders:
        apply_seed(seed)


# Seed once, before any data is loaded or any model is built.
seed_everything(RANDOM_STATE)

In [None]:
%%time
train = pd.read_parquet("data_v29/train_processed.parquet")
train = train.sort_values("customer_ID").reset_index(drop=True)
print(train.shape)
features = [col for col in train.columns if "target" not in col and "customer_ID" not in col]
gc.collect()

In [None]:
# https://www.kaggle.com/code/carlmcbrideellis/tabular-classification-with-neural-networks-keras/notebook

### AmEx metric:

In [None]:
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
# https://www.kaggle.com/code/rohanrao/amex-competition-metric-implementations

def amex_metric_numpy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute the AmEx default-prediction competition metric with NumPy.

    The score is the mean of two terms:

    * ``d`` - the share of all positives that fall inside the top 4% of
      cumulative row weight once rows are ranked by descending prediction;
    * ``g`` - the weighted Gini coefficient divided by its maximum value.

    Rows with ``y_true == 0`` carry weight 20 and rows with ``y_true == 1``
    carry weight 1.

    Args:
        y_true: Binary labels (0/1). Any array-like; flattened to 1-D.
        y_pred: Predicted scores, higher meaning "more likely positive".
            Any array-like with the same number of elements as ``y_true``
            (e.g. the ``(n, 1)`` output of ``model.predict`` is accepted).

    Returns:
        ``0.5 * (g + d)``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` hold a different number of
            elements.

    Note:
        With no positive labels the divisions by ``n_pos`` yield NaN.
    """
    # Coerce to flat arrays so lists, Series and column vectors all behave like
    # the 1-D ndarrays the positional indexing below relies on.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting by descending prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

In [None]:
# Raw categorical feature names.
original_cat_col = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
# Derived column names: every "_last" variant first, then every "_2ndlast" variant.
cat_col = [
    base_name + suffix
    for suffix in ("_last", "_2ndlast")
    for base_name in original_cat_col
]

In [None]:
# Replace every missing value in the training frame with NAN_VALUE.
# Done in place: `train` itself is modified and no new frame is bound.
train.fillna(NAN_VALUE, inplace=True)

In [None]:
#%%time
# Stratified K-fold training of a small dense network.
# Per fold: fit a MaxAbsScaler on the training split, train with early stopping
# on validation AUC, save the scaler and the model, then score the validation
# split with the AmEx metric. Out-of-fold predictions end up in `df_OOF`.


def build_ann(n_features, learning_rate=1e-4):
    """Build and compile the fold network.

    Architecture: Dense(128, swish) -> Dropout(0.3) -> Dense(64, swish)
    -> Dense(32, swish) -> Dense(1, sigmoid), trained with Adam on binary
    cross-entropy.

    Args:
        n_features: Number of input columns.
        learning_rate: Adam learning rate.

    Returns:
        A compiled ``keras.Sequential`` model whose only metric is an AUC
        named ``"auc"``.
    """
    model = keras.Sequential(
        [
            keras.layers.Dense(units=128, activation="swish", input_shape=(n_features,)),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(units=64, activation="swish"),
            keras.layers.Dense(units=32, activation="swish"),
            # the output layer, with a single neuron
            keras.layers.Dense(units=1, activation="sigmoid"),
        ]
    )
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="binary_crossentropy",
        # A fresh metric object per model (rather than one shared by all folds).
        # The explicit name keeps the logged key at "val_auc" in every fold.
        metrics=[keras.metrics.AUC(name="auc")],
    )
    return model


# Make sure the output folders exist before the first dump/save.
os.makedirs("ANN/ANN_scalers", exist_ok=True)
os.makedirs("ANN/models", exist_ok=True)

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
acc_list = []
y_val_full = []
y_pred_full = []
customer_id_list = []

# Only the labels drive a stratified split, so a zero placeholder stands in for
# X and the feature frame does not have to be copied just to create the folds.
fold_splits = skf.split(np.zeros(len(train)), train["target"])

for fold, (train_index, val_index) in enumerate(fold_splits):

    print(f"starting fold {fold}")
    # Select the feature columns once per fold, then slice both splits from it.
    fold_features = train[features]
    x_train_fold = fold_features.iloc[train_index]
    x_val_fold = fold_features.iloc[val_index]
    del fold_features
    y_train_fold, y_val_fold = train["target"].iloc[train_index], train["target"].iloc[val_index]

    print("starting transforming")
    # The scaler is fitted on the training split only and then applied to the
    # validation split; it is saved so the test cell can reuse it.
    scaler = MaxAbsScaler()
    x_train_fold = scaler.fit_transform(x_train_fold)
    x_val_fold = scaler.transform(x_val_fold)
    joblib.dump(scaler, f"ANN/ANN_scalers/scaler_fit_fold{fold}.pkl")
    print(f"Transformed and saved scaler fold {fold}")
    del scaler
    gc.collect()

    model = build_ann(x_train_fold.shape[-1])

    # Stop after 20 epochs without a better validation AUC and roll back to the
    # best weights. The direction is stated explicitly instead of being inferred
    # from the metric name.
    earlystop = keras.callbacks.EarlyStopping(
        monitor="val_auc",
        mode="max",
        patience=20,
        restore_best_weights=True,
    )
    print(f"ANN fold #: {fold}")
    model.fit(
        x_train_fold,
        y_train_fold,
        epochs=200,
        batch_size=2048,
        validation_data=(x_val_fold, y_val_fold),
        verbose=1,
        callbacks=[earlystop],
    )
    del x_train_fold, y_train_fold
    gc.collect()

    print("Start predicting")
    y_pred = model.predict(x_val_fold)
    model.save(f"ANN/models/ANN_model_fold{fold}")
    del x_val_fold, model
    gc.collect()
    print("metric")

    # `acc` holds the AmEx metric for this fold (not classification accuracy).
    acc = amex_metric_numpy(y_val_fold.values, y_pred.ravel())
    print("appending")
    acc_list.append(acc)
    y_val_full.append(y_val_fold.to_numpy().ravel())
    y_pred_full.append(y_pred.ravel())
    customer_id_list.append(train["customer_ID"].iloc[val_index])
    print(f"fold {fold}, acc: {acc}")
    del y_val_fold, acc, y_pred

print(f"ANN mean: {np.mean(acc_list)}")

# Metric over all out-of-fold predictions at once (the folds are concatenated in
# fold order, identically for labels, predictions and customer IDs).
y_val_full_array = np.concatenate(y_val_full, axis=None)
y_pred_full_array = np.concatenate(y_pred_full, axis=None)
acc_full = amex_metric_numpy(y_val_full_array, y_pred_full_array)
print(f"Acc over full dataset: {acc_full}")

customer_id_list_conc = np.concatenate(customer_id_list, axis=None)
df_OOF = pd.DataFrame(
    {
        "customer_ID": pd.Series(customer_id_list_conc),
        "prediction": pd.Series(y_pred_full_array),
    }
)

del y_val_full_array, y_pred_full_array, acc_full
gc.collect()



In [None]:
# 0.7840882673961607 standardscaler
# 0.7853181450312552 maxabs
# 0.7835603347923932 minmax
# 0.7858811083977235 maxabs 

In [None]:
# Display the out-of-fold table (columns: customer_ID, prediction).
df_OOF

In [None]:
# Persist the out-of-fold predictions; the file name carries the seed.
# to_csv is called without index=False, so the row index is written as well.
df_OOF.to_csv(f"ANN/ANN_V2_data29_seed{RANDOM_STATE}_OOF.csv")

In [None]:
# The training frame is not used by any later cell; drop it and collect
# garbage before the test chunks are loaded.
del train
gc.collect()

# test:

In [None]:
%%time 
chunks = 8
test_rows = 924621

test_submission = pd.DataFrame()

for chunk in range(chunks):

    print(f"chunk:{chunk+1}")
    test = pd.read_parquet(f"data_v29/test_processed_chunk{chunk}.parquet")
    
    print(f"Loaded chunk#{chunk+1}")
    gc.collect()
    test_submission_chunk = pd.DataFrame()
    test_submission_chunk["customer_ID"] = test["customer_ID"]
    test = test.drop(columns=["customer_ID"])

    test.fillna(NAN_VALUE, inplace=True)
    
    predictions = []
    print("Starting predictions:")
    for fold in range(FOLDS):

        model = keras.models.load_model(f"ANN/models/ANN_model_fold{fold}")
        scaler = joblib.load(f"ANN/ANN_scalers/scaler_fit_fold{fold}.pkl")
        test_scaled   = scaler.transform(test)
        test_pred = model.predict(test_scaled)
        predictions.append(test_pred.ravel())
        
        print(f"prediction added for fold:{fold}")
        del model, test_pred,scaler
        gc.collect()
    mean_predictions = np.mean(np.column_stack(predictions), axis=1)
    test_submission_chunk["prediction"] = mean_predictions
    test_submission = test_submission.append(test_submission_chunk)
    del test_submission_chunk
    gc.collect()
    
    
print("Finished predicting!")

In [None]:
# Number of rows in the assembled submission (one per test-chunk row).
len(test_submission)

In [None]:
# Display the submission table (columns: customer_ID, prediction).
test_submission

In [None]:
# test_submission.describe()

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
test_submission.to_csv("ANN/ANN_V2_dataV29_submission.csv", index=False)