# Intro
This file contains a collection of potentially useful methods, mostly from discarded approaches.

In [None]:
import pandas as pd
def dirtify_prediction(prediction_request):
    """Inject poorly prepared values into the 'EJ' column of a prediction request.

    Use it to check that a model/pipeline fails loudly (or copes) when it is
    fed badly prepared data.

    The frame is modified in place:
      * the row labelled 0 gets a missing value (``pd.NA``) in 'EJ';
      * the row labelled 1 gets the string 'C' in 'EJ'.

    Args:
        prediction_request: pandas DataFrame that has an 'EJ' column.

    Returns:
        The same DataFrame object that was passed in.
    """
    # One-step .loc assignment instead of chained indexing
    # (frame['EJ'][0] = ...): a chained assignment may write to a temporary
    # object and has no effect when pandas Copy-on-Write is active.
    prediction_request.loc[0, 'EJ'] = pd.NA
    prediction_request.loc[1, 'EJ'] = 'C'
    return prediction_request
#prediction_request = dirtify_prediction(prediction_request)

def replace_column_values(dataframe, name_of_columns_to_replace=None, replacement_dictionary=None):
    """Replace values in several columns of a dataframe using one mapping.

    Args:
        dataframe: pandas DataFrame to update. It is modified in place.
        name_of_columns_to_replace: iterable with the names of the columns to
            process. ``None`` (the default) means "no columns".
        replacement_dictionary: object handed to ``Series.replace`` for every
            listed column, typically a ``{old_value: new_value}`` dict.
            ``None`` (the default) means an empty mapping.

    Returns:
        The same dataframe object that was passed in.
    """
    # None sentinels instead of mutable defaults ([] / {}), which would be
    # shared between calls.
    columns = [] if name_of_columns_to_replace is None else name_of_columns_to_replace
    mapping = {} if replacement_dictionary is None else replacement_dictionary
    for column_name in columns:
        dataframe[column_name] = dataframe[column_name].replace(mapping)
    return dataframe

In [None]:
def competition_log_loss(y_true, y_pred):
    '''Class weighted (balanced) binary log loss.

    The log loss is computed separately for the negative and the positive
    class, each one normalised by the number of samples of that class, and
    the two values are averaged.

    Args:
        y_true: array-like of 0/1 labels.
        y_pred: array-like with the predicted probability of class 1.
            Values are clipped to [1e-15, 1 - 1e-15] before taking the log.

    Returns:
        The mean of the two per-class log losses. If one of the classes has
        no samples in y_true its term is 0/0, so the result is nan.
    '''
    # Flatten both inputs: lists are accepted, and an (n, 1) column of
    # predictions can no longer broadcast against an (n,) label vector into
    # an (n, n) matrix, which would silently give a wrong value.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # Number of samples in each class
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)

    # Clip so that log() never receives exactly 0 or 1
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1

    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0)) / N_0
    log_loss_1 = -np.sum(y_true * np.log(p_1)) / N_1

    return (log_loss_0 + log_loss_1) / 2

# LR Plotting

In [None]:
def create_history_for_lr_choosing(epochs=150, start_lr=1e-8, epochs_per_decade=20):
    """Run a learning-rate range test on every fold and return the histories.

    For each split produced by ``skf`` a fresh Conv1D network is trained while
    the learning rate grows exponentially with the epoch number
    (``start_lr * 10 ** (epoch / epochs_per_decade)``). The resulting training
    histories can be plotted (loss vs. learning rate) to choose the best LR.

    The technique for selecting the learning rate is covered in the
    [course](https://www.coursera.org/learn/tensorflow-sequences-time-series-and-prediction).

    Relies on names defined elsewhere in the notebook: ``skf``, ``train``,
    ``df``, ``pre_processing``, ``BalancedLogLoss``, ``Sequential``, ``Input``,
    ``layers`` and ``tf``.

    Args:
        epochs: number of training epochs per fold.
        start_lr: learning rate used at epoch 0.
        epochs_per_decade: number of epochs after which the learning rate has
            been multiplied by 10.

    Returns:
        A list with the object returned by ``model.fit`` for each fold.
    """
    # TODO: move the model creation to its own function so the architecture
    # can be parametrized too.
    histories = []
    # The last column of `train` is passed to the splitter as the target.
    for train_index, valid_index in skf.split(train.iloc[:, :-1], train.iloc[:, -1]):
        # Only the datasets and the input shape are needed here; the other
        # values returned by pre_processing are ignored.
        ds_train, ds_valid, _, _, input_shape, _ = pre_processing(
            train.iloc[train_index], train.iloc[valid_index], df)

        model = Sequential([
            Input(shape=input_shape),
            # Add a trailing axis of size 1 to the input before the Conv1D layers
            layers.Lambda(lambda x: tf.expand_dims(x, axis=-1)),

            layers.Conv1D(16, 5, padding="same",
                          kernel_initializer='lecun_normal',
                          activation='selu'),
            layers.Conv1D(32, 2, padding="same",
                          kernel_initializer='lecun_normal',
                          activation='selu'),
            layers.MaxPool1D(),

            layers.Flatten(),
            layers.AlphaDropout(0.5),
            layers.Dense(128,
                         kernel_initializer='lecun_normal',
                         activation="selu"),
            layers.Dense(1, activation="sigmoid")
        ])

        # Exponentially increasing learning rate, one step per epoch
        lr_schedule = tf.keras.callbacks.LearningRateScheduler(
            lambda epoch: start_lr * 10 ** (epoch / epochs_per_decade))

        model.compile(
            optimizer=tf.keras.optimizers.Adam(),
            loss=BalancedLogLoss(),
            # Passed as a list, the documented form of the `metrics` argument
            metrics=["accuracy"]
        )

        history = model.fit(
            ds_train,
            validation_data=ds_valid,
            epochs=epochs,
            callbacks=[lr_schedule],
            verbose=0
        )
        histories.append(history)

    return histories
        
def plot_lr(history, start_lr=1e-8, epochs_per_decade=20):
    """Plot training and validation loss against the learning rate per epoch.

    Intended for a history recorded while the learning rate followed
    ``start_lr * 10 ** (epoch / epochs_per_decade)``.

    Args:
        history: object whose ``history`` attribute is a dict holding the
            "loss" and "val_loss" sequences (e.g. what ``model.fit`` returns).
        start_lr: learning rate used at epoch 0.
        epochs_per_decade: number of epochs after which the learning rate has
            been multiplied by 10.
    """
    losses = history.history["loss"]

    # One learning rate per recorded epoch. The length is taken from the
    # history itself so runs of any number of epochs can be plotted.
    lrs = start_lr * (10 ** (np.arange(len(losses)) / epochs_per_decade))

    # Set the figure size
    plt.figure(figsize=(10, 6))

    # Set the grid
    plt.grid(True)

    # Plot the loss in log scale
    plt.semilogx(lrs, losses, label="loss")
    plt.semilogx(lrs, history.history["val_loss"], label="val_loss")

    # Increase the tickmarks size
    plt.tick_params('both', length=10, width=1, which='both')
    plt.legend()
    # Set the plot boundaries
    plt.axis([1e-8, 1e-1, 0, 3])

#plot_lr(histories[0])

# Calibration

In [None]:
def probability_calibration_plot(y_true,
                                    y_pred,
                                    y_cali=None,
                                    n_bins=30,
                                    yerr_c=0.4,
                                    xylim=1,
                                    tick=0.1,
                                    calib_method=''):
    '''Plot how well calibrated the predicted probabilities of a model are.

    It helps with the decision of whether to calibrate the model or not.
    Calibrating an uncalibrated model matters when the predicted
    probabilities themselves are important.

    The left panel shows the calibration curve of ``y_pred`` (with error bars
    proportional to the distance to the diagonal), the perfectly calibrated
    diagonal and, when ``y_cali`` is given, the curve of the calibrated
    predictions. The right panel shows a histogram of ``y_pred``.

    Extracted from: https://www.kaggle.com/code/sergiosaharovskiy/icr-iarc-2023-eda-and-submission?scriptVersionId=129914021&cellId=47

    Args:
        y_true: true binary labels.
        y_pred: predicted probabilities of the uncalibrated model.
        y_cali: optional predicted probabilities after calibration.
        n_bins: number of bins for the calibration curves; the histogram
            uses ``n_bins * 5`` bins.
        yerr_c: factor applied to ``|prob_true - prob_pred|`` to get the
            error bar length.
        xylim: upper end of the diagonal and of the axis ticks.
        tick: distance between axis ticks.
        calib_method: name of the calibration method, used in the legend.
    '''
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn as sns
    from sklearn.calibration import calibration_curve

    palette = ['#302c36', '#037d97', '#E4591E', '#C09741',
               '#EC5B6D', '#90A6B1', '#6ca957', '#D8E3E2']
    prob_true, prob_pred = calibration_curve(y_true, y_pred, n_bins=n_bins)

    fig, axes = plt.subplots(1, 2, figsize=(15, 5), dpi=120)
    calib_ax, hist_ax = axes.flatten()

    # fmt only selects the marker; the colour comes from `color`, so it is
    # not defined twice.
    calib_ax.errorbar(x=prob_pred, y=prob_true, yerr=abs(prob_true - prob_pred) * yerr_c, fmt=".", label='Actual',
                      color=palette[1], capthick=0.5, capsize=3, elinewidth=0.7, ecolor=palette[1])

    diagonal = np.linspace(0, xylim, 11)
    sns.lineplot(x=diagonal, y=diagonal, color=palette[-3],
                 label='Perfectly calibrated', ax=calib_ax, linestyle='dashed')

    # Any array-like is accepted for the calibrated predictions, not only
    # numpy arrays.
    if y_cali is not None:
        prob_true_, prob_pred_ = calibration_curve(y_true, y_cali, n_bins=n_bins)
        sns.lineplot(x=prob_pred_, y=prob_true_, color=palette[-5],
                     label=f'{calib_method} Calibration', ax=calib_ax, linestyle='solid')

    sns.histplot(y_pred, bins=n_bins*5, color=palette[1], ax=hist_ax)

    for axis in (calib_ax, hist_ax):
        axis.spines['top'].set_visible(False)
        axis.spines['right'].set_visible(False)
        axis.xaxis.grid(False)
        axis.yaxis.grid(True)

    calib_ax.set_title('Probability calibration plot', fontdict={'fontweight': 'bold'})
    hist_ax.set_title('Histogram of predictions', fontdict={'fontweight': 'bold'})

    ticks = list(np.arange(0, xylim + tick, tick))
    calib_ax.set_xticks(ticks)
    calib_ax.set_yticks(ticks)
    calib_ax.set(xlabel='predicted', ylabel='actual')
    fig.suptitle(f'Predictions in range {(0, xylim)}', ha='center',  fontweight='bold', fontsize=16)
    plt.tight_layout()
    
def create_logistic_calibrator(y_truths, y_predictions):
    '''Fit a logistic regression that maps raw predictions to the true labels.

    The returned model expects its input reshaped to (-1, 1), exactly like
    the training predictions are reshaped here.

    Args:
        y_truths: true labels used as the regression target.
        y_predictions: array of model predictions; must provide ``reshape``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    '''
    from sklearn.linear_model import LogisticRegression

    # Turn the array of numbers into an array of one-element arrays
    # (one row per sample, a single feature column).
    single_feature_matrix = y_predictions.reshape(-1, 1)

    # C is the inverse regularization strength, so this huge value leaves the
    # fit practically unregularized.
    calibrator = LogisticRegression(C=99999999999, solver='liblinear', max_iter=1000)
    return calibrator.fit(single_feature_matrix, y_truths)

def create_isotonic_calibrator(y_truths, y_predictions):
    '''Fit an isotonic regression that maps raw predictions to the true labels.

    Args:
        y_truths: true labels used as the regression target.
        y_predictions: model predictions used as the regression input
            (passed to ``fit`` as given, without reshaping).

    Returns:
        The fitted ``IsotonicRegression`` instance. Inputs outside the range
        seen during fitting are clipped (``out_of_bounds='clip'``).
    '''
    from sklearn.isotonic import IsotonicRegression

    return IsotonicRegression(out_of_bounds='clip').fit(y_predictions, y_truths)

#Idea to use calibrators
#truth = model_dicts[0]['test_y_truth'] #truth is a numpy array
#preds = model_dicts[0]['test_y_pred']
#model = model_dicts[0]['model']

#iso_calibrator = create_isotonic_calibrator(truth, preds)
#log_calibrator = create_logistic_calibrator(truth, preds.numpy())

#iso_cal_preds = iso_calibrator.predict(preds)
#log_cal_preds = log_calibrator.predict(preds.numpy())

#from sklearn.calibration import CalibrationDisplay
#CalibrationDisplay.from_predictions(truth,preds)
#CalibrationDisplay.from_predictions(truth,iso_cal_preds)
#CalibrationDisplay.from_predictions(truth,log_cal_preds)
#print('uncalibrated', competition_log_loss(truth.astype("float32"), preds.numpy().flatten()))
#print('iso calibrated', competition_log_loss(truth, iso_cal_preds))
#print('logistic calibrated', competition_log_loss(truth, log_cal_preds))

# Plotting training history and results

In [None]:
def plot_history(history: "tf.keras.callbacks.History",
                 start: "int | None" = None,
                 end: "int | None" = None) -> None:
    """Plot the loss and accuracy curves of a training run side by side.

    Args:
        history: object whose ``history`` attribute is a dict holding the
            "loss", "val_loss", "accuracy" and "val_accuracy" sequences
            (e.g. what ``model.fit`` returns).
        start: index of the first epoch to plot (``None`` = from the start).
        end: index after the last epoch to plot (``None`` = until the end).
    """
    _, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

    # One panel per metric: the training curve and its validation counterpart
    panels = (("loss", "Loss"), ("accuracy", "Accuracy"))
    for ax, (metric, title) in zip(axs, panels):
        val_metric = f"val_{metric}"
        ax.plot(history.history[metric][start:end], label=metric)
        ax.plot(history.history[val_metric][start:end], label=val_metric)
        ax.set_xlabel("Epochs")
        ax.set_ylabel(title)
        ax.set_title(title)
        ax.legend()
    plt.show()

#for model_dict in model_dicts:
#    print('loss(test_y,model(test_x))', model_dict['loss(test_y,model(test_x))'])
#    print('loss(test_y,model(test_df))', model_dict['loss(test_y,model(test_df))'])
#    print()

In [None]:
#Train test splitting with sklearn stratified kfold
#I wonder how it works exactly.
#from sklearn.model_selection import train_test_split,StratifiedKFold
#train,test = train_test_split(
#            train_df,
#            train_size=0.97,
#            random_state=1,
#            stratify=train_df.iloc[:,-1]
#    )
