# 03-01: Mutli-Threshold for F1

The core idea is to first do a global threshold to optimize the F1 score. Then leave all the samples classified as 0 in place, and do a second threshold to optimize the F1 score for the samples classified as 1.


In [1]:
import sys
import logging
from typing import Iterable, List, Tuple, Dict

import pandas as pd
import numpy as np

from tqdm.auto import tqdm
import matplotlib.pyplot as plt

from pycaret.classification import *
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

### Configure Logging

In [2]:
logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR)

In [3]:
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
        handlers=[
        logging.StreamHandler(sys.stdout)
    ])

logging.info("Started")

2023-03-21 09:55:05 INFO     Started


## Data Collection

In [4]:
dtypes = {
    "session_id": np.int64,
    "elapsed_time": np.int32,
    "event_name": "category",
    "name": "category",
    "level": np.uint8,
    "page": "category",
    "room_coor_x": np.float32,
    "room_coor_y": np.float32,
    "screen_coor_x": np.float32,
    "screen_coor_y": np.float32,
    "hover_duration": np.float32,
    "text": "category",
    "fqid": "category",
    "room_fqid": "category",
    "text_fqid": "category",
    "fullscreen": "category",
    "hq": "category",
    "music": "category",
    "level_group": "category",
}

In [5]:
# load the source training set
df_source = pd.read_csv('../data/train.csv.gz',
                        compression='gzip',
                        dtype=dtypes)

print(df_source.shape)
with pd.option_context('display.max_columns', None):
    display(df_source.head(3))

(13174211, 20)


Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,,,,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4


In [6]:
# load the source training labels
df_source_labels = pd.read_csv('../data/train_labels.csv')

print(df_source_labels.shape)
with pd.option_context('display.max_columns', None):
    display(df_source_labels.head(3))

(212022, 2)


Unnamed: 0,session_id,correct
0,20090312431273200_q1,1
1,20090312433251036_q1,0
2,20090314121766812_q1,1


## Data Preparation & Cleaning

In [7]:
def map_question_to_level_group(question_number) ->str:
    """
    Maps the question number to the level group.

    Parameters
    ----------
    question_number : int
        The question number.

    Returns
    -------
    str
        The level group.
    """
    if question_number in [1, 2, 3]:
        return '0-4'
    elif question_number in [4, 5, 6, 7, 8, 9, 10, 11, 12, 13]:
        return '5-12'
    elif question_number in [14, 15, 16, 17, 18]:
        return '13-22'
    else:
        return None

In [8]:
def find_problem_sessions(data : pd.DataFrame) -> List[str]:
    """
    Finds the sessions that are duplicated on session_id and index. And
    Find sessions with reversed indexes.

    This idea is taken from the following Kaggle notebook:
    https://www.kaggle.com/code/abaojiang/eda-on-game-progress/notebook?scriptVersionId=120133716
    
    Parameters
    ----------
    data : pd.DataFrame
        The data to search.

    Returns
    -------
    List[str]
        The list of session ids that have a problem.
    """

    # find sessions duplicated on session_id and index
    sessions_with_duplicates = df_source.loc[
        data.duplicated(subset=["session_id", "index"], keep=False)] \
        ["session_id"].unique().tolist()


    # find sessions with reversed indexes
    sessions_with_reversed_index = []
    for sess_id, gp in df_source.groupby("session_id", observed=True):
        if not gp["index"].is_monotonic_increasing:
            sessions_with_reversed_index.append(sess_id)

    # via experimentation these sessions have been found to have time 
    # differences < -2000
    negative_time_diff_sessions = [
        '21030417085341900', '21070111080982292', 
        '21090108302064196', '21090409222921812']

    # combine the two lists into a single set
    return set(sessions_with_duplicates + 
               sessions_with_reversed_index + 
               negative_time_diff_sessions)

In [9]:
def prepare_label_dataset(data : pd.DataFrame) -> pd.DataFrame:
    """
    Prepares the label dataset and add columns for the level group 
    and the question number.

    Parameters
    ----------
    data : pd.DataFrame
        The label dataset.

    Returns
    -------
    pd.DataFrame
        The prepared label dataset.
    """
    # add the columns to determine the level group
    df_labels = data \
        .rename(columns={'session_id': 'id'}) \
        .assign(session_id=lambda df: df['id'].str.split('_').str[0].astype(int)) \
        .assign(question_id=lambda df: df['id'].str.split('_').str[1]) \
        .assign(question_num=lambda df: df['question_id'].str[1:].astype(int)) \
        [['session_id', 'question_num', 'correct']]
    
    # add the level group column
    df_labels['level_group'] = df_labels['question_num'] \
        .apply(map_question_to_level_group) 
        
    return df_labels

In [10]:
def prepare_main_dataset(data : pd.DataFrame,
                         elapsed_time_min_clip:int=0,
                         elapsed_time_max_clip:int=3691298) -> pd.DataFrame:
    """
    Prepares the main dataset by removing duplicates and removing 
    columns that are not needed.

    Parameters
    ----------
    data : pd.DataFrame
        The main dataset.

    Returns
    -------
    pd.DataFrame
        The prepared main dataset.
    """
    empty_columns = ['fullscreen', 'hq', 'music', 'page', 'hover_duration']

    df_main = data \
        .drop_duplicates() \
        .reset_index(drop=True) \
        .drop(empty_columns, axis=1) \
        .drop('text', axis=1)
    
   # clip the elapsed time to remove outliers
    df_main['elapsed_time'] = df_main['elapsed_time'].clip(
        lower=elapsed_time_min_clip,
        upper=elapsed_time_max_clip)
    
    return df_main

### Process the Data

In [11]:
# prepare the main dataset
df_source = prepare_main_dataset(df_source, 
                                 elapsed_time_min_clip=0,
                                 elapsed_time_max_clip=3691298)

# remove sessions with problems
problem_sessions = find_problem_sessions(df_source)
df_source = df_source[~df_source['session_id'].isin(problem_sessions)]

with pd.option_context('display.max_columns', None):
    print(df_source.shape)
    display(df_source.head(3))  

(13019794, 14)


Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,fqid,room_fqid,text_fqid,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,-413.991394,-159.314682,380.0,494.0,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0-4
1,20090312431273200,1,1323,person_click,basic,0,-413.991394,-159.314682,380.0,494.0,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4
2,20090312431273200,2,831,person_click,basic,0,-413.991394,-159.314682,380.0,494.0,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4


In [12]:
# prepare the label dataset
df_source_labels = prepare_label_dataset(df_source_labels)

# remove sessions with problems
df_source_labels = df_source_labels[~df_source_labels['session_id'] \
    .isin(problem_sessions)]

with pd.option_context('display.max_columns', None):
    display(df_source_labels.sample(n=3, random_state=51))

Unnamed: 0,session_id,question_num,correct,level_group
172317,21070319253640464,15,0,13-22
194865,21040512553883790,17,1,13-22
197728,22000108514966796,17,1,13-22


## Feature Engineering

### Functions

In [13]:
def create_initial_features(X:pd.DataFrame,
                            y:pd.DataFrame) -> pd.DataFrame:
    """
    Creates the initial dataset to which additional features 
    will be added.

    Parameters
    ----------
    X : pd.DataFrame
        The main dataset.
    y : pd.DataFrame
        The label dataset.

    Returns
    -------
    pd.DataFrame
        The initial feature dataset.
    """
    df_features =  y \
        .groupby(['session_id', 'level_group']) \
        .agg({'correct': ['count']}) \
        .reset_index() \
        .droplevel(1, axis=1) \
        .drop(columns=['correct']) \
        .sort_values(['session_id', 'level_group']) \
        
    # set the session_id to be an integer
    df_features['session_id'] = df_features['session_id'].astype(int)
        
    return df_features        

In [14]:
def add_numeric_column_features(features:pd.DataFrame,
                                X:pd.DataFrame,
                                column:str,
                                min_values:dict=None,
                                max_values:dict=None) -> pd.DataFrame:
    """
    Add the maximum elapsed time feature to the features dataset.

    Parameters
    ----------
    features : pd.DataFrame
        The features dataset.
    X : pd.DataFrame
        The main dataset.
    column : str
        The name of the numeric column to add to the features for.

    Returns
    -------
    None
    """
    # Define a function to calculate mode
    def mode(series):
        return series.mode().iat[0]

    # calculate the maximum, minimum and mean for the column
    df_result = X \
        .groupby(['session_id', 'level_group']) \
        .agg({column: ['sum', 'max', 'min', 'mean', mode]}) \
        .reset_index()
    
    # flatten the multi-index columns
    df_result.columns = [
        '_'.join(col).rstrip('_') for col in df_result.columns.values
    ]

    # normalize the values
    if min_values is None or max_values is None:
        logging.warning('Not normalizing the values, min_value and max_values are not set.')
    else:
        metric_list = ['sum', 'max', 'min', 'mean', 'mode']
        for metric in metric_list:
            current_column = f'{column}_{metric}'
            df_result[current_column] = \
                (df_result[current_column] - min_values[metric]) / \
                (max_values[metric] - min_values[metric])       

    # join the features to the result   
    df_result = features.set_index(['session_id', 'level_group']) \
        .join(df_result.set_index(['session_id', 'level_group']), how='left') \
        .reset_index()
    
    return df_result

### Create Features

#### Initial Features

In [15]:
# create the initial features
df_features = create_initial_features(df_source, df_source_labels)

with pd.option_context('display.max_columns', None):
    display(df_features.head(6))

Unnamed: 0,session_id,level_group
0,20090312431273200,0-4
1,20090312431273200,13-22
2,20090312431273200,5-12
3,20090312433251036,0-4
4,20090312433251036,13-22
5,20090312433251036,5-12


#### Elapsed Time

In [16]:
# add the elapsed time features to the features dataset
df_features = add_numeric_column_features(
    features=df_features,
    X=df_source,
    column='elapsed_time',
    min_values={
        'sum': 61395.0,
        'max':  990.0,
        'min':  0.0,
        'mean': 526.447,
        'mode': 0.0},
    max_values={
        'sum':  9990648000,
        'max':  3691298.0,
        'min':  3691298.0,
        'mean': 3691298.0,
        'mode': 3691298.0})

with pd.option_context('display.max_columns', None):
    display(df_features.head(6))

Unnamed: 0,session_id,level_group,elapsed_time_sum,elapsed_time_max,elapsed_time_min,elapsed_time_mean,elapsed_time_mode
0,20090312431273200,0-4,0.001411,0.052535,0.0,0.023103,0.0
1,20090312431273200,13-22,0.04374,0.344602,0.226677,0.281804,0.30132
2,20090312431273200,5-12,0.010577,0.135014,0.060002,0.096641,0.060002
3,20090312433251036,0-4,0.001352,0.063074,0.0,0.026311,0.0
4,20090312433251036,13-22,0.324157,1.0,0.318718,0.676403,1.0
5,20090312433251036,5-12,0.021933,0.221287,0.072301,0.150206,0.072301


## Data Selection

### Functions

In [17]:
def select_sessions(
        y: pd.DataFrame,
        random_state: int=1337,
        test_size: float=0.2,
        train_size:float=0.6) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Select samples from the dataset for training, validation and testing.
    The test set is selected first, then the training set is selected 
    from the remaining sessions. And finally the validation set is 
    selected from the remaining sessions.

    Parameters
    ----------
    y : pd.DataFrame
        The label dataset.
    random_state : int
        The random state to use.
    test_size : float
        The ratio of the sample to use for testing.
    train_size : float
        The ratio of the sample to use for training.

    Returns
    -------
    Tuple[np.ndarray, np.ndarray, np.ndarray]
        The selected session ids, the main dataset and the label dataset.
    """
    # select all the unique session ids
    all_session_ids = y['session_id'].unique()

    # set the random seed
    np.random.seed(random_state)

    # shuffle the session ids
    np.random.shuffle(all_session_ids)

    # select the session ids for the test set
    test, remainder = train_test_split(all_session_ids, test_size=1-test_size)

    # split the dataset into train and validation sets
    train, val = train_test_split(remainder, test_size=1-train_size)

    # print the number of sessions in each set
    print(f'Train: {len(train)}')
    print(f'Validation: {len(val)}')
    print(f'Test: {len(test)}')

    return train, val, test

In [18]:
def get_features_with_labels(features:pd.DataFrame,
                            y:pd.DataFrame) -> pd.DataFrame:
    """
    Create a complete dataset where the label dataset containing the
    target variable is joint to the features dataset.
    """
    return y.join(features.set_index(['session_id', 'level_group']),
                    on=['session_id', 'level_group'],
                    how='left')

### Create Datasets

In [19]:
random_state = 51

In [20]:
# split the dataset into train, validation and test sets
train, val, test = select_sessions(
    y=df_source_labels,
    random_state=random_state,
    test_size=0.60,
    train_size=0.75)

Train: 3495
Validation: 1165
Test: 6988


In [21]:
# get the prepared dataset
features_with_labels = get_features_with_labels(df_features, df_source_labels)

with pd.option_context('display.max_columns', None):
    display(features_with_labels.head(6))

Unnamed: 0,session_id,question_num,correct,level_group,elapsed_time_sum,elapsed_time_max,elapsed_time_min,elapsed_time_mean,elapsed_time_mode
0,20090312431273200,1,1,0-4,0.001411,0.052535,0.0,0.023103,0.0
1,20090312433251036,1,0,0-4,0.001352,0.063074,0.0,0.026311,0.0
2,20090314121766812,1,1,0-4,0.002928,0.106324,0.0,0.047996,0.0
3,20090314363702160,1,1,0-4,0.001627,0.05869,0.0,0.030143,0.0
4,20090314441803444,1,1,0-4,0.000824,0.047682,0.0,0.020862,0.0
5,20090315081004164,1,0,0-4,0.002515,0.092231,0.0,0.036151,0.0


In [22]:
# get the features for the train, validation and test datasets
df_train = features_with_labels[features_with_labels['session_id'].isin(train)] \
    .drop(columns=['session_id', 'level_group'])

df_val = features_with_labels[features_with_labels['session_id'].isin(val)] \
    .drop(columns=['session_id', 'level_group'])
    
df_test = features_with_labels[features_with_labels['session_id'].isin(test)] \
    .drop(columns=['session_id', 'level_group'])

In [23]:
print('Train      :', df_train.shape)
print('Validation :', df_val.shape)
print('Test       :', df_test.shape)

Train      : (62910, 7)
Validation : (20970, 7)
Test       : (125784, 7)


In [24]:
with pd.option_context('display.max_columns', None):
    display(df_train.head(6))

Unnamed: 0,question_num,correct,elapsed_time_sum,elapsed_time_max,elapsed_time_min,elapsed_time_mean,elapsed_time_mode
1,1,0,0.001352,0.063074,0.0,0.026311,0.0
4,1,1,0.000824,0.047682,0.0,0.020862,0.0
5,1,0,0.002515,0.092231,0.0,0.036151,0.0
9,1,0,0.002094,0.088831,0.0,0.037267,0.0
10,1,1,0.006778,0.2233,0.0,0.077021,0.0
13,1,1,0.002453,0.085622,0.0,0.039016,0.0


## Model Training

### Functions

In [25]:
def optimize_f1(y_true: np.ndarray, y_score: np.ndarray) -> Tuple[float, float, float, float]:
    """
    Optimize the F1 score.

    Parameters
    ----------
    y_true : np.ndarray
        The true labels.
    y_score : np.ndarray
        The predicted labels.

    Returns
    -------
    Tuple[float, float, float]
        The optimized threshold, precision, and recall.
    """
    best_f1 = 0
    best_threshold = 0
    best_precision = 0
    best_recall = 0

    for threshold in np.arange(0, 1, 0.01):
        y_pred = (y_score > threshold).astype(int)
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=1)

        if f1 > best_f1:
            #print(f'new best f1: {f1}, threshold: {threshold}, precision: {precision}, recall: {recall}')
            best_f1 = f1
            best_threshold = threshold
            best_precision = precision
            best_recall = recall

    return best_threshold, best_precision, best_recall, best_f1

### Train Models

In [26]:
classifier = setup(
    data=df_train,
    target='correct',
    test_data=df_val,
    session_id=random_state,
    fix_imbalance=True,
    fix_imbalance_method='RandomOverSampler',
    use_gpu=False,
    html=True,
    verbose=True)

                    Description              Value
0                    Session id                 51
1                        Target            correct
2                   Target type             Binary
3           Original data shape         (83880, 7)
4        Transformed data shape        (109686, 7)
5   Transformed train set shape         (88716, 7)
6    Transformed test set shape         (20970, 7)
7              Numeric features                  6
8                    Preprocess               True
9               Imputation type             simple
10           Numeric imputation               mean
11       Categorical imputation               mode
12                Fix imbalance               True
13         Fix imbalance method  RandomOverSampler
14               Fold Generator    StratifiedKFold
15                  Fold Number                 10
16                     CPU Jobs                 -1
17                      Use GPU              False
18               Log Experiment

In [27]:
top_model = compare_models(n_select=15, sort='AUC')

Processing:  34%|███▍      | 25/74 [00:05<00:08,  5.87it/s]

KeyboardInterrupt: 

### Select a Target Model

Plot the evaluation metrics for the target model and pick the model with the best macro average F1 score.

In [None]:
for model in top_model:
    df_predicted = predict_model(estimator=model, data=df_test)
    print(classification_report(y_true=df_predicted.correct, y_pred=df_predicted.prediction_label))

### Light Gradient Boosting Machine

This looks like a good candidate for the target model.

In [None]:
# create the model
lgbm_model = create_model('lightgbm')

In [None]:
# evaluate the model on the test set
df_predicted = predict_model(estimator=lgbm_model, data=df_test)
print(classification_report(y_true=df_predicted.correct, y_pred=df_predicted.prediction_label))

The F1 macro average looks good and similar to what we have seen with previous deep learning experiments.

We shall now try and use the build in method to tune the model for F1 and see if we get a better result.

In [None]:
# attempt to tune the model
tuned_lgbm_model = tune_model(estimator=lgbm_model, optimize='F1')

In [None]:
# evaluate the tuned model on the test set
df_predicted = predict_model(estimator=tuned_lgbm_model, data=df_test)
#print(classification_report(y_true=df_predicted.correct, y_pred=df_predicted.prediction_label))

The Tuned model produce worst results so we will stick to the original.

We will now try to find a threshold that optimizes the F1 score.

In [None]:
# combine the training and validation sets
df_train_val = pd.concat([df_train, df_val])

# get the predictions
df_predicted = predict_model(estimator=lgbm_model, data=df_train_val, raw_score=True)
y_true = df_predicted.correct
y_score = df_predicted.prediction_score_1

# print()
# print('--- Train and Validation without threshold---')
# print(classification_report(y_true=df_predicted.correct, y_pred=df_predicted.prediction_label))

# print()
# # print('--- Train and Validation WITH threshold---')
# print()
# print(classification_report(y_true=df_predicted.correct, y_pred=(df_predicted.prediction_score_1 > threshold)))

# optimize the F1 score
threshold, precision, recall, f1 = optimize_f1(y_true.values, y_score.values)
print(f'Optimized F1: {f1:.2f}')
print(f'Optimized threshold: {threshold:.2f}')


# evaluate the model on the test set
df_test_predicted = predict_model(estimator=lgbm_model, data=df_test, raw_score=True)
y_true_pred = df_test_predicted.correct

print()
print('--- Test evaluation without threshold---')
print(classification_report(y_true=y_true_pred, y_pred=df_test_predicted.prediction_label))
disp = ConfusionMatrixDisplay.from_predictions(
    y_true=y_true_pred, 
    y_pred= df_test_predicted.prediction_label,
    cmap=plt.cm.Blues,
    normalize='true')
plt.show()

print()
print('--- Test evaluation WITH threshold---')
y_pred = (df_test_predicted.prediction_score_1 > threshold).astype(int)
print(classification_report(y_true=y_true_pred, y_pred=y_pred))

disp = ConfusionMatrixDisplay.from_predictions(
    y_true=y_true_pred, 
    y_pred=y_pred,
    cmap=plt.cm.Blues,
    normalize='true')
plt.show()

We observe the following:

1. The original model is better at detecting the negative class (0) than the second model with the optimized f1 threshold. Thus it has a better true negative rate (TNR).
2. Using the threshold for the second model we get a better true positive rate (TPR).

An idea would thus be to do the prediction with the original model, and leave all negative predictions in place. Then do a second prediction with the second model and replace the positive predictions with the second model predictions.

We can in fact try and optimize the F1 score for the second model by only considering the positive predictions and finding the optimal threshold for that.

We start with the training and validation data that was used during training to find the threshold.

In [None]:
# do predictions with the original model without any threshold
df_predictions = predict_model(estimator=lgbm_model, data=df_train_val, raw_score=True)

# select the rows that was predicted as correct
df_correct = df_predictions[df_predictions['prediction_label'] == 1]

# optimize the F1 score
threshold, precision, recall, f1 = optimize_f1(df_correct.correct.values, df_correct.prediction_score_1.values)
print(f'Optimized threshold: {threshold:.2f}')

# add a final prediction column, if the predicted label is 0, leave it 
# as it is, otherwise apply the threshold
df_predictions['final_prediction'] = df_predictions['prediction_label']
df_predictions.loc[df_predictions['prediction_label'] == 1, 'final_prediction'] = \
    (df_predictions['prediction_score_1'] > threshold).astype(int)

# how many predictions were changes
total = df_predictions.shape[0]
changed_count = df_predictions.query('final_prediction != prediction_label').shape[0]
print(f'Changed: {changed_count} out of {total} ({changed_count / total:.2f})')

# show the results
print(classification_report(y_true=df_predictions.correct, y_pred=df_predictions.final_prediction))

disp = ConfusionMatrixDisplay.from_predictions(
    y_true=y_true_pred, 
    y_pred=y_pred,
    cmap=plt.cm.Blues,
    normalize='true')

This results in a better f1-score for the negative class, but at the expense of the macro average. What we can try instead if to again leave the zeros in place, and only updates the ones, but the optimizer should consider the whole dataset and not just the ones when finding the threshold.

In [None]:
def special_optimizer(data:pd.DataFrame) -> Tuple[float, float, float, float]:
    """
    Find the best threshold for the F1 score, but only apply it to the 
    rows that was predicted as correct.
    """
    best_f1 = 0
    best_threshold = 0
    best_precision = 0
    best_recall = 0
    df_data = data.copy()

    for threshold in np.arange(0, 1, 0.01):
        df_data['test_prediction'] = df_data['prediction_label']
        df_data.loc[df_data['prediction_label'] == 1, 'test_prediction'] = \
            (df_data['prediction_score_1'] > threshold).astype(int)

        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true = df_data.correct,
            y_pred = df_data.test_prediction,
            average='macro', zero_division=1)

        print(f'Current F1 {f1}, best F1: {best_f1}')
        if f1 > best_f1:
            print(f'new best f1: {f1}, threshold: {threshold}, precision: {precision}, recall: {recall}')
            best_f1 = f1
            best_threshold = threshold
            best_precision = precision
            best_recall = recall

    return best_threshold, best_precision, best_recall, best_f1    

# test the function
threshold, precision, recall, f1 = special_optimizer(df_predictions)
print(threshold)        

As shown this does not lead to any improvement at all since we have already optimized the threshold for the second model.

The next thing to do would be to create individual models for each question, but this has already been done in the Kaggle submissions. We can try it here simply for completeness to see if it leads to any improvement. If so we can take it back to the Deep Learning model work.
