# 02-03 : Brute Force Search

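Use a brute-force search over every possible binary mask (one bit per question, 2^18 masks for the 18 questions) to find the simple mask that best "predicts" whether each question is answered correctly.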

This approach is taken from `21_feature_engineering` and from the following Kaggle notebook, which proves the point:
[silly_monkey](https://www.kaggle.com/code/shambi/silly-monkey)

In [1]:
import sys
import numpy as np
import pandas as pd
import logging

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

import multiprocessing as mp
from functools import partial

from tqdm.auto import tqdm
from tqdm.contrib.concurrent import process_map

## Configure Logging

In [2]:
# send timestamped log records to stdout so they appear in the cell output
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[logging.StreamHandler(sys.stdout)],
)

logging.info("Started")

2023-03-17 08:40:07 INFO     Started


## Load Source Data

In [3]:
# read the training labels from the CSV file
df_source_labels = pd.read_csv(filepath_or_buffer='../data/train_labels.csv')

# show the dimensions and the first rows without hiding any column
print(df_source_labels.shape)
with pd.option_context('display.max_columns', None):
    display(df_source_labels.head(n=3))

(212022, 2)


Unnamed: 0,session_id,correct
0,20090312431273200_q1,1
1,20090312433251036_q1,0
2,20090314121766812_q1,1


In [4]:
# extract the question number from the `_q<N>` part of the session id;
# expand=False makes str.extract return a Series rather than a one-column
# DataFrame, which is the natural shape for a single new column
df_source_labels['question_number'] = (
    df_source_labels['session_id']
    .str.extract(r'_q(\d+)', expand=False)
    .astype(int)
)

print(df_source_labels.shape)
with pd.option_context('display.max_columns', None):
    # fixed seed so the displayed sample is identical on every run
    display(df_source_labels.sample(n=3, random_state=42))

(212022, 3)


Unnamed: 0,session_id,correct,question_number
93236,22040508163595230_q8,1,8
187577,22050112493234444_q16,1,16
200963,20110112170904428_q18,1,18


## Functions

In [5]:
def predict_single(model:np.array, question_number:int) -> int:
    """
    Predicts the answer to a single question.

    Parameters
    ----------
    model : np.array
        Binary mask; element `i` holds the prediction for question `i + 1`.
    question_number : int
        1-based question number.

    Returns
    -------
    int
        The mask entry for the requested question.

    Raises
    ------
    IndexError
        If `question_number` is not in the range `1..len(model)`.
    """
    # without this check, question 0 would become index -1 and silently
    # return the prediction stored for the last question
    if not 1 <= question_number <= len(model):
        raise IndexError(
            f'question_number must be between 1 and {len(model)}, got {question_number}')

    return model[question_number - 1]

# try the function on a handful of questions
mask = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1])

for question_number in (1, 10, 11):
    print(f'Question {question_number}: {predict_single(mask, question_number)}')

Question 1: 1
Question 10: 0
Question 11: 1


In [6]:
def predict(model:np.array, question_numbers:np.array) -> np.array:
    """
    Predicts the answer to a list of questions.

    Parameters
    ----------
    model : np.array
        Binary mask; element `i` holds the prediction for question `i + 1`.
    question_numbers : np.array
        1-based question numbers to predict.

    Returns
    -------
    np.array
        The mask entry for each question number, in input order.

    Raises
    ------
    IndexError
        If any question number is not in the range `1..len(model)`.
    """
    model = np.asarray(model)
    # explicit integer dtype so that an empty input is still a valid index array
    question_numbers = np.asarray(question_numbers, dtype=np.intp)

    # a question number below 1 would otherwise turn into a negative index and
    # silently pick up the prediction of a different question
    out_of_range = (question_numbers < 1) | (question_numbers > model.shape[0])
    if out_of_range.any():
        raise IndexError(
            f'question numbers must be between 1 and {model.shape[0]}, '
            f'got {question_numbers[out_of_range][0]}')

    # a single vectorised lookup instead of one Python-level call per question
    return model[question_numbers - 1]

# try the function on a hand-picked set of questions
mask = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1])
question_numbers = np.array([1, 10, 11])
print(predict(model=mask, question_numbers=question_numbers))

# try it on the first ten rows of the source labels
question_numbers = df_source_labels.question_number.values
print(predict(model=mask, question_numbers=question_numbers[:10]))

[1 0 1]
[1 1 1 1 1 1 1 1 1 1]


In [7]:
def calculate_score(y_true:np.array, y_pred:np.array):
    """
    Scores a set of predictions against the true labels.

    Returns a dict with the keys 'accuracy', 'precision', 'recall' and 'f1';
    precision, recall and f1 are macro-averaged.
    """
    # the fourth element of the result (support) is not needed here
    macro_metrics = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0)
    macro_precision, macro_recall, macro_f1 = macro_metrics[:3]

    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        precision=macro_precision,
        recall=macro_recall,
        f1=macro_f1,
    )

# score the hand-picked mask against every source label
mask = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1])
limit = len(df_source_labels)

question_numbers = df_source_labels['question_number'].values[:limit]
y_true = df_source_labels['correct'].values[:limit]

y_pred = predict(mask, question_numbers)
print(calculate_score(y_true=y_true, y_pred=y_pred))


{'accuracy': 0.7312967522238258, 'precision': 0.6715991736706568, 'recall': 0.6143595035672145, 'f1': 0.6222529252106556}


## Brute Force Search

In [8]:
def evaluate_mask(mask, question_numbers, y_true):
    """
    Scores one binary mask against the true answers.

    Returns the metrics from `calculate_score` with the evaluated mask
    added under the 'mask' key.
    """
    predictions = predict(mask, question_numbers)

    # keep the mask next to its metrics so each result row is self-describing
    return {**calculate_score(y_true, predictions), 'mask': mask}

In [9]:
# get the question numbers and the correct answers
limit = df_source_labels.shape[0]
question_numbers = df_source_labels['question_number'].values[:limit]
y_true = df_source_labels['correct'].values[:limit]

# one mask bit per question; derived from the data rather than hard-coded
n_questions = int(question_numbers.max())

# get all the possible masks: row i holds the binary digits of i, most
# significant bit first, so column 0 is the prediction for question 1
bit_shifts = np.arange(n_questions - 1, -1, -1)
masks = (np.arange(2 ** n_questions)[:, np.newaxis] >> bit_shifts) & 1
assert masks.shape == (2 ** n_questions, n_questions)

# bind the data so that each worker only needs to receive the mask
evaluate_mask_partial = partial(evaluate_mask, question_numbers=question_numbers, y_true=y_true)

# evaluate every mask in parallel, one task per mask
result = process_map(
    evaluate_mask_partial,
    masks,
    max_workers=mp.cpu_count(),
    chunksize=10)

# create a dataframe of the results
logging.info('Creating results dataframe')
df_results = pd.DataFrame(result)
with pd.option_context('display.max_columns', None):
    display(df_results.head(3))

  0%|          | 0/262144 [00:00<?, ?it/s]

2023-03-17 08:59:54 INFO     Creating results dataframe


Unnamed: 0,accuracy,precision,recall,f1,mask
0,0.296054,0.148027,0.5,0.228427,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,0.34612,0.630576,0.532874,0.305978,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,0.316642,0.490122,0.497513,0.27469,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
# persist the full results table (one row per evaluated mask), without the index
df_results.to_csv(path_or_buf='../data/brute_force_results.csv', index=False)

In [22]:
# the ten best masks by each metric, best first
f1_top_10 = df_results.sort_values(by='f1', ascending=False).head(10)
acc_top_10 = df_results.sort_values(by='accuracy', ascending=False).head(10)

with pd.option_context('display.max_columns', None):
    for position, (title, top_10) in enumerate(
            [('F1 TOP 10', f1_top_10), ('Accuracy TOP 10', acc_top_10)]):
        # blank line between the two reports
        if position > 0:
            print()
        print(title)
        display(top_10)
        # the table truncates the mask column, so print the best mask in full
        print('Mask: ', top_10.iloc[0]['mask'])

F1 TOP 10


Unnamed: 0,accuracy,precision,recall,f1,mask
252631,0.71344,0.653025,0.647306,0.649845,"[1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ..."
252503,0.697418,0.645616,0.655269,0.64924,"[1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, ..."
253527,0.71012,0.648887,0.643322,0.645788,"[1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, ..."
252629,0.692853,0.64048,0.649793,0.643948,"[1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ..."
253655,0.726142,0.663213,0.635359,0.643387,"[1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, ..."
252501,0.676831,0.638341,0.657756,0.641218,"[1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, ..."
252615,0.690098,0.637381,0.646488,0.640755,"[1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, ..."
253653,0.705554,0.643197,0.637846,0.640209,"[1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, ..."
253525,0.689532,0.636744,0.645809,0.640098,"[1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, ..."
121559,0.688551,0.63564,0.644633,0.638961,"[0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ..."


Mask:  [1 1 1 1 0 1 1 0 1 0 1 1 0 1 0 1 1 1]

Accuracy TOP 10


Unnamed: 0,accuracy,precision,recall,f1,mask
262103,0.731339,0.684059,0.587227,0.58575,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, ..."
261847,0.731297,0.671599,0.61436,0.622253,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, ..."
262111,0.729448,0.729481,0.557775,0.533201,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ..."
261855,0.729405,0.679164,0.584907,0.582768,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, ..."
253911,0.726184,0.662396,0.608226,0.615065,"[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, ..."
253655,0.726142,0.663213,0.635359,0.643387,"[1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, ..."
253919,0.724293,0.666223,0.578774,0.574885,"[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ..."
253663,0.72425,0.658916,0.605907,0.612347,"[1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, ..."
261079,0.718638,0.648813,0.599174,0.604457,"[1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, ..."
260823,0.718595,0.652297,0.626306,0.63356,"[1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ..."


Mask:  [1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1]
