# 02-03 : Brute Force Search

Use a brute-force search over every possible binary mask (one fixed 0/1 answer per question number) to find the mask that best "predicts" the correct answer.

This is taken from `21_feature_engineering` and the Kaggle notebook to prove the point:
[silly_monkey](https://www.kaggle.com/code/shambi/silly-monkey)

In [1]:
import sys
import numpy as np
import pandas as pd
import logging

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

import multiprocessing as mp
from functools import partial

from tqdm.auto import tqdm
from tqdm.contrib.concurrent import process_map

## Configure Logging

In [2]:
# route log records to stdout so they show up in the cell output
stdout_handler = logging.StreamHandler(sys.stdout)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[stdout_handler],
)

# first record of the run
logging.info("Started")

2023-03-17 08:36:33 INFO     Started


## Load Source Data

In [3]:
# read the training labels from disk
labels_csv = '../data/train_labels.csv'
df_source_labels = pd.read_csv(labels_csv)

# report the dimensions, then preview the first rows with every column visible
print(df_source_labels.shape)
with pd.option_context('display.max_columns', None):
    display(df_source_labels.head(n=3))

(212022, 2)


Unnamed: 0,session_id,correct
0,20090312431273200_q1,1
1,20090312433251036_q1,0
2,20090314121766812_q1,1


In [4]:
# extract the question number
df_source_labels['question_number'] = df_source_labels.session_id \
    .str.extract(r'_q(\d+)') \
    .astype(int)

print(df_source_labels.shape)
with pd.option_context('display.max_columns', None):
    display(df_source_labels.sample(n=3))

(212022, 3)


Unnamed: 0,session_id,correct,question_number
110782,21030211171083080_q10,0,10
107261,20110411173015120_q10,0,10
203306,21010407271941800_q18,1,18


## Functions

In [5]:
def predict_single(model: np.ndarray, question_number: int) -> int:
    """
    Predicts the answer to a single question.

    Parameters
    ----------
    model : np.ndarray
        The answer mask; element ``i`` holds the prediction for
        question ``i + 1``.
    question_number : int
        1-based question number.

    Returns
    -------
    int
        The mask value stored for that question.

    Raises
    ------
    IndexError
        If ``question_number`` is smaller than 1 or larger than the
        number of entries in ``model``.
    """
    # Question numbers are 1-based. Without this guard a value of 0 (or any
    # negative number) would silently wrap around to the end of the mask.
    if question_number < 1:
        raise IndexError(
            f'question_number must be >= 1, got {question_number}')

    return model[question_number - 1]

# test the function
mask = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1])

# spot-check a few question numbers against the mask
for question_number in (1, 10, 11):
    print(f'Question {question_number}: {predict_single(mask, question_number)}')

Question 1: 1
Question 10: 0
Question 11: 1


In [6]:
def predict(model: np.ndarray, question_numbers: np.ndarray) -> np.ndarray:
    """
    Predicts the answer to a list of questions.

    Parameters
    ----------
    model : np.ndarray
        The answer mask; element ``i`` holds the prediction for
        question ``i + 1``.
    question_numbers : np.ndarray
        1-based question numbers (any integer sequence is accepted).

    Returns
    -------
    np.ndarray
        One prediction per entry of ``question_numbers``, in the same order.

    Raises
    ------
    IndexError
        If any question number is smaller than 1 or larger than the
        number of entries in ``model``.
    """
    mask = np.asarray(model)

    # the explicit integer dtype keeps an empty list usable as an index array
    positions = np.asarray(question_numbers, dtype=np.intp) - 1

    # a question number below 1 would otherwise wrap around to the end of
    # the mask via negative indexing
    if positions.size and positions.min() < 0:
        raise IndexError('question numbers must be >= 1')

    # a single vectorised lookup instead of one Python-level call per row
    return mask[positions]

# sanity check on a hand-picked set of questions
mask = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1])
question_numbers = np.array([1, 10, 11])
hand_picked_predictions = predict(mask, question_numbers)
print(hand_picked_predictions)

# same check against the first ten rows of the source labels
question_numbers = df_source_labels['question_number'].to_numpy()
first_ten_predictions = predict(mask, question_numbers[:10])
print(first_ten_predictions)

[1 0 1]
[1 1 1 1 1 1 1 1 1 1]


In [7]:
def calculate_score(y_true:np.array, y_pred:np.array):
    """
    Scores a set of predictions against the true labels.

    Returns a dict holding the accuracy plus the macro-averaged precision,
    recall and f1; ``zero_division=0`` is passed so that an undefined ratio
    is scored as 0.
    """
    # the fourth element of the tuple (support) is not used here
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0)

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['precision'] = macro_precision
    metrics['recall'] = macro_recall
    metrics['f1'] = macro_f1
    return metrics

# score the hand-written mask against the labelled rows
mask = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1])

# `limit` caps how many rows are scored; here it covers the whole frame
limit = len(df_source_labels)
question_numbers = df_source_labels['question_number'].to_numpy()[:limit]
y_true = df_source_labels['correct'].to_numpy()[:limit]

y_pred = predict(mask, question_numbers)
print(calculate_score(y_true, y_pred))


{'accuracy': 0.7312967522238258, 'precision': 0.6715991736706568, 'recall': 0.6143595035672145, 'f1': 0.6222529252106556}


## Brute Force Search

### Too Slow

In [8]:
# best_f1 = 0

# # get the question numbers and the correct answers
# question_numbers = df_source_labels['question_number'].values
# y_true = df_source_labels['correct'].values

# # try all possible masks
# for i in tqdm(range(2**18)):
#     # # convert the integer value to binary and pad with zeros to make it 18 digits long
#     # mask_str = bin(i)[2:].zfill(18)
    
#     # # create the mask
#     # mask = np.array([int(x) for x in mask_str])

#     mask = np.zeros(18, dtype=np.int32)
#     for j in range(18):
#         mask[j] = (i >> j) & 1

#     # calculate the score
#     y_pred = predict(mask, question_numbers)
#     score = calculate_score(y_true, y_pred)

#     # best f1 score so far
#     if score['f1'] > best_f1:
#         best_f1 = score['f1']
#         best_mask = mask
#         best_score = score

#         logging.info(f'New best f1 score: {best_f1:.4f} - {best_mask}')
    

### Parallel Processing

In [9]:
def evaluate_mask(mask, question_numbers, y_true):
    """
    Scores one binary mask and returns its metrics together with the mask.
    """
    # predictions the mask produces for the given questions
    predictions = predict(mask, question_numbers)

    # metrics first, then the mask that produced them under the 'mask' key
    return {**calculate_score(y_true, predictions), 'mask': mask}

In [10]:
# get the question numbers and the correct answers
limit = df_source_labels.shape[0]
question_numbers = df_source_labels['question_number'].values[:limit]
y_true = df_source_labels['correct'].values[:limit]

# get all the possible masks: row i holds the binary expansion of i, most
# significant bit first, so column j is the answer used for question j + 1
N_QUESTIONS = 18
bit_shifts = np.arange(N_QUESTIONS - 1, -1, -1)
masks = (np.arange(2**N_QUESTIONS)[:, None] >> bit_shifts) & 1
assert masks.shape == (2**N_QUESTIONS, N_QUESTIONS)

# define the partial function
evaluate_mask_partial = partial(evaluate_mask, question_numbers=question_numbers, y_true=y_true)

# Every chunk is a separate task that ships the partial (and the two label
# arrays bound into it) to a worker process, so a small chunksize spends most
# of its time serialising the same data over and over. Larger chunks keep the
# workers busy while still updating the progress bar regularly.
result = process_map(
    evaluate_mask_partial,
    masks,
    max_workers=mp.cpu_count(),
    chunksize=512)

# create a dataframe of the results
logging.info('Creating results dataframe')
df_results = pd.DataFrame(result)
with pd.option_context('display.max_columns', None):
    display(df_results.head(3))

  0%|          | 0/262144 [00:00<?, ?it/s]

In [None]:
# results = []

# # get the question numbers and the correct answers
# question_numbers = df_source_labels['question_number'].values
# y_true = df_source_labels['correct'].values

# # create a pool of worker processes
# num_processes = mp.cpu_count()
# pool = mp.Pool(processes=num_processes)

# # try all possible masks in parallel
# logging.info('Adding all the masks to the worker pool')
# for i in range(2**18):
#     # convert the integer value to a numpy array of binary digits
#     mask = np.zeros(18, dtype=np.int32)
#     for j in range(18):
#         mask[j] = (i >> j) & 1
        
#     # apply the function to the mask using the worker pool
#     result = pool.apply_async(evaluate_mask, args=(mask, question_numbers, y_true))
    
#     # append the result to the list
#     results.append(result)
    
# # close the worker pool and wait for all processes to finish
# logging.info('Waiting for all the workers to finish')
# pool.close()
# pool.join()