# Recommendation - voting system
For each article in the user’s history, get the `NR_SIMILAR` most similar articles. For each article in the resulting pool of candidates, calculate a final similarity score by summing its similarities to all articles in the user’s history. A similarity only contributes when the pair is present in the similarities dataframe, i.e. when the candidate is among the top `NR_TO_KEEP` (see `similarities.ipynb`) most similar articles of that history article.

In [None]:
import threading
from functions import *

In [None]:
# configuration of this voting-system run
NR_ROWS_PER_THREAD = 50000                                                              # nr of customers to process in each thread
NR_SIMILAR = 3                                                                          # get the top NR_SIMILAR articles for each article
# NOTE(review): empty in this notebook - presumably has to be filled in before the pipeline is run
SIMILARITY_ARTICLE_ID_FILE = ''                                                         # input file (containing the similar article ids)
# derived from SIMILARITY_ARTICLE_ID_FILE by replacing 'article_ids' with 'indices' in the file name
# NOTE(review): the replacement token is 'indices' although the file is read as the similarity scores - confirm the naming used by similarities.ipynb
SIMILARITY_SCORE_FILE = SIMILARITY_ARTICLE_ID_FILE.replace('article_ids', 'indices')    # input file (containing the similarity scores)
OUTPUT_FILE_NAME = f'submission_voting_system_{NR_SIMILAR}.csv'                         # name of the submission file
VALIDATE_CUSTOMER = 3                                                                   # passed as customer_id to show_customer_case in the validation section

In [None]:
# processed article data
article_df = pd.read_feather(idp(filename='articles_processed.feather'))
# article_ids whose image_name equals 'does not exist'; these are filtered out of the purchase histories
no_image_article_ids = article_df[article_df['image_name'] == 'does not exist']['article_id'].values.tolist()
# the 12 articles with the highest 'popularity' value, highest first
most_popular_article_ids = article_df.sort_values(by='popularity', ascending=False).head(12)['article_id'].values.tolist()

# processed customer data; the 'customer_id' and 'purchase_history' columns are used by the pipeline
customers_transactions_df = pd.read_feather(idp(filename='customers_transactions_processed.feather'))
# number of rows, used as the number of customers (assumes one row per customer - TODO confirm)
nr_customers = customers_transactions_df.shape[0]

## Submission creation pipeline
The pipeline uses multithreading to reduce the execution time.

In [None]:
def get_recommendations_voting_system(customer_purchase_hist, similar_article_dict, similar_article_score):
    """
    Get the recommendations for a single customer based on its purchase history.

    The candidates are the most popular articles plus, for every article in the purchase history, its first NR_SIMILAR
    similar articles. The score of a candidate is the sum of its similarity scores over all history articles that list
    the candidate among their similar articles. Candidates with equal scores keep the order in which they became a
    candidate (most popular articles first).
    :param customer_purchase_hist: a string with space-separated article_ids, representing the purchase history (first = least recently purchased, last = most recently purchased)
    :param similar_article_dict: a dictionary mapping each article to a tuple with the article_ids of its NR_TO_KEEP (see similarities.ipynb) most similar articles; expected to be ordered from most to least similar and to contain at least NR_SIMILAR entries
    :param similar_article_score: a dictionary mapping each article to a tuple with the similarity scores that correspond, position by position, to the article_ids in similar_article_dict
    :return: a string with (at most) 12 recommendations
    """
    # remove all duplicates (keeping the first occurrence) and all article_ids that are not linked to an image
    customer_purchase_hist = list(dict.fromkeys(
        c for c in article_str_to_list(article_id_str=customer_purchase_hist) if c not in no_image_article_ids
    ))

    # candidate -> score, in insertion order: the most popular articles first, then the most similar article of
    # every history article, then the second most similar article of every history article, ...
    candidate_scores = dict.fromkeys(most_popular_article_ids, 0)
    for rank in range(NR_SIMILAR):
        for history in customer_purchase_hist:
            candidate_scores.setdefault(similar_article_dict[history][rank], 0)

    # every history article votes for the candidates that appear among its similar articles
    for article_id in customer_purchase_hist:
        for candidate, score in zip(similar_article_dict[article_id], similar_article_score[article_id]):
            if candidate in candidate_scores:
                candidate_scores[candidate] += score

    # sorted() is stable, so candidates with an equal score stay in insertion order
    final_candidates = sorted(candidate_scores, key=candidate_scores.get, reverse=True)

    return article_list_to_str(article_id_list=final_candidates[:12])

def create_voting_system_recommendations_thread_function(min_row_ind, max_row_ind, thread_nr, similar_article_dict, similar_article_score):
    """
    Worker of a single thread: apply get_recommendations_voting_system to one batch of customers and write the
    result to a temporary partial submission file.
    :param min_row_ind: position of the first row of the batch (inclusive)
    :param max_row_ind: position right after the last row of the batch (exclusive, it is the end of an iloc slice)
    :param thread_nr: nr of the thread, used in the progress output and in the name of the partial submission file
    :param similar_article_dict: dictionary with the similar article_ids per article, passed on to get_recommendations_voting_system
    :param similar_article_score: dictionary with the similarity scores per article, passed on to get_recommendations_voting_system
    """

    def recommend(hist):
        # recommendations for one purchase history
        return get_recommendations_voting_system(
            customer_purchase_hist=hist,
            similar_article_dict=similar_article_dict,
            similar_article_score=similar_article_score
        )

    print(f"[=>    ] Started              : Thread {thread_nr} ({min_row_ind} --> {max_row_ind})")

    # work on a copy so that the shared dataframe is not modified by the thread
    batch_df = customers_transactions_df.iloc[min_row_ind:max_row_ind].copy()
    batch_df['prediction'] = batch_df['purchase_history'].apply(recommend)

    # only the customer_id and the prediction end up in the partial submission file
    target_path = submission_odp(filename=f'submission_{thread_nr}.csv', creation=True)
    batch_df[['customer_id', 'prediction']].to_csv(target_path, index=False)

    print(f"[=====>] Finished             : Thread {thread_nr} ({min_row_ind} --> {max_row_ind})")

def run_threaded_advanced_similarity_recommendations():
    """
    Find recommendations for all customers in the dataset.

    The customers are split in batches of NR_ROWS_PER_THREAD rows. Every batch is handled by its own thread, which
    writes a partial submission file (see create_voting_system_recommendations_thread_function).
    :return: the number of threads that were started (= the number of partial submission files)
    """

    similarity_df_article_ids = pd.read_feather(odp(filename=SIMILARITY_ARTICLE_ID_FILE))
    # NOTE(review): the levels start at 1, so column 0 is skipped - presumably it identifies the article itself; confirm against similarities.ipynb
    similarity_levels = range(1, similarity_df_article_ids.shape[1])
    similarity_article_dict = get_similar_article_dict(
        similarity_df_article_ids=similarity_df_article_ids,
        similarity_levels=list(similarity_levels)
    )
    similarity_score_dict = get_similar_score_dict(
        similarity_df_article_ids=similarity_df_article_ids,
        similarity_df_sim=pd.read_feather(odp(filename=SIMILARITY_SCORE_FILE)),
        similarity_levels=list(similarity_levels)
    )

    # create and start one thread per batch of NR_ROWS_PER_THREAD rows
    threads = []
    for thread_nr, min_row_ind in enumerate(range(0, nr_customers, NR_ROWS_PER_THREAD), start=1):
        # clamp the upper bound so that the reported range of the last batch matches the rows that are processed
        max_row_ind = min(nr_customers, min_row_ind + NR_ROWS_PER_THREAD)
        thread = threading.Thread(
            target=create_voting_system_recommendations_thread_function,
            args=(min_row_ind, max_row_ind, thread_nr, similarity_article_dict, similarity_score_dict)
        )
        threads.append(thread)
        thread.start()
        # print() does not apply %-formatting to its arguments, hence the f-string
        print(f"Main    : created and started thread {thread_nr}")

    # join threads
    for thread_nr, thread in enumerate(threads, start=1):
        print(f"Main    : next thread to join: {thread_nr}.")
        thread.join()
        print(f"Main    : thread {thread_nr} done")

    return len(threads)

In [None]:
%%time

# run the pipeline for all customers; the returned number of threads is needed to join the partial submission files
nr_threads = run_threaded_advanced_similarity_recommendations()

In [None]:
%%time

# combine the partial submission files written by the threads (submission_<thread_nr>.csv) into OUTPUT_FILE_NAME
# NOTE(review): presumably the '*' in base_file_name is replaced by the file numbers 1..nr_files - confirm in join_partial_submissions
join_partial_submissions(
    base_file_name=submission_odp(filename=f'submission_*.csv', creation=True),
    trgt_file_name=submission_odp(OUTPUT_FILE_NAME),
    nr_files=nr_threads
)

In [None]:
# clean up the 'submission_creation' directory
# NOTE(review): presumably this is where submission_odp(..., creation=True) stores the partial submission files - confirm
remove_directory_if_exists(directory=odp(filename='submission_creation'))

## Subjective validation
Check the format of the submission file and visually assess the recommendations.

In [None]:
%%time

# check whether the submission file is valid; nr_customers is passed along
# (presumably to verify that the file covers every customer - confirm in check_submission)
check_submission(
    filename=submission_odp(filename=OUTPUT_FILE_NAME),
    nr_customers=nr_customers
)

In [None]:
%%time

# show the case of a randomly chosen customer, using the submission file and the customer data
random_show_customer_case(
    filename=submission_odp(filename=OUTPUT_FILE_NAME),
    customer_transactions_df=customers_transactions_df
)

In [None]:
%%time

# show the case of one fixed customer: VALIDATE_CUSTOMER is passed as customer_id
# NOTE(review): presumably interpreted as a row index rather than as an actual customer id - confirm in show_customer_case
show_customer_case(
    customer_id=VALIDATE_CUSTOMER,
    filename=submission_odp(filename=OUTPUT_FILE_NAME),
    customer_transactions_df=customers_transactions_df
)

In [None]:
%reset -f