# Recommendation - recency, pairwise
Get the 12 most recently purchased (unique) articles (with an image) of a user, and recommend for each of these 12 articles an article that is similar to it. If a user has fewer than 12 (unique) purchases, the 12 most popular items are used to supplement the list so that 12 recommendations are obtained in total.

In [None]:
import threading
from functions import *

In [None]:
# Configuration of this notebook run.
NR_ROWS_PER_THREAD = 6000                                                   # nr of customers to process in each thread
# NOTE(review): empty placeholder - this value is passed to odp(filename=...) and read with pd.read_feather, so it has to be filled in before running the pipeline
SIMILARITY_ARTICLE_ID_FILE = ''                                             # input file (containing the similar article ids)
SIMILARITY_LEVEL = 1                                                        # how similar should the pairwise match be (0 = same, 1 = most similar,...)
OUTPUT_FILE_NAME = f'submission_recency_pairwise_{SIMILARITY_LEVEL}.csv'    # name of the submission file (contains the similarity level)
# NOTE(review): described as an index, but it is passed as customer_id to show_customer_case - confirm which of the two that function expects
VALIDATE_CUSTOMER = 3                                                       # the index of a customer to validate

In [None]:
# processed article table (needs at least the columns article_id, image_name and popularity)
article_df = pd.read_feather(idp(filename='articles_processed.feather'))
# ids of the articles without an image; kept as a set because it is only used for membership tests,
# which run once per purchased article of every customer (O(1) in a set versus a full scan of a list)
no_image_article_ids = set(article_df[article_df['image_name'] == 'does not exist']['article_id'].values.tolist())
# the 12 articles with the highest popularity, most popular first (used to supplement the recommendations)
most_popular_article_ids = article_df.sort_values(by='popularity', ascending=False).head(12)['article_id'].values.tolist()

# one row per customer is assumed (needs at least the columns customer_id and purchase_history) - TODO confirm
customers_transactions_df = pd.read_feather(idp(filename='customers_transactions_processed.feather'))
nr_customers = customers_transactions_df.shape[0]

## Submission creation pipeline
The pipeline uses multithreading to reduce the execution time.

In [None]:
def get_recommendations_recency_pairwise(customer_purchase_hist, similar_article_dict, popular_article_list):
    """
    Get the recommendations for a single customer based on its purchase history, by looking at the 12 most recent (unique) purchases with an image and finding the i-th most similar article to each of these purchased articles. Supplement the recommendations with the most popular items (important in case the customer had < 12 purchases). For this function, it is not important what i we have (for the i-th most similar), because this information is already encoded in the similar_article_dict.
    Articles whose id is in the notebook-level no_image_article_ids are ignored.
    :param customer_purchase_hist: a string with space-separated article_ids, representing the purchase history (first = least recently purchased, last = most recently purchased)
    :param similar_article_dict: a dictionary mapping each article to a tuple with 1 article_id that is considered as the i-th most similar article
    :param popular_article_list: a list of the 12 most popular article (ids)
    :return: a string with (at most) 12 unique recommendations, ordered from the least to the most recent of the considered purchases, followed by the popular articles
    :raises KeyError: if a purchased article is missing in similar_article_dict (assuming it is a plain dict)
    """
    purchases_with_image = [
        article_id
        for article_id in article_str_to_list(article_id_str=customer_purchase_hist)
        if article_id not in no_image_article_ids
    ]
    # Deduplicate starting from the most recent purchase, so that an article that was bought several times is
    # ranked by its LATEST purchase. Deduplicating from the oldest purchase would keep the first occurrence and
    # could cut a recently re-purchased article out of the 12 most recent ones.
    most_recent_first = list(dict.fromkeys(reversed(purchases_with_image)))[:12]
    # restore the chronological order (least recent first) for the output
    recent_unique_purchases = most_recent_first[::-1]

    similar_articles = [similar_article_dict[article_id][0] for article_id in recent_unique_purchases]
    # the popular articles only fill the places that are left; dict.fromkeys removes duplicates while keeping the order
    recommended_articles = list(dict.fromkeys(similar_articles + popular_article_list))
    return article_list_to_str(article_id_list=recommended_articles[:12])

def create_recency_pairwise_recommendations_thread_function(min_row_ind, max_row_ind, thread_nr, similar_article_dict, similarity_level):
    """
    Compute the recommendations for one batch of customers and store them in a partial submission file.
    The batch consists of the rows min_row_ind (inclusive) up to max_row_ind (exclusive) of customers_transactions_df;
    the result (columns customer_id and prediction) is written to 'submission_<thread_nr>.csv'.
    :param min_row_ind: first row index of the batch (inclusive)
    :param max_row_ind: row index at which the batch stops (exclusive)
    :param thread_nr: nr of the thread (used in the progress messages and in the name of the partial file)
    :param similar_article_dict: dictionary mapping articles to their i-th similar article
    :param similarity_level: the similarity level i (not used inside this function)
    """

    def recommend_for_history(purchase_history):
        # recommendations for a single customer, supplemented with the notebook-level most popular articles
        return get_recommendations_recency_pairwise(
            customer_purchase_hist=purchase_history,
            similar_article_dict=similar_article_dict,
            popular_article_list=most_popular_article_ids,
        )

    print(f"[=>    ] Started              : Thread {thread_nr} ({min_row_ind} --> {max_row_ind})")

    # work on a copy, such that adding the prediction column does not touch the shared dataframe
    customer_batch = customers_transactions_df.iloc[min_row_ind:max_row_ind].copy()
    customer_batch['prediction'] = customer_batch['purchase_history'].apply(recommend_for_history)

    partial_file = submission_odp(filename=f'submission_{thread_nr}.csv', creation=True)
    customer_batch[['customer_id', 'prediction']].to_csv(partial_file, index=False)

    print(f"[=====>] Finished             : Thread {thread_nr} ({min_row_ind} --> {max_row_ind})")

def run_threaded_recency_pairwise_recommendations():
    """
    Find recommendations for all customers in the dataset.
    The customers are split in batches of NR_ROWS_PER_THREAD rows; every batch is processed in its own thread, which writes a partial submission file. The function only returns after all threads have finished.
    :return: the number of threads that were started (= the number of partial submission files)
    :raises ValueError: if NR_ROWS_PER_THREAD is smaller than 1
    """

    if NR_ROWS_PER_THREAD < 1:
        # a batch size below 1 can never cover all customers (it used to result in an endless creation of threads)
        raise ValueError(f"NR_ROWS_PER_THREAD must be at least 1, got {NR_ROWS_PER_THREAD}")

    similar_article_dict = get_similar_article_dict(
        similarity_df_article_ids=pd.read_feather(odp(filename=SIMILARITY_ARTICLE_ID_FILE)),
        similarity_levels=[SIMILARITY_LEVEL]
    )

    # create threads: one per batch of NR_ROWS_PER_THREAD customers
    threads = []
    for thread_nr, min_row_ind in enumerate(range(0, nr_customers, NR_ROWS_PER_THREAD), start=1):
        max_row_ind = min(nr_customers, min_row_ind + NR_ROWS_PER_THREAD)   # exclusive upperbound of the batch
        print(f"Main    : created and started thread {thread_nr}")
        # create and start thread
        thread = threading.Thread(
            target=create_recency_pairwise_recommendations_thread_function,
            args=(min_row_ind, max_row_ind, thread_nr, similar_article_dict, SIMILARITY_LEVEL)
        )
        threads.append(thread)
        thread.start()

    # join threads (in the order in which they were started)
    for thread_nr, thread in enumerate(threads, start=1):
        print(f"Main    : next thread to join: {thread_nr}.")
        thread.join()
        print(f"Main    : thread {thread_nr} done")

    return len(threads)

In [None]:
%%time

# run the complete pipeline; the returned number of threads is needed afterwards to join the partial submission files
nr_threads = run_threaded_recency_pairwise_recommendations()

In [None]:
%%time

# combine the partial files written by the threads ('submission_<thread_nr>.csv') into the final file OUTPUT_FILE_NAME
# NOTE(review): how the '*' in the base file name is resolved is decided by join_partial_submissions (functions module) - presumably it is replaced by 1..nr_files
join_partial_submissions(
    base_file_name=submission_odp(filename=f'submission_*.csv', creation=True),
    trgt_file_name=submission_odp(OUTPUT_FILE_NAME),
    nr_files=nr_threads
)

In [None]:
# clean up the 'submission_creation' directory
# NOTE(review): presumably this is where submission_odp(..., creation=True) stores the partial files - confirm in the functions module
remove_directory_if_exists(directory=odp(filename='submission_creation'))

## Subjective validation
Check the format of the submission file and visually assess the recommendations.

In [None]:
%%time

# check whether the submission file is valid (the expected number of customers is passed along for the check)
check_submission(
    filename=submission_odp(filename=OUTPUT_FILE_NAME),
    nr_customers=nr_customers
)

In [None]:
%%time

# show the case of a randomly chosen customer, using the submission file and the customer transactions
random_show_customer_case(
    filename=submission_odp(filename=OUTPUT_FILE_NAME),
    customer_transactions_df=customers_transactions_df
)

In [None]:
%%time

# show the customer at index VALIDATE_CUSTOMER
# NOTE(review): the value is passed as customer_id - confirm that show_customer_case interprets it as a row index and not as an actual customer id
show_customer_case(
    customer_id=VALIDATE_CUSTOMER,
    filename=submission_odp(filename=OUTPUT_FILE_NAME),
    customer_transactions_df=customers_transactions_df
)

In [None]:
%reset -f