# Preprocessing
Preparing the data for embedding creation, similarity calculation and recommendation.

In [8]:
import os
import pandas as pd
from functions import idp, odp

## What preprocessing is required
Direct frame manipulation:
- **Transactions**: the transactions dataframe is very large (3 178 8324 records). In general, only the number of transactions is needed.
- **Articles**: articles are important components in the prediction. An `image_name` and `popularity` column are added to the frame.
- **Customers**: customers have a purchase history which is extracted from the transactions dataframe.

More advanced frame manipulation:
- **Cold start**: need backup recommendations for users with no or only a few purchases

## Transactions
Loading the transactions dataframe is only with purpose of some manipulations on articles or customers.

In [9]:
# transactions
transactions_df = pd.read_feather(idp('transactions_train.feather'))
transactions_df['article_id'] = '0' + transactions_df['article_id']
nr_transactions = transactions_df.shape[0]

## Articles
Two important aspects:
- Add `image_name` column to the dataframe
- Add `popularity` column to the dataframe

In [10]:
# articles
article_df = pd.read_feather(idp('articles.feather'))
article_df['article_id'] = '0' + article_df['article_id']
article_df['image_name'] = article_df.apply(lambda row: f'{row["article_id"][:3]}/{row["article_id"]}.jpg', axis=1)
article_df['image_name'] = article_df['image_name'].apply(lambda file_name: file_name if os.path.isfile(idp(f'images/{file_name}')) else 'does not exist')
article_ids_without_image = article_df[article_df['image_name'] == 'does not exist']['article_id'].values.tolist()
article_df = article_df.reset_index(drop=True)
print(article_df.shape)

popularity_df = pd.DataFrame(transactions_df['article_id'].value_counts())
popularity_df = popularity_df.reset_index()
popularity_df = popularity_df.rename(columns={'index': 'article_id', 'article_id': 'popularity'})
popularity_df['popularity'] = popularity_df['popularity'] / nr_transactions
popularity_df['popularity'] = popularity_df['popularity'] / popularity_df['popularity'].max()
article_df = article_df.merge(popularity_df, on='article_id', how='outer')      # outer join to not exclude articles that are never purchased
article_df['popularity'] = article_df['popularity'].fillna(0.0)                 # non-purchased articles are not popular at all, so assign popularity 0
print(article_df.shape)

article_df.to_feather(idp('articles_processed.feather'))
nr_articles = article_df.shape[0]
article_df.head()

(105542, 26)
(105542, 27)


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,image_name,popularity
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,0,Solid,9,Black,...,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,010/0108775015.jpg,0.215583
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,0,Solid,10,White,...,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,010/0108775044.jpg,0.144172
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1,Stripe,11,Off White,...,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,010/0108775051.jpg,0.004275
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,0,Solid,9,Black,...,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",011/0110065001.jpg,0.020761
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,0,Solid,10,White,...,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",011/0110065002.jpg,0.010718


## Customers
Extract purchase history

In [12]:
# customers
customer_df = pd.read_feather(idp('customers.feather'))
customer_id_df = pd.DataFrame(customer_df['customer_id'].copy())

# create a customer transaction dataframe
# STEP 1: fetch transactions from the transactions dataframe
customer_transactions_df = transactions_df.groupby(['customer_id'], as_index=False).agg({'article_id': ' '.join})
customer_transactions_df = customer_transactions_df.rename(columns={'article_id': 'purchase_history'})
# STEP 2: determine all customers that did not have any transaction
no_purchase_customers_df = customer_id_df[~customer_id_df['customer_id'].isin(customer_transactions_df['customer_id'])].copy()
no_purchase_customers_df['purchase_history'] = ''
# STEP 3: join both dataframes to cover all customers
customer_transactions_df = pd.concat([customer_transactions_df, no_purchase_customers_df], ignore_index=True)
# STEP 4: fill NaN ages with the mean age
customer_transactions_df['age'] = customer_transactions_df['age'].fillna(round(customer_transactions_df['age'].mean())).astype(int)
# STEP 5: reverse the history such that the most recent purchase comes first
customer_transactions_df['purchase_history'] = customer_transactions_df['purchase_history'].apply(lambda x: ' '.join(reversed(x.split(' '))))

customer_transactions_df.to_feather(idp('customer_transactions_processed.feather'))

nr_customers = customer_df.shape[0]
customer_transactions_df.head()

Unnamed: 0,customer_id,purchase_history
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0625548001 0176209023 0627759010 0697138006 05...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0583558001 0639677008 0640244003 0521269001 06...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001 0541518023 0663713001 0578020002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0742079001 0732413001
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0634249005 0677049001 0698286003 0707704003 03...


## Cold start
For a cold start, I decided not to recommend just the popular items of the last 4 weeks, but the popular items within a window of 5 around the current age of a customer. Thus, customers with age 33 that need additional recommendations will get the most popular items within the age window 31 - 35 in the last 4 weeks. Items from last week have more weights than items in the weeks before.

In [None]:
# some constants
NR_RECENT_WEEKS = 4
WINDOW = 5

In [None]:
# step 1: read in the transactions dataframe and find the transactions made in the last NR_RECENT_WEEKS weeks
transaction_df = pd.read_feather(idp('transactions_train.feather'))
transaction_df = transaction_df.drop(columns=['price', 'sales_channel_id'])
transaction_df['article_id'] = '0' + transaction_df['article_id'].astype(str)
recent_transaction_df = transaction_df[transaction_df['week'] > transaction_df['week'].max() - NR_RECENT_WEEKS]
print(f'{round(100 * recent_transaction_df.shape[0] / transaction_df.shape[0], 2)}% of the transactions took place in the last {NR_RECENT_WEEKS} week(s)')

In [None]:
# step 2: we need to couple ages to the subset of transactions these ages must come from the customers
customer_df = pd.read_feather(idp('customers.feather'))
customer_df = customer_df.drop(
    columns=['FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'postal_code']
)
customer_df['age'] = customer_df['age'].fillna(round(customer_df['age'].mean()))
customer_df['age'] = customer_df['age'].astype(int)
recent_transaction_df = recent_transaction_df.merge(customer_df, how='left', on='customer_id')

In [None]:
# step 3: we need the popularity for each article purchased in the last NR_RECENT_WEEKS weeks
popularity_df = pd.DataFrame(recent_transaction_df['article_id'].value_counts())
popularity_df = popularity_df.reset_index()
popularity_df = popularity_df.rename(columns={'index': 'article_id', 'article_id': 'popularity'})
popularity_df['popularity'] = popularity_df['popularity'] / recent_transaction_df.shape[0]
popularity_df['popularity'] = popularity_df['popularity'] / popularity_df['popularity'].max()

In [None]:
# step 4: ideally, articles purchased in the last week are more likely to get bought next week than articles purchased 2 weeks ago,
# thus multiply the popularity of an article by a factor defined by the week the item is bought in 1 for last week, 0.9 for the week before,...
recent_transaction_df = recent_transaction_df.merge(popularity_df, how='left', on='article_id')
recent_transaction_df['week_factor'] = \
    1 - ((NR_RECENT_WEEKS - (recent_transaction_df['week'] - recent_transaction_df['week'].min() + 1)) * 0.1)
recent_transaction_df['popularity'] *= recent_transaction_df['week_factor']
recent_transaction_df = recent_transaction_df.drop(columns=['customer_id', 'week', 'week_factor'])

In [None]:
# step 5
def get_popular_articles_for_age(window_min_age, window_max_age):
    """
    Function that returns the 12 most popular articles for a given age window within the last NR_RECENT_WEEKS weeks.
    In case the age window is too small to obtain 12 unique articles, the window is enlarged by 2 and the process is
    repeated. In case the age window goes beyond the minimum age or above the maximum age, the window is shifted such
    that it only covers ages occurring in the dataset while having the same window size as before.
    :param window_min_age: the lower bound of the age window
    :param window_max_age: the upper bound of the age window
    :return: a list with the 12 most popular articles for the given age window
    """
    subset_df = pd.DataFrame()
    while subset_df.shape[0] < 12:
        if window_min_age < recent_transaction_df['age'].min():
            diff = recent_transaction_df['age'].min() - window_min_age
            window_min_age = recent_transaction_df['age'].min()
            window_max_age += diff
        elif window_max_age > recent_transaction_df['age'].max():
            diff = window_max_age - recent_transaction_df['age'].max()
            window_max_age = recent_transaction_df['age'].max()
            window_min_age -= diff
        subset_df = recent_transaction_df[recent_transaction_df['age'].isin(range(window_min_age, window_max_age + 1))]
        subset_df = subset_df.sort_values(by='popularity', ascending=False)
        subset_df = subset_df.drop_duplicates(subset=['article_id'], keep='first')
        subset_df = subset_df.reset_index()
        window_min_age -= 1
        window_max_age -= 1
    return subset_df.loc[:11]['article_id'].tolist()

# loop over all ages and get recommendations based on the age window defined by age - X --> age + X, where
# the total size of the window is equal to WINDOW (in practice, the value of X can be derived by WINDOW // 2)
results = []
for age in range(recent_transaction_df['age'].min(), recent_transaction_df['age'].max() + 1):
    article_ids = get_popular_articles_for_age(
        window_min_age=age - WINDOW // 2,
        window_max_age=age + WINDOW // 2
    )
    results.append(article_ids)

In [None]:
# step 6: finally, create a dataframe for the recommendations and store the frame
age_recommendation_df = pd.DataFrame({
    'age': range(recent_transaction_df['age'].min(), recent_transaction_df['age'].max() + 1)
})
columns = list(zip(*results))
for i in range(12):
    age_recommendation_df[f'recommendation_{i + 1}'] = columns[i]
age_recommendation_df.to_feather(odp('cold_start_recommendations.feather'))

In [14]:
%reset -f