# Preprocessing
Preparing the data for embedding creation, similarity calculation and recommendation.

In [8]:
import json
import os
import pandas as pd
from functions import idp

## What preprocessing is required
- **Transactions**: the transactions dataframe is very large (31 788 324 records). In general, only the number of transactions is needed.
- **Articles**: articles are important components in the prediction. An `image_name` and a `popularity` column are added to the frame. A subframe with the 100 most popular articles is exported.
- **Customers**: customers have a purchase history which is extracted from the transactions dataframe.

In [9]:
# transactions
# Load the raw transaction log and restore the leading '0' on every article id
# (string concatenation, so the ids are expected to be stored as strings).
transactions_df = pd.read_feather(idp('transactions_train.feather'))
transactions_df['article_id'] = transactions_df['article_id'].radd('0')   # same as '0' + column
# total number of transaction rows, reused further down in the notebook
nr_transactions = len(transactions_df)

In [10]:
# articles
# Load the article metadata, attach the relative image path and a normalised
# popularity score to every article, then persist the enriched frame.
article_df = pd.read_feather(idp('articles.feather'))
article_df['article_id'] = '0' + article_df['article_id']                       # restore the leading zero (ids are strings)
# relative image path is '<first 3 characters of the id>/<id>.jpg'; built with vectorised string ops instead of a row-wise apply
article_df['image_name'] = article_df['article_id'].str[:3] + '/' + article_df['article_id'] + '.jpg'
# replace the path by a sentinel when the image file is not present on disk
article_df['image_name'] = article_df['image_name'].apply(lambda file_name: file_name if os.path.isfile(idp(f'images/{file_name}')) else 'does not exist')
article_ids_without_image = article_df[article_df['image_name'] == 'does not exist']['article_id'].values.tolist()
article_df = article_df.reset_index(drop=True)
print(article_df.shape)

# Number of transactions per article. `rename_axis` + `reset_index(name=...)` yields the
# columns ['article_id', 'popularity'] regardless of how the installed pandas version
# names the result of `value_counts()` (the naming changed in pandas 2.0).
popularity_df = (
    transactions_df['article_id']
    .value_counts()
    .rename_axis('article_id')
    .reset_index(name='popularity')
)
popularity_df['popularity'] = popularity_df['popularity'] / nr_transactions                     # share of all transactions
popularity_df['popularity'] = popularity_df['popularity'] / popularity_df['popularity'].max()   # rescale so the best seller scores 1.0
# left join: keeps every article (also the never-purchased ones) without adding
# metadata-less rows for ids that only occur in the transactions
article_df = article_df.merge(popularity_df, on='article_id', how='left')
article_df['popularity'] = article_df['popularity'].fillna(0.0)                 # non-purchased articles are not popular at all, so assign popularity 0
print(article_df.shape)

article_df.to_feather(idp('articles_processed.feather'))
nr_articles = article_df.shape[0]
article_df.head()

(105542, 26)
(105542, 27)


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,image_name,popularity
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,0,Solid,9,Black,...,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,010/0108775015.jpg,0.215583
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,0,Solid,10,White,...,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,010/0108775044.jpg,0.144172
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1,Stripe,11,Off White,...,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,010/0108775051.jpg,0.004275
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,0,Solid,9,Black,...,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",011/0110065001.jpg,0.020761
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,0,Solid,10,White,...,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",011/0110065002.jpg,0.010718


In [11]:
# extract the 100 most popular articles (highest popularity first)
# NOTE(review): this frame is not written to disk in this notebook and is cleared by the
# final `%reset -f` — confirm whether an export step is missing here.
articles_by_popularity = article_df.sort_values(by='popularity', ascending=False)
most_popular_article_df = articles_by_popularity.iloc[:100]

In [12]:
# customers
customer_df = pd.read_feather(idp('customers.feather'))
customer_id_df = customer_df[['customer_id']].copy()

# Build one row per customer holding the space-separated ids of everything they bought.
# STEP 1: collapse the transactions of each customer into a single 'purchase_history' string
purchase_history_df = (
    transactions_df
    .groupby(['customer_id'], as_index=False)
    .agg({'article_id': ' '.join})
    .rename(columns={'article_id': 'purchase_history'})
)
# STEP 2: customers that never show up in the transactions get an empty history
has_purchased = customer_id_df['customer_id'].isin(purchase_history_df['customer_id'])
no_purchase_customers_df = customer_id_df[~has_purchased].assign(purchase_history='')
# STEP 3: stack buyers and non-buyers so that every customer is covered, then persist
customer_transactions_df = pd.concat(
    [purchase_history_df, no_purchase_customers_df],
    ignore_index=True,
)
customer_transactions_df.to_feather(idp('customer_transactions_processed.feather'))

nr_customers = len(customer_df)
customer_transactions_df.head()

Unnamed: 0,customer_id,purchase_history
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0625548001 0176209023 0627759010 0697138006 05...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0583558001 0639677008 0640244003 0521269001 06...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001 0541518023 0663713001 0578020002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0742079001 0732413001
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0634249005 0677049001 0698286003 0707704003 03...


In [13]:
# dictionary with nr of rows in each frame
# Written through a context manager so the file is flushed and closed deterministically
# instead of relying on garbage collection of an anonymous file handle.
nr_rows_per_frame = {'transactions': nr_transactions, 'articles': nr_articles, 'customers': nr_customers}
with open(idp('nr_rows_per_frame.json'), 'w') as nr_rows_file:
    json.dump(nr_rows_per_frame, nr_rows_file)

In [14]:
# clear the kernel namespace (all names defined above) without asking for confirmation (-f)
%reset -f