In [1]:
from ranker import EmbeddingRanker
from embeddings import create_image_embeddings, create_text_embeddings, concatenate_embeddings
from evaluation import evaluate, recall, average_bestseller_rank, example_similarity, example_customer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root directory of the project, given relative to the kernel's working directory.
# rank() prefixes its data/ and submissions/ paths with it, and it is passed as the
# first argument to the embeddings and evaluation helpers called below.
BASE_PATH = ".."

In [4]:
def rank(embedding_type, test_week=104, reduction_size=10, rank_method="plain"):
    """Rank articles with one set of embeddings and save the predictions as a submission.

    Args:
        embedding_type: Name of the embedding set, e.g. "text_plain" or "image". It selects
            `data/embeddings/{embedding_type}_embeddings.parquet` and
            `data/indices/{embedding_type}_index_{reduction_size}.ann` under BASE_PATH,
            and it names the output file.
        test_week: Week for which recommendations are generated.
        reduction_size: Size of the embeddings after PCA dimensionality reduction; also
            part of the index file name.
        rank_method: Forwarded to `EmbeddingRanker.rank`; controls whether a set of popular
            items is added to each user profile before ranking.

    Returns:
        None. The predictions are written to `{BASE_PATH}/submissions/{embedding_type}.csv.gz`.

    Note:
        The output file name does not include `test_week`, `reduction_size` or
        `rank_method`, so running again with different settings overwrites the
        previous submission for the same `embedding_type`.
    """
    import os

    # Create the output directory before the ranking starts: writing the CSV does not
    # create missing parent directories, and failing only after the (potentially long)
    # ranking step would waste the whole run.
    submissions_dir = f"{BASE_PATH}/submissions"
    os.makedirs(submissions_dir, exist_ok=True)

    embeddings = f"{BASE_PATH}/data/embeddings/{embedding_type}_embeddings.parquet"
    index = f"{BASE_PATH}/data/indices/{embedding_type}_index_{reduction_size}.ann"

    ranker = EmbeddingRanker(test_week, BASE_PATH, embeddings, index, reduction_size)
    predictions = ranker.rank(rank_method)

    # index=False keeps the row index out of the submission file.
    predictions.to_csv(f"{submissions_dir}/{embedding_type}.csv.gz", index=False)

# Creating embeddings

The following cells generate embeddings using various input datasets and embedding strategies.
Respectively:
- image
- text, plain template, class token pooling
- text, descriptive template, class token pooling
- text, elaborative template, class token pooling
- text, plain template, average pooling
- text, descriptive template, average pooling
- text, elaborative template, average pooling
- concatenated embeddings, from image and text plain average embeddings

In [None]:
create_image_embeddings(BASE_PATH, "data/images", "data/embeddings/image_embeddings.parquet")

In [None]:
create_text_embeddings(BASE_PATH, "data/articles.csv", "data/embeddings/text_plain_embeddings.parquet", template="plain", average=False)

In [None]:
create_text_embeddings(BASE_PATH, "data/articles.csv", "data/embeddings/text_descriptive_embeddings.parquet", template="descriptive", average=False)

In [None]:
create_text_embeddings(BASE_PATH, "data/articles.csv", "data/embeddings/text_elaborative_embeddings.parquet", template="elaborative", average=False)

In [None]:
create_text_embeddings(BASE_PATH, "data/articles.csv", "data/embeddings/text_plain_average_embeddings.parquet", template="plain", average=True)

In [None]:
create_text_embeddings(BASE_PATH, "data/articles.csv", "data/embeddings/text_descriptive_average_embeddings.parquet", template="descriptive", average=True)

In [None]:
create_text_embeddings(BASE_PATH, "data/articles.csv", "data/embeddings/text_elaborative_average_embeddings.parquet", template="elaborative", average=True)

In [None]:
concatenate_embeddings(BASE_PATH, "data/embeddings/text_plain_average_embeddings.parquet", "data/embeddings/image_embeddings.parquet", "data/embeddings/concatenated_embeddings.parquet")

# Ranking embeddings

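The next cells recommend items to users by ranking articles with the embeddings created in the previous cells. Each call writes its predictions to `submissions/<embedding type>.csv.gz`.
The ranking takes the following parameters (defaults: test week 104, reduction size 10, rank method "plain"):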
- test week: week of recommendations
- reduction size: size of the embeddings used after applying PCA dimensionality reduction
- rank method: whether to add a set of popular items to each user profile before ranking

In [None]:
rank("text_plain")

In [None]:
rank("text_descriptive")

In [None]:
rank("text_elaborative")

In [None]:
rank("text_plain_average")

In [None]:
rank("text_descriptive_average")

In [None]:
rank("text_elaborative_average")

In [None]:
rank("image")

In [None]:
rank("concatenated")

# Evaluation

Use the MAP@12 metric to evaluate submissions. Only possible for weeks before week 105.

In [7]:
evaluate(BASE_PATH, "submissions/baseline.csv.gz")

0.025080605661718477

In [8]:
evaluate(BASE_PATH, "submissions/text_plain.csv.gz")

0.002661550323226012

In [9]:
evaluate(BASE_PATH, "submissions/text_descriptive.csv.gz")

0.0026303034621545916

In [10]:
evaluate(BASE_PATH, "submissions/text_elaborative.csv.gz")

0.0026880948298163266

In [11]:
evaluate(BASE_PATH, "submissions/text_plain_average.csv.gz")

0.0029882516573240753

In [12]:
evaluate(BASE_PATH, "submissions/text_descriptive_average.csv.gz")

0.002810966063604121

In [13]:
evaluate(BASE_PATH, "submissions/text_elaborative_average.csv.gz")

0.002837631340666684

In [14]:
evaluate(BASE_PATH, "submissions/image.csv.gz")

0.002358322486182535

In [15]:
evaluate(BASE_PATH, "submissions/concatenated.csv.gz")

0.0019908935129062085

# Recall

Use recall to evaluate submissions. Only possible for weeks before week 105.

In [16]:
recall(BASE_PATH, "submissions/baseline.csv.gz")

0.050023692224558845

In [17]:
recall(BASE_PATH, "submissions/text_plain.csv.gz")

0.004792798682462355

In [18]:
recall(BASE_PATH, "submissions/text_descriptive.csv.gz")

0.005156421484626903

In [19]:
recall(BASE_PATH, "submissions/text_elaborative.csv.gz")

0.004974840250357991

In [20]:
recall(BASE_PATH, "submissions/text_plain_average.csv.gz")

0.005501204882207673

In [21]:
recall(BASE_PATH, "submissions/text_descriptive_average.csv.gz")

0.00539605569123474

In [22]:
recall(BASE_PATH, "submissions/text_elaborative_average.csv.gz")

0.005469688229976411

In [23]:
recall(BASE_PATH, "submissions/image.csv.gz")

0.0034448756405003923

In [24]:
recall(BASE_PATH, "submissions/concatenated.csv.gz")

0.0031103364728088624

# Average bestseller rank

Use the average bestseller rank to gauge how much popularity was used to generate recommendations.

In [25]:
average_bestseller_rank(BASE_PATH, "submissions/baseline.csv.gz")

24.739998560159766

In [26]:
average_bestseller_rank(BASE_PATH, "submissions/text_plain.csv.gz")

274.1715304786346

In [27]:
average_bestseller_rank(BASE_PATH, "submissions/text_descriptive.csv.gz")

274.2201845885987

In [28]:
average_bestseller_rank(BASE_PATH, "submissions/text_elaborative.csv.gz")

274.8891417400807

In [29]:
average_bestseller_rank(BASE_PATH, "submissions/text_plain_average.csv.gz")

272.7340280062186

In [30]:
average_bestseller_rank(BASE_PATH, "submissions/text_descriptive_average.csv.gz")

273.3873839826865

In [31]:
average_bestseller_rank(BASE_PATH, "submissions/text_elaborative_average.csv.gz")

273.0164470719274

In [32]:
average_bestseller_rank(BASE_PATH, "submissions/image.csv.gz")

273.9967587981713

In [33]:
average_bestseller_rank(BASE_PATH, "submissions/concatenated.csv.gz")

273.5875300571965

# Examples

Methods to generate examples of similarity between items and user profiles.

In [34]:
# Query article for the similarity example.
article_id = 290519011
# The recorded output lists the query article's text, its most similar article on
# text embeddings (with that article's text) and its most similar article on image embeddings.
example_similarity(BASE_PATH, article_id)

article text: Basic co/fl cardigan, Cardigan, Garment Upper body, Solid, Turquoise, Medium Dusty, Turquoise, Baby basics, Baby Sizes 50-98, Baby/Children, Baby Essentials & Complements, Jersey Basic, Cardigan in soft organic cotton sweatshirt fabric with a stand-up collar, press-studs down the front, and ribbing around the neckline, cuffs and hem. Soft brushed inside.
most similar article on text embeddings: 290519017
most similar article text: Basic co/fl cardigan, Cardigan, Garment Upper body, Solid, Pink, Medium Dusty, Pink, Baby basics, Baby Sizes 50-98, Baby/Children, Baby Essentials & Complements, Jersey Basic, Cardigan in soft organic cotton sweatshirt fabric with a stand-up collar, press-studs down the front, and ribbing around the neckline, cuffs and hem. Soft brushed inside.
most similar article on image embeddings: 318951002


In [3]:
# Customer whose profile and recommendations are inspected.
customer_id = 1288838156741401
# The recorded output shows the customer's profile articles, the recommended article ids
# and the profile article texts. "text_plain_average" is presumably the embedding type,
# since it matches a name passed to rank() -- confirm against example_customer.
example_customer(BASE_PATH, customer_id, "text_plain_average")

user_profile=array([627769007, 703296001, 705827001], dtype=int32)
recommendations=['762618002', '762618001', '872126001', '667444003', '791896001', '800141001', '872126007', '684326001', '895289001', '894330001', '762618004', '759970001']
user profile article texts: ['ELAINE parka, Outdoor Waistcoat, Garment Upper body, Solid, Dark Blue, Dark, Blue, Young Girl Outdoor, Children Sizes 134-170, Baby/Children, Kids Outerwear, Outdoor, Padded parka in woven fabric with a velvety finish. Detachable, pile-lined hood with a faux fur trim, zip down the front with a chin guard, and a concealed drawstring at the waist. Handwarmer pockets at the top, flap front pockets with a press-stud, and a single back vent. Quilted lining.', 'Nick Puffer, Jacket, Garment Upper body, Colour blocking, Red, Medium, Red, Jacket Casual, Menswear, Menswear, Mens Outerwear, Outdoor, Padded jacket in woven fabric with a fleece-lined stand-up collar and zip down the front. Zipped side pockets, one inner pocket with a

In [4]:
# Same customer id as in the "text_plain_average" example, so the two recommendation
# lists can be compared directly.
customer_id = 1288838156741401
# Here the third argument is "image" (presumably the embedding type -- confirm against
# example_customer).
example_customer(BASE_PATH, customer_id, "image")

user_profile=array([627769007, 703296001, 705827001], dtype=int32)
recommendations=['703296001', '653597001', '873419002', '801205001', '845918006', '678078001', '625196001', '625532005', '671564001', '819231002', '799973001', '842067002']
user profile article texts: ['ELAINE parka, Outdoor Waistcoat, Garment Upper body, Solid, Dark Blue, Dark, Blue, Young Girl Outdoor, Children Sizes 134-170, Baby/Children, Kids Outerwear, Outdoor, Padded parka in woven fabric with a velvety finish. Detachable, pile-lined hood with a faux fur trim, zip down the front with a chin guard, and a concealed drawstring at the waist. Handwarmer pockets at the top, flap front pockets with a press-stud, and a single back vent. Quilted lining.', 'Nick Puffer, Jacket, Garment Upper body, Colour blocking, Red, Medium, Red, Jacket Casual, Menswear, Menswear, Mens Outerwear, Outdoor, Padded jacket in woven fabric with a fleece-lined stand-up collar and zip down the front. Zipped side pockets, one inner pocket with a