# Embeddings

**Note:** Install the *sentence-transformers* package to run this notebook. If you get an error that *sentencepiece* is missing, try restarting and rerunning the notebook.

In [None]:
from functools import lru_cache
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

## Load Data

In [None]:
# path of the CSV file that holds the product data
products_path = "./data/products_train.csv"


@lru_cache(maxsize=1)
def read_product_data() -> pd.DataFrame:
    """Read the CSV file at ``products_path`` into a DataFrame.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    file is parsed on the first call only; later calls return the very same
    DataFrame object.
    """
    frame = pd.read_csv(products_path)
    return frame

In [None]:
# load the product table and keep only the rows of the three locales used below
wanted_locales = ["DE", "UK", "JP"]
all_products = read_product_data()
product_data = all_products[all_products["locale"].isin(wanted_locales)]

# show the first rows as the cell output
product_data.head()

In [None]:
# split the products into German, UK and Japanese subsets,
# keeping only the id and title columns of each
kept_columns = ["id", "title"]
locale_of_product = product_data["locale"]
products_de = product_data.loc[locale_of_product == "DE", kept_columns]
products_uk = product_data.loc[locale_of_product == "UK", kept_columns]
products_jp = product_data.loc[locale_of_product == "JP", kept_columns]

## Load Models

Two transformer models are used to create the embeddings of the product titles:

- cross-en-de-roberta-sentence-transformer
    - https://huggingface.co/T-Systems-onsite/cross-en-de-roberta-sentence-transformer
    - creates embeddings for German and English texts
- luke-japanese-base-lite
    - https://huggingface.co/studio-ousia/luke-japanese-base-lite
    - creates embeddings for Japanese texts


In [None]:
# model used to embed the German and the UK product titles
en_de_model_name = "T-Systems-onsite/cross-en-de-roberta-sentence-transformer"
en_de_model = SentenceTransformer(en_de_model_name)

In [None]:
# model used to embed the Japanese product titles
jp_model_name = "studio-ousia/luke-japanese-base-lite"
jp_model = SentenceTransformer(jp_model_name)

## Compute Embeddings (Titles Only)

The embeddings are computed in batches and stored in three separate files (one per locale). In addition, three logfiles are created, each storing the index of the last successfully processed batch. If the notebook crashes or is interrupted for any other reason, the logfile is read when the notebook is restarted, and processing resumes with the batch after the last successfully processed one.

**Note:** Add two subfolders named *embeddings* and *logfiles* to the data folder to execute the following code cells without errors.

In [None]:
# number of rows that form one batch
batch_size = 2000

# Number of batches per locale, computed with ceiling division.
# `len(...) // batch_size + 1` would yield one batch too many whenever the
# number of rows is an exact multiple of batch_size (or zero), i.e. it would
# schedule a trailing batch that contains no rows at all.
n_batches_de = (len(products_de) + batch_size - 1) // batch_size
n_batches_uk = (len(products_uk) + batch_size - 1) // batch_size
n_batches_jp = (len(products_jp) + batch_size - 1) // batch_size

In [None]:
#########################
##### GERMAN LOCALE #####
#########################

import os

# the logfile stores the index of the last successfully processed batch,
# the csv file receives the embeddings of all batches
de_logfile_path = 'data/logfiles/de_embeddings_log.txt'
de_embeddings_path = "data/embeddings/de_embeddings.csv"

# create the output folders up front, so that a missing folder cannot abort
# the run only after a batch has already been encoded
os.makedirs(os.path.dirname(de_logfile_path), exist_ok=True)
os.makedirs(os.path.dirname(de_embeddings_path), exist_ok=True)

# read logfile to look for last successfully processed batch
# (a logfile whose content is not an integer raises ValueError)
try:
    with open(de_logfile_path, "r") as f:
        prev_batch_idx_de = int(f.read())
except FileNotFoundError:
    # no logfile yet: nothing has been processed, start with the first batch
    prev_batch_idx_de = -1

# the range is empty if all batches have been processed already
for batch_idx in range(prev_batch_idx_de + 1, n_batches_de):

    print(f"processing batch {batch_idx+1} / {n_batches_de}")

    # ---read product data of this batch---
    first_row = batch_idx * batch_size
    last_row = first_row + batch_size
    batch_product_data = products_de.iloc[first_row:last_row]

    # ---compute embeddings---
    emb = en_de_model.encode(batch_product_data["title"].values)

    # ---store embeddings in dataframe---
    # the rows keep the index labels of the products they belong to
    emb_df = pd.DataFrame(emb, index=batch_product_data.index)

    # ---write embeddings to csv file---
    # the first batch (re)creates the file, later batches are appended
    mode = 'w' if batch_idx == 0 else 'a'
    # add header if it is the first batch
    header = batch_idx == 0
    # write to file
    emb_df.to_csv(
        de_embeddings_path,
        header=header,
        mode=mode
    )

    # ---write batch index to log file---
    # NOTE: the csv file is appended before the logfile is updated, so an
    # interruption between these two steps makes the restarted run append
    # this batch a second time.
    with open(de_logfile_path, "w") as f:
        f.write(str(batch_idx))

In [None]:
#####################
##### UK LOCALE #####
#####################

import os

# the logfile stores the index of the last successfully processed batch,
# the csv file receives the embeddings of all batches
uk_logfile_path = 'data/logfiles/uk_embeddings_log.txt'
uk_embeddings_path = "data/embeddings/uk_embeddings.csv"

# create the output folders up front, so that a missing folder cannot abort
# the run only after a batch has already been encoded
os.makedirs(os.path.dirname(uk_logfile_path), exist_ok=True)
os.makedirs(os.path.dirname(uk_embeddings_path), exist_ok=True)

# read logfile to look for last successfully processed batch
# (a logfile whose content is not an integer raises ValueError)
try:
    with open(uk_logfile_path, "r") as f:
        prev_batch_idx_uk = int(f.read())
except FileNotFoundError:
    # no logfile yet: nothing has been processed, start with the first batch
    prev_batch_idx_uk = -1

# the range is empty if all batches have been processed already
for batch_idx in range(prev_batch_idx_uk + 1, n_batches_uk):

    print(f"processing batch {batch_idx+1} / {n_batches_uk}")

    # ---read product data of this batch---
    first_row = batch_idx * batch_size
    last_row = first_row + batch_size
    batch_product_data = products_uk.iloc[first_row:last_row]

    # ---compute embeddings---
    emb = en_de_model.encode(batch_product_data["title"].values)

    # ---store embeddings in dataframe---
    # the rows keep the index labels of the products they belong to
    emb_df = pd.DataFrame(emb, index=batch_product_data.index)

    # ---write embeddings to csv file---
    # the first batch (re)creates the file, later batches are appended
    mode = 'w' if batch_idx == 0 else 'a'
    # add header if it is the first batch
    header = batch_idx == 0
    # write to file
    emb_df.to_csv(
        uk_embeddings_path,
        header=header,
        mode=mode
    )

    # ---write batch index to log file---
    # NOTE: the csv file is appended before the logfile is updated, so an
    # interruption between these two steps makes the restarted run append
    # this batch a second time.
    with open(uk_logfile_path, "w") as f:
        f.write(str(batch_idx))

In [None]:
########################
##### JAPAN LOCALE #####
########################

import os

# the logfile stores the index of the last successfully processed batch,
# the csv file receives the embeddings of all batches
jp_logfile_path = 'data/logfiles/jp_embeddings_log.txt'
jp_embeddings_path = "data/embeddings/jp_embeddings.csv"

# create the output folders up front, so that a missing folder cannot abort
# the run only after a batch has already been encoded
os.makedirs(os.path.dirname(jp_logfile_path), exist_ok=True)
os.makedirs(os.path.dirname(jp_embeddings_path), exist_ok=True)

# read logfile to look for last successfully processed batch
# (a logfile whose content is not an integer raises ValueError)
try:
    with open(jp_logfile_path, "r") as f:
        prev_batch_idx_jp = int(f.read())
except FileNotFoundError:
    # no logfile yet: nothing has been processed, start with the first batch
    prev_batch_idx_jp = -1

# the range is empty if all batches have been processed already
for batch_idx in range(prev_batch_idx_jp + 1, n_batches_jp):

    print(f"processing batch {batch_idx+1} / {n_batches_jp}")

    # ---read product data of this batch---
    first_row = batch_idx * batch_size
    last_row = first_row + batch_size
    batch_product_data = products_jp.iloc[first_row:last_row]

    # ---compute embeddings---
    emb = jp_model.encode(batch_product_data["title"].values)

    # ---store embeddings in dataframe---
    # the rows keep the index labels of the products they belong to
    emb_df = pd.DataFrame(emb, index=batch_product_data.index)

    # ---write embeddings to csv file---
    # the first batch (re)creates the file, later batches are appended
    mode = 'w' if batch_idx == 0 else 'a'
    # add header if it is the first batch
    header = batch_idx == 0
    # write to file
    emb_df.to_csv(
        jp_embeddings_path,
        header=header,
        mode=mode
    )

    # ---write batch index to log file---
    # NOTE: the csv file is appended before the logfile is updated, so an
    # interruption between these two steps makes the restarted run append
    # this batch a second time.
    with open(jp_logfile_path, "w") as f:
        f.write(str(batch_idx))