# Imports

In [58]:
import pandas as pd
import numpy as np

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


# Load data

In [6]:
df_products = pd.read_csv("../data/csv/products_2025-10-18.csv")
df_products

Unnamed: 0,data_id,price,brand,category,product_name,units,quantity,unit_type,store_name,image_url,start_date,end_date
0,10764946,8.0,Hello Sensitive,Babypleje,Vådservietter,1,100,pk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24
1,10764948,10.0,Hello Sensitive,Babypleje,Skumklude,1,50,pk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24
2,10764968,89.0,Fitness Pharma,Kosttilskud,Mineraler,1,250,bg.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24
3,10764969,89.0,Fitness Pharma,Kosttilskud,Kosttilskud,1,200,ds.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24
4,10764967,89.0,Fitness Pharma,Fiskeolie,Fiskeolie,1,200,ds.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24
...,...,...,...,...,...,...,...,...,...,...,...,...
388,10764691,40.0,,Cykeltilbehør,Cykelovertræk,1,0,stk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24
389,10764694,40.0,,Cykeltilbehør,Cykellås,1,0,stk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24
390,10764696,30.0,,Cykeltilbehør,Cykelkurvovertræk,1,0,stk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24
391,10764709,119.0,,Boligtekstiler,Duge,1,0,stk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24


# Load models

## Encoder

In [3]:
model_name = "KennethTM/MiniLM-L6-danish-encoder-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_encoder = AutoModelForCausalLM.from_pretrained(model_name)
nlp_pipeline = pipeline("text-generation", model=model_encoder, tokenizer=tokenizer, device='cpu')

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of BertLMHeadModel were not initialized from the model checkpoint at KennethTM/MiniLM-L6-danish-encoder-v2 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


## Translator

In [7]:
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-da-en", device='cpu')

Device set to use cpu


# Embed products

In [5]:
# Given a query
query = ['Kører der cykler på vejen?']

# And two passages
passage = ['I Danmark er cykler et almindeligt transportmiddel, og de har lige så stor ret til at bruge vejene som bilister. Cyklister skal dog følge færdselsreglerne og vise hensyn til andre trafikanter.', 
           'Solen skinner, og himlen er blå. Der er ingen vind, og temperaturen er perfekt. Det er den perfekte dag til at tage en tur på landet og nyde den friske luft.']

# Compute embeddings
model = SentenceTransformer("KennethTM/MiniLM-L6-danish-encoder-v2")
query_embeddings = model.encode(query)
passage_embeddings = model.encode(passage)

# To find most relevant passage for the query (closer to 1 means more similar)
cosine_scores = cos_sim(query_embeddings, passage_embeddings)
print(cosine_scores)

tensor([[0.6316, 0.2021]])


  return forward_call(*args, **kwargs)


# ML procedure

## Create labels

In [None]:
pd.set_option('display.max_rows', 500)
display(df_products[['price','brand','category','product_name']].sort_values('product_name'))
pd.set_option('display.max_rows', 20)

In [23]:
preferences = [180, 54, 330, 282, 157, 218] # index
df_products['preference'] = df_products.index.isin(preferences).astype(int) * 1
df_products

Unnamed: 0,data_id,price,brand,category,product_name,units,quantity,unit_type,store_name,image_url,start_date,end_date,preference
0,10764946,8.0,Hello Sensitive,Babypleje,Vådservietter,1,100,pk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0
1,10764948,10.0,Hello Sensitive,Babypleje,Skumklude,1,50,pk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0
2,10764968,89.0,Fitness Pharma,Kosttilskud,Mineraler,1,250,bg.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0
3,10764969,89.0,Fitness Pharma,Kosttilskud,Kosttilskud,1,200,ds.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0
4,10764967,89.0,Fitness Pharma,Fiskeolie,Fiskeolie,1,200,ds.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,10764691,40.0,,Cykeltilbehør,Cykelovertræk,1,0,stk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0
389,10764694,40.0,,Cykeltilbehør,Cykellås,1,0,stk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0
390,10764696,30.0,,Cykeltilbehør,Cykelkurvovertræk,1,0,stk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0
391,10764709,119.0,,Boligtekstiler,Duge,1,0,stk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0


## Data processing

In [24]:
df_products_processed = df_products.dropna(subset=['product_name']).reset_index(drop=True)
df_products_processed['translated_name'] = df_products_processed['product_name'].apply(lambda x: translator(x)[0]['translation_text'])

In [25]:
df_products_processed

Unnamed: 0,data_id,price,brand,category,product_name,units,quantity,unit_type,store_name,image_url,start_date,end_date,preference,translated_name
0,10764946,8.0,Hello Sensitive,Babypleje,Vådservietter,1,100,pk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0,Wet napkins
1,10764948,10.0,Hello Sensitive,Babypleje,Skumklude,1,50,pk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0,Foam cloths
2,10764968,89.0,Fitness Pharma,Kosttilskud,Mineraler,1,250,bg.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0,Minerals
3,10764969,89.0,Fitness Pharma,Kosttilskud,Kosttilskud,1,200,ds.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0,Food supplements
4,10764967,89.0,Fitness Pharma,Fiskeolie,Fiskeolie,1,200,ds.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0,Fish oil
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,10764691,40.0,,Cykeltilbehør,Cykelovertræk,1,0,stk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0,Bicycle Coating
389,10764694,40.0,,Cykeltilbehør,Cykellås,1,0,stk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0,Bikelock
390,10764696,30.0,,Cykeltilbehør,Cykelkurvovertræk,1,0,stk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0,Bicycle basket cover
391,10764709,119.0,,Boligtekstiler,Duge,1,0,stk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0,Tablecloths


In [52]:
onehot = ['brand','category','unit_type','store_name']

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore'), onehot),
    ("price", StandardScaler(), ["price"]),
    ("product_name", TfidfVectorizer(max_features=50), "translated_name")
])


## Data split

In [None]:
train, test = train_test_split(df_products_processed, test_size=0.2, random_state=42)

## Train model

In [77]:
X = preprocessor.fit_transform(train).toarray()
user_profile = X[(train["preference"] == 1).values].mean(axis=0).reshape(1,-1)

## Evaluate model

In [84]:
df_userprofile = pd.DataFrame(user_profile, columns=preprocessor.get_feature_names_out())
df_userprofile

Unnamed: 0,cat__brand_Americano,cat__brand_Amora,cat__brand_Athena,cat__brand_Bacon,cat__brand_Bakersfield,cat__brand_Becel,cat__brand_Biotex,cat__brand_Blanding 72,cat__brand_Blanding 95,cat__brand_Bonduelle,...,product_name__size,product_name__skyr,product_name__snacks,product_name__soda,product_name__vanilla,product_name__white,product_name__wine,product_name__with,product_name__yarn,product_name__yogurt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
X_new = preprocessor.transform(test).toarray()
similarities = cosine_similarity(X_new, user_profile)

test["score"] = similarities
recommendations = test.sort_values("score", ascending=False)
display(recommendations.head(5))

Unnamed: 0,data_id,price,brand,category,product_name,units,quantity,unit_type,store_name,image_url,start_date,end_date,preference,translated_name,score
141,10764497,18.0,Hatting,Sandwich & Toastbrød,Klapper Havre Sandwich,1,0,ps.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0,Claps Oats Sandwich,0.752987
289,10764735,10.0,Saltlinser,Chokolade,Dragée,1,0,ps.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0,Dragée,0.72428
90,10764684,30.0,Mutti,Grønt,Tomater,1,1,pk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0,Tomatoes,0.705453
333,10764627,10.0,La Campagna,"Pålæg, skiveskåret",Mortadella,1,0,pk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0,Mortadella,0.70262
82,10764974,11.0,Nordic Spirit,"Sæbe, bad, dusch",Flydende sæbe,1,0,stk.,Netto,https://static.tilbudsugen.dk/1st-retail/2025/...,2025-10-18,2025-10-24,0,Liquid soap,0.684841
