In [1]:
import os
import typing as tp
import pandas as pd
import sentence_transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_id: str = "sentence-transformers/all-MiniLM-L6-v2"
dataset_path: str = "/Users/chrnegor/Documents/study/UrTraining/ml/scripts/data/dataset.csv"

In [3]:
df = pd.read_csv(dataset_path)
train_df = df.sample(frac=0.8)
test_df = df.drop(train_df.index)

In [4]:
training_data = []

for idx, row in train_df.iterrows():
    query: str = row['user_profile_w_meta']
    positive: str = row['course_description']
    negatives: tp.List[str] = []
    while len(negatives) < 4:
        negative_item = df.sample(1)
        if abs(int(negative_item['course_id']) - int(row['course_id'])) > 3:
            negatives.append(negative_item['course_description'])
    
    for negative in negatives:
        training_data.append({
            'query': query,
            'positive': positive,
            'negative': negative
        })

  if abs(int(negative_item['course_id']) - int(row['course_id'])) > 3:


In [5]:
training_data = pd.DataFrame(training_data, columns=['query', 'positive', 'negative'])
training_data.to_csv("data/training_data.csv", index=False)

---

#### Training

In [6]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

model_id: str = "sentence-transformers/all-MiniLM-L6-v2"
dataset_path: str = "data/training_data.csv"

df = pd.read_csv(dataset_path)

train_examples = [
    InputExample(texts=[row['query'], row['positive'], row['negative']])
    for _, row in df.iterrows()
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
model = SentenceTransformer('all-MiniLM-L6-v2')
train_loss = losses.TripletLoss(model=model)

---
#### Metrics before training 

In [7]:
import json

def prepare_documents(raw_docs: tp.List[tp.Dict[str, tp.Any]]) -> tp.List[str]:
    docs: tp.List[str] = []
    for doc in raw_docs:
        formatted_doc = "\n".join([f"{k}: {str(v)}" for k, v in doc.items()])
        docs.append(formatted_doc)
    return docs

course_descriptions: tp.List[tp.Dict[str, tp.Any]] = json.load(
    open("data/200_sport_programs.json")
)

documents = prepare_documents(course_descriptions)

model = SentenceTransformer(model_id)
docs_embeddings = model.encode(documents)
query_embeddings = model.encode(test_df['user_profile_w_meta'].tolist())

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_metrics(query_embeddings, docs_embeddings, test_df, k=10):
    """Calculate hit rate, recall, and precision at k"""
    similarities = cosine_similarity(query_embeddings, docs_embeddings)
    
    hit_rate = 0
    total_recall = 0
    total_precision = 0
    
    for i, (_, row) in enumerate(test_df.iterrows()):
        query_similarities = similarities[i]
        top_k_indices = np.argsort(query_similarities)[::-1][:k]
        positive_course_id = int(row['course_id'])
        
        if positive_course_id in top_k_indices:
            hit_rate += 1
            
        relevant_in_topk = 1 if positive_course_id in top_k_indices else 0
        total_relevant = 1 
        
        recall = relevant_in_topk / total_relevant
        precision = relevant_in_topk / k
        
        total_recall += recall
        total_precision += precision
    
    num_queries = len(test_df)
    hit_rate = hit_rate / num_queries
    avg_recall = total_recall / num_queries
    avg_precision = total_precision / num_queries
    
    return hit_rate, avg_recall, avg_precision

hit_rate_before, recall_before, precision_before = calculate_metrics(
    query_embeddings, docs_embeddings, test_df
)

print(f"Metrics before training:")
print(f"Hit Rate@10: {hit_rate_before:.4f}")
print(f"Recall@10: {recall_before:.4f}")
print(f"Precision@10: {precision_before:.4f}")

Metrics before training:
Hit Rate@10: 0.0000
Recall@10: 0.0000
Precision@10: 0.0000


In [9]:
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=25,
    warmup_steps=10,
    show_progress_bar=True,
)

trained_model = model



Step,Training Loss
500,3.0768


---

### Model after training

In [10]:
docs_embeddings = trained_model.encode(documents)
query_embeddings = trained_model.encode(test_df['user_profile_w_meta'].tolist())

In [11]:
hit_rate_after, recall_after, precision_after = calculate_metrics(
    query_embeddings, docs_embeddings, test_df
)

print(f"Metrics after training:")
print(f"Hit Rate@10: {hit_rate_after:.4f}")
print(f"Recall@10: {recall_after:.4f}")
print(f"Precision@10: {precision_after:.4f}")

Metrics after training:
Hit Rate@10: 0.0870
Recall@10: 0.0870
Precision@10: 0.0087


In [12]:
hit_rate_improvement = hit_rate_after - hit_rate_before
recall_improvement = recall_after - recall_before
precision_improvement = precision_after - precision_before

print(f"\nMetrics Improvements:")
print(f"Hit Rate@10 improved by: {hit_rate_improvement:.4f}")
print(f"Recall@10 improved by: {recall_improvement:.4f}")
print(f"Precision@10 improved by: {precision_improvement:.4f}")


Metrics Improvements:
Hit Rate@10 improved by: 0.0870
Recall@10 improved by: 0.0870
Precision@10 improved by: 0.0087
