In [1]:
import pandas as pd
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.cluster import HDBSCAN
import time
from tqdm import tqdm
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import calinski_harabasz_score,silhouette_score
from sklearn.decomposition import PCA
from umap import UMAP
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the precomputed embeddings from the ``data`` directory that sits next to
# the current working directory (i.e. under the parent of the cwd).
questions = np.load(Path.cwd().parent / 'data' / "question_embeddings.npy")
tags = pd.read_parquet(Path.cwd().parent / 'data' / "tag_embeddings.parquet")

In [5]:
# Display the tag-embedding DataFrame read from the parquet file.
tags

Unnamed: 0_level_0,embedding
tag,Unnamed: 1_level_1
c#,"[0.015589412, 0.0028190461, -0.02159284, -0.00..."
entity-framework,"[0.013604954, -0.017427877, 0.008887872, -0.02..."
ntp,"[0.0025991949, 0.020412365, -0.025822664, -0.0..."
python,"[0.0044617336, 0.017571222, -0.028520588, -0.0..."
pandas,"[0.013557544, 0.023968168, -0.04845796, -0.009..."
...,...
gocardless,"[0.031818543, 0.0014768182, 0.0064690523, -0.0..."
daxstudio,"[0.01692661, -0.008786624, -0.008296905, -0.02..."
woocommerce-email,"[0.032131393, -0.0161308, 0.029701, 0.01068102..."
automapper-6,"[0.01038144, 0.00035326037, 0.0029597834, -0.0..."


In [None]:
class DualEncoderMatcher:
    """Rank tags for questions by cosine similarity between their embeddings.

    Both embedding matrices are L2-normalised once at construction time, so the
    dot product between a question row and a tag row is their cosine
    similarity.

    Parameters
    ----------
    question_embeddings : 2-D array
        One embedding per row; rows are addressed by position in
        ``predict_top_k`` and ``batch_predict``.
    tag_embeddings : 2-D array
        One embedding per row, with the same number of columns as
        ``question_embeddings`` (required by the matrix product).
    tag_names : sequence
        Label for each row of ``tag_embeddings``, looked up by integer
        position.
    """

    def __init__(self, question_embeddings, tag_embeddings, tag_names):
        # Keep the raw inputs around alongside the normalised copies.
        self.question_embeddings = question_embeddings
        self.tag_embeddings = tag_embeddings
        self.tag_names = tag_names

        self.question_embeddings_norm = self._normalize(question_embeddings)
        self.tag_embeddings_norm = self._normalize(tag_embeddings)

    def _normalize(self, embeddings):
        """Scale every row to (approximately) unit L2 norm.

        The small constant added to the norm prevents a division by zero for
        all-zero rows.
        """
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        return embeddings / (norms + 1e-8)

    def _resolve_k(self, k):
        """Validate ``k`` and cap it at the number of available tags.

        Raises
        ------
        ValueError
            If ``k`` is negative.
        """
        k = int(k)
        if k < 0:
            raise ValueError(f"k must be non-negative, got {k}")
        return min(k, self.tag_embeddings_norm.shape[0])

    def predict_top_k(self, question_idx, k=5):
        """Return the ``k`` most similar tags for a single question.

        Parameters
        ----------
        question_idx : int
            Row position of the question in ``question_embeddings``.
        k : int, default 5
            Number of tags to return. Values larger than the number of tags
            are capped; ``k == 0`` yields an empty list.

        Returns
        -------
        list of (tag_name, score)
            Ordered from highest to lowest similarity.
        """
        k = self._resolve_k(k)
        if k == 0:
            # ``[-0:]`` is the full slice and would return *every* tag, so the
            # empty request has to be handled explicitly.
            return []

        similarities = self.question_embeddings_norm[question_idx] @ self.tag_embeddings_norm.T

        # argsort is ascending: take the last k positions, then reverse them.
        top_k_indices = np.argsort(similarities)[-k:][::-1]
        return [(self.tag_names[i], similarities[i]) for i in top_k_indices]

    def batch_predict(self, question_indices, k=5):
        """Return the ``k`` most similar tags for several questions at once.

        Parameters
        ----------
        question_indices : sequence of int (or any NumPy row indexer)
            Selects the question rows to score.
        k : int, default 5
            Number of tags per question; handled as in ``predict_top_k``.

        Returns
        -------
        list of list of (tag_name, score)
            One inner list per selected question, each ordered from highest to
            lowest similarity.
        """
        k = self._resolve_k(k)
        selected = self.question_embeddings_norm[question_indices]
        if k == 0:
            # Same ``[-0:]`` pitfall as in predict_top_k: return one empty
            # result per selected question instead of all tags.
            return [[] for _ in range(selected.shape[0])]

        similarities = selected @ self.tag_embeddings_norm.T
        top_k_indices = np.argsort(similarities, axis=1)[:, -k:][:, ::-1]

        # Walk the result rows rather than the indexer itself, so the output
        # always lines up with the rows that were actually selected.
        return [
            [(self.tag_names[j], row_scores[j]) for j in row_top]
            for row_scores, row_top in zip(similarities, top_k_indices)
        ]


In [22]:
# Preview the first 10,000 question embeddings.
questions[:10_000]

array([[ 0.00135186, -0.01567405, -0.00407692, ..., -0.00051256,
        -0.01407612,  0.03170738],
       [ 0.00947397, -0.01458311, -0.01913991, ...,  0.01237694,
         0.0026381 ,  0.00367426],
       [-0.00196639, -0.02462533, -0.02752738, ..., -0.02915011,
        -0.01131252,  0.00531276],
       ...,
       [ 0.00475327, -0.03546025, -0.02484306, ..., -0.00861502,
        -0.05014591,  0.0091939 ],
       [-0.01815439, -0.00208452, -0.01850394, ..., -0.00084887,
         0.00824582,  0.00753479],
       [ 0.03789256, -0.01389924, -0.02730731, ..., -0.01784343,
        -0.00101591, -0.00454621]], shape=(10000, 4096))

In [34]:
# Stack the per-tag vectors stored in the `embedding` column into a single
# matrix (one row per tag) so it can be used in matrix products.
t_stacked = np.stack(tags.embedding.values)
t_stacked.shape

(22753, 4096)

In [35]:
# Row-wise L2 norms of the stacked tag embeddings (one norm per tag).
np.linalg.norm(t_stacked, axis=1, keepdims=True)

array([[0.99999972],
       [1.00000007],
       [0.99999997],
       ...,
       [0.99999902],
       [1.00000045],
       [1.00000032]], shape=(22753, 1))

In [26]:
# Shape of the slice holding the first 10,000 question embeddings.
questions[:10_000].shape

(10000, 4096)

In [24]:
# Row-wise L2 norms of the first 10,000 question embeddings.
np.linalg.norm(questions[:10_000], axis=1, keepdims=True)


array([[1.00000024],
       [0.99999981],
       [1.00000021],
       ...,
       [1.00000067],
       [1.00000047],
       [0.99999971]], shape=(10000, 1))

In [36]:
# Build the matcher from the first 10,000 question embeddings, the stacked tag
# matrix, and the tag names taken from the DataFrame index.
matcher = DualEncoderMatcher(questions[:10_000], t_stacked, tags.index)

In [37]:
# Top-5 (tag name, similarity score) pairs for the question at position 0.
predictions = matcher.predict_top_k(question_idx=0, k=5)
print(f"Top 5 predictions: {predictions}")


Top 5 predictions: [('entity-framework', np.float64(0.5814004526467879)), ('linq-to-entities', np.float64(0.5646984342057525)), ('entity-framework-6', np.float64(0.5354078775941149)), ('entity-framework-5', np.float64(0.5260456250104916)), ('entity-framework-4.1', np.float64(0.5250464008985329))]


In [38]:

# Score a batch of questions in one call; the result holds one list of
# (tag name, similarity score) pairs per requested question.
batch_predictions = matcher.batch_predict(
    question_indices=range(1000),  # First 1000 questions
    k=5
)

In [40]:
# Inspect the batch result for the first requested question.
batch_predictions[0]

[('entity-framework', np.float64(0.5814004526467882)),
 ('linq-to-entities', np.float64(0.5646984342057526)),
 ('entity-framework-6', np.float64(0.535407877594115)),
 ('entity-framework-5', np.float64(0.5260456250104918)),
 ('entity-framework-4.1', np.float64(0.5250464008985329))]

In [41]:
# Read the raw dataset from the ``data`` directory under the parent of the
# current working directory.
original_data = pd.read_csv(Path.cwd().parent / 'data' / 'stackexchange_dataset.csv')

In [44]:
# Show the `question_text` value of the first row of the raw dataset.
original_data.iloc[0]['question_text']

'I am looking for a better way to use the .Include clause of Entity Framework. I want to avoid duplicate code. I have a lot of classes, and every class has a method which looks something like this: query = (from n in currentDBContext.FBBuchungenCollection .Include(x => x.BelegHerkunft) .Include(x => x.Buchungsordner) .Include(x => x.Buchungsperiode).ThenInclude(x => x.Geschaeftsjahr) .Include(x => x.BuchungsUser) .Include(x => x.Erfassungsart) .Include(x => x.ErstellUser) .Include(x => x.Mandant).ThenInclude(x => x.HauptAdresse) .Include(x => x.StornoUser) .Include(x => x.Teilbuchungen).ThenInclude(x => x.FremdWaehrung) .Include(x => x.Teilbuchungen).ThenInclude(x => x.KKArt) .Include(x => x.Teilbuchungen).ThenInclude(x => x.KKKonto) .Include(x => x.Teilbuchungen).ThenInclude(x => x.Konto) .Include(x => x.Teilbuchungen).ThenInclude(x => x.KoReVerteilung).ThenInclude(x => x.Periodenverteilungen).ThenInclude(x => x.Kontierungen).ThenInclude(x => x.Kontierungsangaben).ThenInclude(x => x.K