In [1]:
# Load the necessary libraries

import polars as pl
import numpy as np
import torch
import os
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

In [2]:
# Load the prepared dataset (without embedding features) from a parquet file
# in the current working directory.
data_final = pl.read_parquet("df_without_embeddings.parquet")

In [None]:
# Preview the first rows to sanity-check the loaded columns.
data_final.head()

In [4]:
# Build one Danish free-text sentence per row that combines the aggregated
# procedures and the aggregated diagnoses. Null values in either column are
# replaced with the literal "Ingen" before concatenation.
aggregated_information = pl.concat_str(
    [
        pl.lit("Tidligere medicinske procedurer: "),
        pl.col("Aggregated_Procedures").fill_null("Ingen"),
        pl.lit(" Tidligere diagnoser: "),
        pl.col("Aggregated_Diagnoses").fill_null("Ingen"),
    ],
    separator="",
).alias("Aggregated_Information")

# Add the combined sentence, then drop the two source columns
# (Aggregated_Procedures and Aggregated_Diagnoses), which it replaces.
# NOTE: re-running this cell without reloading the data fails, because the
# source columns are gone after the first run.
data_final = (
    data_final
    .with_columns(aggregated_information)
    .drop(["Aggregated_Procedures", "Aggregated_Diagnoses"])
)

In [None]:
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Wrap a text in the instruction prompt layout used for the embedding model.

    Args:
        task_description: One-sentence description of the embedding task.
        query: The text that should be embedded.

    Returns:
        A two-line string: ``Instruct: <task_description>`` followed by
        ``Query: <query>``, separated by a single newline.
    """
    prompt_template = 'Instruct: {}\nQuery: {}'
    return prompt_template.format(task_description, query)

# Pull the aggregated free-text column out as a plain Python list of strings
embeddings_data = data_final.get_column("Aggregated_Information").to_list()

# Prefer the GPU, but fall back to the CPU so the cell still runs (slowly)
# on machines without CUDA instead of crashing at model load time.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Specify the E5 multilingual instruct model
model = SentenceTransformer("intfloat/multilingual-e5-large-instruct", device=device)

# Task description (in Danish) that is prepended to every text as the instruction
task_description = "Givet en patients medicinske forløb med hospitalsbehandlinger og procedurer, repræsenter den til at finde lignende patientforløb"

# Wrap every text in the "Instruct: ... / Query: ..." prompt format
instructed_texts = [get_detailed_instruct(task_description, text) for text in embeddings_data]

# Encode all instructed texts; normalized embeddings are requested so the
# vectors are directly comparable. The result is indexed as
# (row, embedding dimension) below.
embeddings = model.encode(instructed_texts, show_progress_bar=True, normalize_embeddings=True)

# One polars Series per embedding dimension, named embed_0, embed_1, ...
embedding_series = [
    pl.Series(f"embed_{i}", embeddings[:, i])
    for i in range(embeddings.shape[1])
]

# Stand-alone DataFrame holding only the embedding columns
# (kept for inspection; it is not used when extending data_final below)
embeddings_df = pl.DataFrame(embedding_series)

# Append the embedding columns to the main dataset
data_final = data_final.with_columns(embedding_series)

In [None]:
# Number of principal components to keep. Defined once so the PCA model and
# the generated column names cannot drift apart.
N_PCA_COMPONENTS = 60

# Fixed seed: for an input of this size scikit-learn may select the randomized
# SVD solver, in which case results differ between runs unless random_state
# is set.
PCA_RANDOM_STATE = 42

# Names of the embedding columns that were added to data_final
embedding_cols = [f"embed_{i}" for i in range(embeddings.shape[1])]

# Extract the embedding columns as a numpy array for PCA
embeddings_array = data_final.select(embedding_cols).to_numpy()

# Fit PCA on the embeddings and project them onto the principal components
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=PCA_RANDOM_STATE)
pca_result = pca.fit_transform(embeddings_array)

# Derive the column names from the actual result width (pca_0, pca_1, ...)
pca_cols = [f"pca_{i}" for i in range(pca_result.shape[1])]

# One polars Series per PCA component
pca_series = [pl.Series(col, pca_result[:, i]) for i, col in enumerate(pca_cols)]

# Replace the raw embedding columns with the PCA components
data_final = data_final.drop(embedding_cols).with_columns(pca_series)

In [None]:
# Share of the total variance captured by the fitted PCA components.
# The author recorded 76.38% for a previous run; re-check after re-running.
total_variance_explained = pca.explained_variance_ratio_.sum()
variance_percent = total_variance_explained * 100
print(
    f"Total variance explained by 60 components: "
    f"{total_variance_explained:.4f} ({variance_percent:.2f}%)"
)

In [15]:
# Persist the dataset, now extended with the PCA-reduced embedding features,
# to a parquet file in the current working directory.
data_final.write_parquet("df_with_embeddings_e5.parquet")

In [None]:
# List the final column names to verify the saved schema.
data_final.columns