In [1]:
# Load the packages

import polars as pl
import os
from model2vec import StaticModel   
import torch
import numpy as np
from sklearn.decomposition import PCA

In [2]:
# Read the diagnoses table from disk.
diagn_embed = pl.read_parquet("embed_diagnoses_updated_prevs.parquet")

# Remove every column whose name begins with "diagn_embed" (presumably
# previously computed embedding dimensions -- confirm); all other columns are
# kept in their original order.
diagn_prefixed_columns = [
    name for name in diagn_embed.columns if name.startswith("diagn_embed")
]
diagn_embed = diagn_embed.drop(diagn_prefixed_columns)

In [3]:
# Read the procedures table from disk.
procs_embed = pl.read_parquet("embed_procedures_renamed_aggr.parquet")

# Remove every column whose name begins with "proc_embed" (presumably
# previously computed embedding dimensions -- confirm); all other columns are
# kept in their original order.
proc_prefixed_columns = [
    name for name in procs_embed.columns if name.startswith("proc_embed")
]
procs_embed = procs_embed.drop(proc_prefixed_columns)

In [4]:
# Read the remaining metadata table (df_august.parquet); the procedures and
# diagnoses tables are joined onto this frame in later cells.
data = pl.read_parquet(source="df_august.parquet")

In [5]:
# Attach the procedures columns to the metadata, matching rows on the
# (PT_ID, CSN) pair. A left join keeps every metadata row; rows without a
# match get nulls in the procedures columns.
data_final = data.join(
    other=procs_embed,
    on=["PT_ID", "CSN"],
    how="left",
)

In [6]:
# Replace missing Aggregated_Procedures values (e.g. rows that found no match
# in the left join on procedures) with the literal string "NA".
# `fill_null` is the built-in equivalent of the when/is_null/then/otherwise
# pattern; the expression keeps the column name, so no alias is needed.
data_final = data_final.with_columns(
    pl.col("Aggregated_Procedures").fill_null("NA")
)

In [None]:
# Attach the diagnoses columns as well, again matching on (PT_ID, CSN) with a
# left join so that no row of the running frame is dropped.
data_final = data_final.join(
    other=diagn_embed,
    on=["PT_ID", "CSN"],
    how="left",
)

# Preview the first rows of the joined frame.
data_final.head()

In [8]:
# Build one free-text description per row that concatenates the procedures,
# the previous diagnoses and every biomarker value. This string is what gets
# embedded later, so the exact wording and punctuation below must not change.

# Biomarker columns, in the order they appear in the text. Each column name is
# also used verbatim as the label in the generated string.
BIOMARKER_COLUMNS = [
    "Hemoglobin",
    "Leukocytes",
    "Trombocytes",
    "Kreatinin",
    "ALAT",
    "LDH",
    "Albumin",
    "CRP",
    "Laktak_ab",
    "Troponin",
    "Laktat_vb",
]

# Fixed leading part: procedures followed by diagnoses; nulls become "NA".
text_parts = [
    pl.lit("Previous medical procedures: "),
    pl.col("Aggregated_Procedures").fill_null("NA"),
    pl.lit(" Previous diagnoses: "),
    pl.col("Previous_Diagnoses").fill_null("NA"),
]

# One "<label>: <value>" pair per biomarker. The first pair opens the
# biomarker section; the following pairs are comma-separated.
for position, biomarker in enumerate(BIOMARKER_COLUMNS):
    prefix = ". Biomarker values: " if position == 0 else ", "
    text_parts.append(pl.lit(f"{prefix}{biomarker}: "))
    # Cast to string before filling so a missing measurement is rendered "NA".
    text_parts.append(pl.col(biomarker).cast(pl.String).fill_null("NA"))

data_final = data_final.with_columns(
    pl.concat_str(text_parts, separator="").alias("Aggregated_Information")
)

# The two source text columns are now folded into Aggregated_Information,
# so drop them.
data_final = data_final.drop(["Aggregated_Procedures", "Previous_Diagnoses"])

In [14]:
# Load the distilled static version of Jina-embeddings-v3.
model = StaticModel.from_pretrained("hs-hf/jina-embeddings-v3-distilled")

# Pull the aggregated text column out as a plain Python list of strings.
embeddings_data = data_final.get_column("Aggregated_Information").to_list()

# Encode every text into an embedding vector (one row per input string).
# NOTE(review): confirm that StaticModel.encode actually honours `device`;
# it may be accepted only for API compatibility and have no effect.
embeddings = model.encode(embeddings_data, device="cuda")

In [16]:
# Show (n_rows, n_dimensions) of the embedding matrix as a sanity check.
print(embeddings.shape)

# Turn each embedding dimension (one matrix column) into its own Series named
# embed_0, embed_1, ... so that it can be stored as a DataFrame column.
embedding_series = [
    pl.Series(f"embed_{dim}", embeddings[:, dim])
    for dim in range(embeddings.shape[1])
]

# Attach all embedding columns in a single with_columns call.
data_final = data_final.with_columns(embedding_series)

(1708336, 512)


In [19]:
# Standardise every embedding column to zero mean and unit standard deviation.
embedding_cols = [f"embed_{dim}" for dim in range(embeddings.shape[1])]

# NOTE(review): a column with zero standard deviation would divide by zero
# here -- confirm no embedding dimension is constant.
zscore_exprs = []
for column_name in embedding_cols:
    centered = pl.col(column_name) - pl.col(column_name).mean()
    zscore_exprs.append((centered / pl.col(column_name).std()).alias(column_name))

# Overwrite the columns in place (same names) with their standardised values.
data_final = data_final.with_columns(zscore_exprs)

In [21]:
# Reduce the standardised embedding columns to a small number of principal
# components and replace the embedding columns with those components.

# Number of principal components to keep. Defined once so that the PCA
# configuration and the generated column names cannot drift apart.
N_PCA_COMPONENTS = 30
# Fixed seed: depending on the SVD solver scikit-learn selects, PCA can be
# randomized, so pin the seed to make the components reproducible across runs.
PCA_RANDOM_STATE = 42

# Names of the embedding columns currently stored in the frame.
embedding_cols = [f"embed_{i}" for i in range(embeddings.shape[1])]

# Extract the embedding columns as a 2-D numpy array (rows x dimensions).
embeddings_array = data_final.select(embedding_cols).to_numpy()

# Fit PCA and project every row onto the retained components.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=PCA_RANDOM_STATE)
pca_result = pca.fit_transform(embeddings_array)

# Name the components pca_0, pca_1, ...; the count is taken from the actual
# result width rather than repeating the constant.
pca_cols = [f"pca_{i}" for i in range(pca_result.shape[1])]

# One Series per principal component.
pca_series = [pl.Series(col, pca_result[:, i]) for i, col in enumerate(pca_cols)]

# Drop the original embedding columns and add the PCA components instead.
data_final = data_final.drop(embedding_cols).with_columns(pca_series)


In [25]:
# Persist the final frame (metadata, aggregated text and PCA components) as a
# parquet file in the working directory.
data_final.write_parquet(file="embeddings_df_weights.parquet")