In [None]:
# Load the necessary libraries

import polars as pl
import numpy as np
import torch
import os
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

In [None]:
# Build one free-text sentence per row from the procedure and diagnosis history.

os.chdir("/home/alex/ews/NEWS2_Evaluation")

data_final = pl.read_parquet("data_final.parquet")

# Sentence fragments in output order; a null history is rendered as "Ingen".
sentence_parts = [
    pl.lit("Tidligere medicinske procedurer: "),
    pl.col("Aggregated_Procedures").fill_null("Ingen"),
    pl.lit(" Tidligere diagnoser: "),
    pl.col("Previous_Diagnoses").fill_null("Ingen"),
]
aggregated_information = pl.concat_str(sentence_parts, separator="").alias(
    "Aggregated_Information"
)

# Attach the sentence column, then discard the two source columns it was built from.
data_final = (
    data_final
    .with_columns(aggregated_information)
    .drop(["Aggregated_Procedures", "Previous_Diagnoses"])
)

In [None]:
# Embed every "Aggregated_Information" sentence and attach one column per
# embedding dimension (embed_0, embed_1, ...) to data_final.

# Sentences as a plain Python list, in row order
embeddings_data = data_final.select(["Aggregated_Information"]).to_series().to_list()

# Use the GPU when one is available, otherwise fall back to the CPU so the
# cell still runs on machines without CUDA.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

# Specify the model2vec multilingual model
model = SentenceTransformer("minishlab/potion-multilingual-128M", device=embedding_device)

# Extract the embeddings; the result is indexed below as [row, dimension]
embeddings = model.encode(embeddings_data, show_progress_bar=True)

# One Series per embedding dimension, named embed_<dimension index>
embedding_series = [
    pl.Series(f"embed_{i}", embeddings[:, i])
    for i in range(embeddings.shape[1])
]

# Stand-alone DataFrame holding only the embedding columns
embeddings_df = pl.DataFrame(embedding_series)

# Add the embedding columns to our data
data_final = data_final.with_columns(embedding_series)

In [None]:
# Reduce the embedding columns to a fixed number of principal components,
# replace the embed_* columns in data_final with pca_* columns, and write both
# the full table and the PCA-only table to parquet.

# Number of principal components to keep (single source of truth)
N_PCA_COMPONENTS = 60
# Fixed seed so the output files are reproducible if scikit-learn selects a
# randomized SVD solver for this data size.
PCA_RANDOM_STATE = 0

# Names of the embedding columns added to data_final in the previous cell
embedding_cols = [f"embed_{i}" for i in range(embeddings.shape[1])]

# Extract the embedding columns as a numpy array for PCA
embeddings_array = data_final.select(embedding_cols).to_numpy()

# Initialize and fit PCA
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=PCA_RANDOM_STATE)
pca_result = pca.fit_transform(embeddings_array)

# Column names follow the actual width of the PCA output
pca_cols = [f"pca_{i}" for i in range(pca_result.shape[1])]

# One Series per PCA component
pca_series = [pl.Series(col, pca_result[:, i]) for i, col in enumerate(pca_cols)]

# Drop the original embedding columns and add PCA components
data_final = data_final.drop(embedding_cols).with_columns(pca_series)

data_final.write_parquet("finalized_with_embeddings.parquet")

# Also save the PCA components on their own to a parquet file
pca_df = pl.DataFrame(pca_series)

pca_df.write_parquet("pca_only.parquet")