In [None]:
import os
import sentence_transformers
# Directory the notebook works from while reading the procedure batches
path = "/home/alex/ews/procedures/unzipped_procedures"

# Switch the kernel's working directory, then echo it back as a sanity check
os.chdir(path)
print("Current working directory: ", os.getcwd())

In [2]:
# What I want to do is take the column of aggregated procedures and turn it into a list

import os
from pathlib import Path
from model2vec import StaticModel
from sentence_transformers import SentenceTransformer
import polars as pl
import torch

# Directory that contains the processed batch files.
# NOTE(review): hard-coded absolute path — consider moving it to a config cell.
output_dir = "/home/alex/ews/procedures/unzipped_procedures"

# Collect every processed batch file. Path.glob yields entries in whatever
# order the filesystem returns them, so sort the result to make the batch
# order (and the row order of everything built from it) reproducible.
processed_files = sorted(Path(output_dir).glob("processed_batch_*.parquet"))

In [None]:
# Show which batch files the glob picked up
processed_files

In [4]:
from tqdm import tqdm
from datetime import datetime
import numpy as np

In [7]:
# Sentence-transformer that extracts the embeddings.
# Author's note: this step is not the important part here, and the embeddings
# extracted with it are not meant to be used.
model = SentenceTransformer(
    "tomaarsen/static-similarity-mrl-multilingual-v1",
    truncate_dim=30,
    device="cuda",
)

In [None]:
import polars as pl
import numpy as np
from datetime import datetime
from tqdm import tqdm

def process_batches(processed_files, model, verbose_every=50, device="cuda"):
    """Embed the procedures of every batch file and collect them in one Polars DataFrame.

    Each parquet file is read, its ``Aggregated_Procedures`` column is passed to
    ``model.encode`` (with ``normalize_embeddings=True``), and the per-batch
    results are stacked into a single frame with one ``embedding_<i>`` column
    per embedding dimension.

    Args:
        processed_files: Parquet files to process. Each file must contain the
            columns ``PT_ID``, ``CSN`` and ``Aggregated_Procedures``.
        model: Embedding model exposing
            ``encode(texts, show_progress_bar=..., device=..., normalize_embeddings=...)``.
        verbose_every: Print detailed info every N batches (default: 50).
            ``0`` or ``None`` disables the periodic report.
        device: Device forwarded to ``model.encode`` (default: ``"cuda"``,
            the value that was previously hard-coded).

    Returns:
        A Polars DataFrame with the columns ``PT_ID``, ``CSN``,
        ``Aggregated_Procedures`` and ``embedding_0`` ... ``embedding_{d-1}``.

    Raises:
        ValueError: If ``processed_files`` is empty, or if none of the files
            contains any rows (there is nothing to embed in either case).
    """
    # Materialise once so generators are accepted and len() is always available.
    processed_files = list(processed_files)
    total_batches = len(processed_files)
    if total_batches == 0:
        # Fail early with a clear message instead of an opaque np.concatenate error.
        raise ValueError("processed_files is empty: no batch files to process")

    all_pt_ids = []
    all_csn = []
    all_aggregated_procedures = []
    all_embeddings = []

    start_time = datetime.now()

    # Process each batch file with progress bar
    for idx, batch_file in enumerate(tqdm(processed_files, desc="Processing batches")):
        # Read only the three columns that are needed
        df = pl.read_parquet(batch_file).select(["PT_ID", "CSN", "Aggregated_Procedures"])

        # Materialise the procedures once; the same list feeds the model and the output
        procedures_list = df["Aggregated_Procedures"].to_list()

        # A batch without rows contributes nothing, so the model is never asked
        # to encode an empty list.
        if procedures_list:
            all_embeddings.append(
                model.encode(
                    procedures_list,
                    show_progress_bar=False,
                    device=device,
                    normalize_embeddings=True,
                )
            )
            all_pt_ids.extend(df["PT_ID"].to_list())
            all_csn.extend(df["CSN"].to_list())
            all_aggregated_procedures.extend(procedures_list)

        # Print verbose information every N batches (skipped when verbose_every is 0/None)
        if verbose_every and (idx + 1) % verbose_every == 0:
            current_time = datetime.now()
            elapsed_time = current_time - start_time
            avg_time_per_batch = elapsed_time / (idx + 1)
            remaining_batches = total_batches - (idx + 1)
            estimated_remaining_time = remaining_batches * avg_time_per_batch

            print(f"\n{'='*80}")
            print(f"Batch Progress Report ({idx + 1}/{total_batches}):")
            print(f"Current batch size: {len(procedures_list)} procedures")
            print(f"Total PT_IDs processed: {len(all_pt_ids)}")
            print(f"Memory usage of embeddings: {sum(e.nbytes for e in all_embeddings)/1e9:.2f} GB")
            print(f"Time elapsed: {elapsed_time}")
            print(f"Estimated time remaining: {estimated_remaining_time}")
            print(f"Average time per batch: {avg_time_per_batch.total_seconds():.2f} seconds")
            print(f"{'='*80}\n")

        # Drop the per-batch frame and list before the next file is read
        del df, procedures_list

    if not all_embeddings:
        raise ValueError("No rows found in any batch file: nothing to embed")

    # Combine all embeddings into one (n_rows, n_dims) array
    final_embeddings = np.concatenate(all_embeddings, axis=0)

    # Create embedding column names, one per embedding dimension
    embedding_cols = [f"embedding_{i}" for i in range(final_embeddings.shape[1])]

    # Create Polars DataFrame: identifier/text columns first, then one column per dimension
    embeddings_df = pl.DataFrame(
        {
            "PT_ID": all_pt_ids,
            "CSN": all_csn,
            "Aggregated_Procedures": all_aggregated_procedures,
            **{col: final_embeddings[:, i] for i, col in enumerate(embedding_cols)},
        }
    )

    # Print final summary
    total_time = datetime.now() - start_time
    print(f"\n{'='*80}")
    print("Final Processing Summary:")
    print(f"Total batches processed: {total_batches}")
    print(f"Total PT_IDs processed: {len(all_pt_ids)}")
    print(f"Final DataFrame shape: {embeddings_df.shape}")
    print(f"Total processing time: {total_time}")
    print(f"Average time per batch: {total_time.total_seconds()/total_batches:.2f} seconds")
    print(f"{'='*80}\n")

    return embeddings_df

# Run the embedding pipeline over the discovered batch files
embeddings_df = process_batches(
    processed_files=processed_files,
    model=model,
    verbose_every=50,
)

In [None]:
# Spot-check: display the rows whose PT_ID equals "Z1000"
embeddings_df.filter(pl.col("PT_ID") == "Z1000")

In [None]:
# Switch the working directory to the procedures folder; relative paths used
# after this point resolve against it
path = "/home/alex/ews/procedures"

os.chdir(path)
print("Current working directory: ", os.getcwd())

In [11]:
# Persist the embeddings; the relative path resolves against the current working directory
embeddings_df.write_parquet("embed_procedures.parquet")

In [12]:
# Now let's modify the procedures embeddings.
# Lazily scan the parquet file with the embeddings (scan_parquet builds a LazyFrame).
# NOTE(review): `embed_procedures` is reassigned further down from the in-memory
# `embeddings_df`, so this lazy frame is never actually consumed — confirm intent.

embed_procedures = pl.scan_parquet("embed_procedures.parquet")

In [11]:
# Let's define a function first: 

def rename_embedding_columns(
    df: "pl.DataFrame | pl.LazyFrame",
    n_keep: int = 3,
) -> "pl.DataFrame | pl.LazyFrame":
    """Rename every column after the first ``n_keep`` to ``proc_embed_<i>``.

    The leading ``n_keep`` columns keep their names; the remaining columns are
    renamed, in order, to ``proc_embed_0``, ``proc_embed_1``, ...

    Args:
        df: Frame whose trailing columns should be renamed. Eager and lazy
            frames are both accepted; the result is whatever ``df.rename``
            returns.
        n_keep: Number of leading columns to leave untouched (default: 3).

    Returns:
        The frame returned by ``df.rename`` with the trailing columns renamed.

    Raises:
        ValueError: If ``n_keep`` is negative.
    """
    if n_keep < 0:
        raise ValueError("n_keep must be non-negative")

    # Reading .columns on a LazyFrame has to resolve the schema (and warns in
    # recent Polars releases); prefer collect_schema() when the frame offers it.
    if hasattr(df, "collect_schema"):
        columns = list(df.collect_schema().names())
    else:
        columns = list(df.columns)

    # Map only the columns that actually change; the kept ones need no entry.
    rename_dict = {
        old_name: f"proc_embed_{i}"
        for i, old_name in enumerate(columns[n_keep:])
    }
    return df.rename(rename_dict)

# NOTE(review): this passes the in-memory `embeddings_df` rather than the lazily
# scanned `embed_procedures`, and rebinds `embed_procedures` to the renamed result.
embed_procedures = rename_embedding_columns(embeddings_df)


In [15]:
# Save the frame with the renamed embedding columns (path is relative to the current working directory)
embed_procedures.write_parquet("embed_procedures_renamed_aggr.parquet")