# Minimal workflow

- Make sure [Ollama](https://ollama.com/) is installed and running before executing this notebook.

In [1]:
# Standard library
import sys
import os
from os import path,listdir
# Third-party
import pandas as pd

# Let pandas display up to 500 columns before truncating wide DataFrames.
pd.options.display.max_columns = 500
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation and dtype warnings raised by the pipeline steps below.
warnings.filterwarnings('ignore')

# Pipeline step modules from the `scripts` package; each one is invoked through
# its `main(...)` function in the cells below.
import scripts
from scripts import translate_chop,clean_translations,embed_chop,parse_ops,translate_ops,embed_ops
from scripts import map_chop_ops

In [2]:
# --- CHOP: source file and the artefacts produced from it -------------------
input_chop = "data/chop.min.csv"  # minimal file
output_chop = "outputs/chop.min.csv"
output_chop_clean = "outputs/chop.min.clean.csv"

output_embedding = "outputs/embeddings_chop.min.parquet"
output_corpus = "outputs/corpus_chop.min.txt"
# Passed to both embed_chop.main and embed_ops.main further down.
persist_dir = "outputs/db"

# --- OPS: raw inputs and the parsed table ------------------------------------
input_ops_xml = "data/ops2023syst_claml_20221021.xml"
input_ops_abbv = "data/OPS_ABBV.tsv"
output_ops_parsed = "outputs/ops_parsed.csv"

# --- OPS: minimal subset and the artefacts produced from it ------------------
ops_min = "outputs/ops_parsed.min.csv"
ops_translated = "outputs/ops_translated.min.csv"
ops_translated_clean = "outputs/ops_translated.min.clean.csv"
output_embedding_ops = "outputs/embeddings_ops.min.parquet"
output_corpus_ops = "outputs/corpus_ops.min.txt"

---
# Process CHOP
## Translate 

In [3]:
# Run the CHOP translation step: `input_chop` is the source file and
# `output_chop` is the file inspected in the following cell.
translate_chop.main(input_chop, output_chop)

Output: 


In [4]:
# Load the CSV at `output_chop` (the second argument given to
# translate_chop.main above) and display it as the cell result.
pd.read_csv(output_chop)

Unnamed: 0,nbchar,zcode,item type,text,codable,emitter,status,modification date,indent level,lateralite,clean_text,translation_raw
0,1,C0,T,Massnahmen und Interventionen nicht anderswo K...,non,,0,,1,,Massnahmen und Interventionen nicht anderswo K...,Measures and Interventions Not Otherwise Class...
1,2,Z00,T,Massnahmen und Interventionen nicht anderswo k...,non,,0,,2,,Massnahmen und Interventionen nicht anderswo k...,Measures and Interventions Not Otherwise Class...
2,3,Z00.0,T,Therapeutischer Ultraschall,non,,0,,3,,Therapeutischer Ultraschall,Therapeutic Ultrasound|The application of ultr...
3,3,Z00.0,X,Diagnostischer Ultraschall (nicht-invasiv) (88...,non,,0,,3,,Diagnostischer Ultraschall,Diagnostic Ultrasound|A medical imaging techni...
4,3,Z00.0,X,Intrakardiale Echokardiographie [IKE] (37.28),non,,0,,3,,Intrakardiale Echokardiographie [IKE],Transthoracic echocardiography (TTE)| A non-in...
...,...,...,...,...,...,...,...,...,...,...,...,...
94,4,Z00.39,B,"Computergesteuerte Chirurgie, n.n.bez.",non,,0,,4,,"Computergesteuerte Chirurgie, n.n.bez.","Computer-Assisted Surgery, unspecified|A surgi..."
95,3,Z00.4,T,Adjunktiv vaskuläre Massnahmen,non,,0,,3,,Adjunktiv vaskuläre Massnahmen,Adjunctive Vascular Measures|Additional proced...
96,3,Z00.4,N,Die Kodes unter 00.4- sind Zusatzkodes. Sie bi...,non,,0,,3,,Die Kodes unter 00.4- sind Zusatzkodes. Sie bi...,Codes under 00.4- are additional codes. They r...
97,3,Z00.4,S,Endarteriektomie (38.10 - 38.18.-),non,,0,,3,,Endarteriektomie,Endarterectomy|A surgical procedure to remove ...


## Clean translation

In [5]:
# Clean the translated CHOP file; `output_chop_clean` is what the embedding
# step consumes next.
clean_translations.main(output_chop, output_chop_clean)

## Create embedding

In [4]:
# Embed the cleaned CHOP translations. The step reports writing the parquet
# embeddings (`output_embedding`) and the text corpus (`output_corpus`).
# NOTE(review): `persist_dir` is presumably a vector-store directory - confirm
# in scripts/embed_chop.
embed_chop.main(
    output_chop_clean,
    output_embedding,
    output_corpus,
    persist_dir,
)

Embedding s/term: 0.04
# Wrote outputs/embeddings_chop.min.parquet
# Wrote outputs/corpus_chop.min.txt


---
# Process OPS

In [5]:
# Parse the OPS XML together with the abbreviation table; the step reports
# writing the parsed table to `output_ops_parsed`.
parse_ops.main(input_ops_xml, input_ops_abbv, output_ops_parsed)

# Wrote outputs/ops_parsed.csv


## Create minimal file

In [7]:
%%bash -s "$output_ops_parsed" "$ops_min"
# Keep only the first 10 lines of the parsed OPS file as the minimal file.
# The paths come from the notebook's config cell (IPython expands
# $output_ops_parsed and $ops_min on the magic line; bash receives them as
# $1 and $2), so this cell cannot drift from the configured file names.
head -n 10 "$1" > "$2"

## Translate

In [4]:
# Run the OPS translation step on the minimal file (`ops_min`), with
# `ops_translated` as the second path argument.
translate_ops.main(ops_min, ops_translated)

Output: 


## Clean translation

In [7]:
# Same cleaning step as used for CHOP, applied to the OPS translations;
# `ops_translated_clean` feeds the OPS embedding step.
clean_translations.main(ops_translated, ops_translated_clean)

## Create embedding

In [3]:
# Embed the cleaned OPS translations. The step reports writing the parquet
# embeddings (`output_embedding_ops`) and the text corpus (`output_corpus_ops`).
# It is given the same `persist_dir` as the CHOP embedding step.
embed_ops.main(
    ops_translated_clean,
    output_embedding_ops,
    output_corpus_ops,
    persist_dir,
)

Embedding s/term: 0.21
# Wrote outputs/embeddings_ops.min.parquet
# Wrote outputs/corpus_ops.min.txt


---
# Map codebooks

In [4]:
# Map OPS -> CHOP: OPS embeddings are the first input, CHOP embeddings the second.
# With top_k=3 the displayed result holds three rows per `A_df` value.
# NOTE(review): `A_df` / `B_df` look like row positions in the first / second
# embedding file - confirm in scripts/map_chop_ops.
output_mapping = "outputs/ops_to_chop.csv"
map_chop_ops.main(
    output_embedding_ops,
    output_embedding,
    output_mapping,
    top_k=3,
)
df = pd.read_csv(output_mapping)
df

Unnamed: 0,A_df,B_df,similarity
0,0,87,0.864476
1,0,57,0.742353
2,0,59,0.722445
3,1,87,0.772411
4,1,3,0.697928
5,1,19,0.673854
6,2,87,0.767221
7,2,3,0.686891
8,2,19,0.666431
9,3,87,0.658893


In [3]:
# Map CHOP -> OPS: the reverse direction, with CHOP embeddings as the first
# input and OPS embeddings as the second.
# Note that `output_mapping` and `df` are rebound here and no longer refer to
# the OPS -> CHOP result.
output_mapping = "outputs/chop_to_ops.csv"
map_chop_ops.main(
    output_embedding,
    output_embedding_ops,
    output_mapping,
    top_k=3,
)
df = pd.read_csv(output_mapping)
df

Unnamed: 0,A_df,B_df,similarity
0,0,0,0.721629
1,0,4,0.617236
2,0,1,0.592342
3,1,0,0.646939
4,1,4,0.569626
...,...,...,...
292,97,0,0.532027
293,97,6,0.525459
294,98,0,0.549949
295,98,4,0.520464
