In [None]:
import pandas as pd
import omop_mapping_utils as omp
from omop_mapping_utils import compare_dataframes
from itables import init_notebook_mode, show

init_notebook_mode(all_interactive=True)

# Load the two mapping files that are compared throughout this notebook.
filepath = "testdata"
df1 = pd.read_csv(f"{filepath}/ICD10fi.usagi.csv")
df2 = pd.read_csv(f"{filepath}/ICD10fi.fixedEF.usagi.csv")

# Browse the first file as a scrollable itables table without paging.
# (A bare `df1.head(2)` used to sit here; only the last expression of a cell is
# rendered, so it never produced any output and has been dropped.)
show(df1, scrollY="200px", scrollCollapse=True, paging=False)

## Unit test data

In [None]:
# Unit-test fixtures: two small mapping tables with the columns
# sourceCode / sourceName / conceptId / conceptName.
# The sourceName labels ('Will be changed', 'Only in 1', 'Double ...') name the
# case each row is meant to exercise when testdata1 is compared with testdata2.
# The frames are built directly so the names never hold a plain dict first.
testdata1 = pd.DataFrame({
    'sourceCode': [10, 20, 30, 40, 50, 50, 60, 80, 80],
    'sourceName': ['Same', 'Will be changed', 'Same 2', 'Not in 2',
                   'Double 1 Same', 'Double 2 will change',
                   'Only in 1',
                   'Double 3 Same', 'Double 3 Same'],
    'conceptId': [10010, 10020, 10030, 10040, 10050, 10051, 10060, 10080, 10081],
    'conceptName': ['Same', 'Will be changed', 'Same 2', 'Not in 2',
                    'Double 1 Same', 'Double 2 will change',
                    'Only in 1',
                    'Double 3 Same', 'Double 3 Same'],
})

testdata2 = pd.DataFrame({
    'sourceCode': [10, 20, 30, 40, 50, 50, 70, 80, 80],
    'sourceName': ['Same', 'Will be changed', 'Same 2', 'Not in 2',
                   'Double 1 Same', 'Double 2 will change',
                   'Only in 2',
                   'Double 3 Same', 'Double 3 Same'],
    'conceptId': [10010, 10021, 10030, 10040, 10050, 10052, 10070, 10080, 10081],
    # 'Double 1 Same' was misspelled 'Double 1 Ssame' here, which made the one row
    # that is supposed to be identical in both tables differ by conceptName.
    'conceptName': ['Same', 'Has changed', 'Same 2', 'Not in 2',
                    'Double 1 Same', 'Double 2 has changed',
                    'Only in 2',
                    'Double 3 Same', 'Double 3 Same'],
})

In [None]:
# Row-level comparison: simple merge using a MultiIndex on sourceCode and conceptId.
# set_index() returns a new frame, so testdata1/testdata2 stay untouched and the
# cell can be re-run without the index already having been moved.
df1 = testdata1.set_index(['sourceCode', 'conceptId'])
df2 = testdata2.set_index(['sourceCode', 'conceptId'])

# how='outer' keeps index pairs that occur in only one of the two frames.
df_merge = pd.merge(df1, df2, how='outer', left_index=True, right_index=True)
display(df_merge)

## Test functions: Set-based comparison

In [None]:
# compare_dfs: compare as conceptId sets (useful for one-to-many mappings)

# Reload the module to reflect any changes.
# importlib.reload() needs both `importlib` and the module bound under the name
# passed to it; import them here so the cell also works on a fresh kernel.
import importlib
import omop_mapping_utils
importlib.reload(omop_mapping_utils)
# Now you can use the updated functions
import omop_mapping_utils as omp
from omop_mapping_utils import compare_dataframes


# unit test
# NOTE(review): collapse_rows was called unqualified although nothing in this
# notebook imports it; it is assumed to be defined in omop_mapping_utils — confirm.
df1 = omp.collapse_rows(testdata1.copy())
df2 = omp.collapse_rows(testdata2.copy())

# Testing internal function:
result = omp.compare_collapsed_dfs(df1, df2, 'sourceCode', 'conceptId')
display(result)

# Testing function
result = omp.compare_dfs(testdata1, testdata2)
display(result)

result = omp.compare_dfs(testdata1, testdata2, how='conceptName')
show(result)


In [None]:
import pandas as pd
from itables import init_notebook_mode, show

# Initialize itables for notebook mode (same call as in the first cell of this notebook).
# NOTE(review): all_interactive=True presumably makes every displayed DataFrame
# an interactive table — confirm against the itables documentation.
init_notebook_mode(all_interactive=True)

# Sample frame for trying out the display options: one long string value,
# a column holding lists, and a plain integer column.
data = dict(
    sourceCode=['A0sdfasdfsagsfffffffffff1', 'A02', 'A03'],
    conceptId=[[1001, 1002], [1003], [1004, 1005]],
    value=[10, 20, 30],
)

df = pd.DataFrame(data)

# Helper that shortens long cell values for display
def limit_characters(value, max_chars=40):
    """Shorten a value whose string form is longer than ``max_chars``.

    Args:
        value: Any object; its ``str()`` form is what gets measured.
        max_chars: Maximum number of characters kept before the ellipsis.

    Returns:
        ``value`` itself (unchanged, original type) when ``str(value)`` has at
        most ``max_chars`` characters; otherwise the first ``max_chars``
        characters of ``str(value)`` followed by ``'...'``.
    """
    text = str(value)
    if len(text) <= max_chars:
        return value
    return text[:max_chars] + '...'

# Run limit_characters over every cell of the frame
df = df.map(limit_characters)

# Column definition applied to all columns: left-align via the "dt-left" class
left_aligned = [
    {"targets": "_all", "className": "dt-left", }
]

# Render the shortened frame with that column definition
show(df, columnDefs=left_aligned)



## Compare usagi files: set-based 

In [None]:
# Reload the module to reflect any changes.
# importlib.reload() needs both `importlib` and the module bound under the name
# passed to it; import them here so the cell also works on a fresh kernel.
import importlib
import omop_mapping_utils
importlib.reload(omop_mapping_utils)
# Now you can use the updated functions
import omop_mapping_utils as omp
from omop_mapping_utils import compare_dataframes

filepath = "testdata"
df1 = pd.read_csv(f"{filepath}/ICD10fi.usagi.csv")
df2 = pd.read_csv(f"{filepath}/ICD10fi.fixedEF.usagi.csv")

# Simple:
result = omp.compare_dfs(df1, df2)
display(result)

# Full info:
result = omp.compare_dfs(df1, df2, how='conceptName')
show(result.map(limit_characters), scrollY="400px", scrollCollapse=True, paging=False, column_filters="footer")

# Only the rows whose Comparison column equals 'Different'.
# .copy() gives an independent frame, so the column assignment below does not
# write into a selection of `result` (which raises SettingWithCopyWarning).
showresult = result.query("Comparison=='Different'").copy()
limitcols = ['sourceName', 'conceptName']
showresult.loc[:, limitcols] = showresult.loc[:, limitcols].map(limit_characters)
show(showresult, scrollY="400px", scrollCollapse=True, paging=False, column_filters="footer")


## Compare usagi files: direct

In [None]:
filepath = "testdata"
df1 = pd.read_csv(f"{filepath}/ICD10fi.usagi.csv")
df2 = pd.read_csv(f"{filepath}/ICD10fi.fixedEF.usagi.csv")

# merge_columns is otherwise only assigned in the development section further
# down, so running the notebook top to bottom raised a NameError here.
# The value is the one used there.
merge_columns = ["sourceCode"]
compared_df = omp.compare_dataframes(df1, df2, merge_columns)

display(compared_df)

## Compare usagi files: manual

In [None]:
# Manual comparison: index both files by (sourceCode, conceptId) and outer-merge
# them on that index.
filepath = "testdata"
index_columns = ['sourceCode', 'conceptId']

df1 = pd.read_csv(f"{filepath}/ICD10fi.usagi.csv").set_index(index_columns)
df2 = pd.read_csv(f"{filepath}/ICD10fi.fixedEF.usagi.csv").set_index(index_columns)

df_merge = df1.merge(df2, how='outer', left_index=True, right_index=True)
display(df_merge)

# Development section

In [None]:
import importlib
import omop_mapping_utils
import omop_mapping_utils as omp

# Reload the module to reflect any changes.
# The module has to be bound under the name passed to reload(); importing it only
# as `omp` left `omop_mapping_utils` undefined and raised a NameError here.
importlib.reload(omop_mapping_utils)

# Now you can use the updated function (reload() updates the module object in
# place, so `omp` sees the new code as well)
from omop_mapping_utils import compare_dataframes

merge_columns = ["sourceCode"]
df1 = testdata1
df2 = testdata2
compared_df = omp.compare_dataframes(df1, df2, merge_columns)

In [None]:
compared_df

# Looking at A02.2+H22.0, we can see that this is a double mapping, and in the merge, all combinations are used.
# NOTE(review): the sourceCodes in testdata1/testdata2 are the integers 10-80, so this
# remark presumably stems from a run on the ICD10fi usagi files — confirm.
# That is not what we want => compare
# - single mappings on sourceCode and multiple mappings on sourceCode,
# - multiple mappings on sourceCode and conceptId

# TODO:
#  - test whether this works as expected:
#    1.1 mappings that are missing in either df are found and highlighted
#    1.2 same for multiple mappings

In [None]:
# unit test

# First step: split df1 into two frames with single and multiple mappings, respectively.
# The module is imported before it is reloaded; the reload used to come first and
# failed with a NameError whenever `omop_mapping_utils` was not bound yet.
import importlib
import omop_mapping_utils
import omop_mapping_utils as omp
importlib.reload(omop_mapping_utils)
from omop_mapping_utils import split_single_and_multiple_occurrences

df1_single, df1_multiple = split_single_and_multiple_occurrences(testdata1, column_name="sourceCode")

display(df1_single)
display(df1_multiple)

# Second step: compare each part
merge_columns = ["sourceCode"]
# For the single comparison, remove from df2 the sourceCodes that are doubled in df1
# (they will be covered in the multiple comparison).
testdata2_singlefiltered = omp.remove_common_rows(testdata2, df1_multiple, column_name='sourceCode')
# Conversely, remove the single sourceCodes for the multiple comparison:
testdata2_doublefiltered = omp.remove_common_rows(testdata2, df1_single, column_name='sourceCode')
#
compared_df_single = omp.compare_dataframes(df1_single, testdata2_singlefiltered, merge_columns)
compared_df_multiple = omp.compare_dataframes(df1_multiple, testdata2_doublefiltered, ["sourceCode", "conceptId"])

display(compared_df_single)
display(compared_df_multiple)

In [None]:
# First step: split df1 into two frames with single and multiple mappings, respectively.
# df1 is whatever an earlier cell left bound; when the notebook is run top to
# bottom that is testdata1 (assigned in the development cell above).
importlib.reload(omop_mapping_utils)
from omop_mapping_utils import split_single_and_multiple_occurrences

occurrence_split = split_single_and_multiple_occurrences(df1, column_name="sourceCode")
df1_single, df1_multiple = occurrence_split

for subset in (df1_single, df1_multiple):
    display(subset)

In [None]:
# Next step: compare single vs new on sourceCode, and multiple vs new on sourceCode and conceptId.
# Question: do we also need to treat single and multiple in the right df separately?
#           Or do we get what we need like this already?
#   Answer: no, because now we have a lot of right-only rows from the right df.

merge_columns = ["sourceCode"]
multiple_merge_columns = ["sourceCode", "conceptId"]

compared_df_single = compare_dataframes(df1_single, df2, merge_columns)
compared_df_multiple = compare_dataframes(df1_multiple, df2, multiple_merge_columns)

for comparison in (compared_df_single, compared_df_multiple):
    display(comparison)