## Imports

In [29]:
import pandas as pd
from pandas import DataFrame

import numpy as np

import ast
from typing import Literal

def get_HLA_match(donor_tissue_type, recipient_tissue_type):
    """Compare donor and recipient HLA typings locus by locus.

    Both arguments are sequences of loci; each locus is a sequence of typing
    strings of the form ``"<first>:<second>"`` (e.g. ``"A*01:01"``). Loci are
    paired in order, both sides of a pair are sorted, and the sorted values
    are then compared position by position.

    Returns:
        A tuple ``(matches, missing_allell, missing_antigen)`` where
        ``matches`` counts identical strings, ``missing_allell`` counts
        non-identical pairs whose part before the ``:`` differs, and
        ``missing_antigen`` counts non-identical pairs whose part after the
        ``:`` differs. A single pair can increment both of the last two.
    """
    exact_count = 0
    first_field_diffs = 0
    second_field_diffs = 0

    for donor_locus, recipient_locus in zip(donor_tissue_type, recipient_tissue_type):
        ordered_pairs = zip(sorted(donor_locus), sorted(recipient_locus))

        for donor_code, recipient_code in ordered_pairs:
            if donor_code == recipient_code:
                exact_count += 1
                continue

            donor_fields = donor_code.split(":")
            recipient_fields = recipient_code.split(":")

            if donor_fields[0] != recipient_fields[0]:
                first_field_diffs += 1
            if donor_fields[1] != recipient_fields[1]:
                second_field_diffs += 1

    return exact_count, first_field_diffs, second_field_diffs


def get_CMV_serostatus(donor_CMV, recipient_CMV):
    """Combine donor and recipient CMV flags into a single code in 0..3.

    Both flags are used as integer indices (0 or 1) into a 2x2 table whose
    rows are selected by the recipient and whose columns are selected by the
    donor: 0 = both 0, 1 = donor only, 2 = recipient only, 3 = both 1.
    """
    serostatus_table = np.array([
        [0, 1],  # recipient_CMV == 0
        [2, 3],  # recipient_CMV == 1
    ])

    return serostatus_table[recipient_CMV, donor_CMV]


def get_gender_match(donor_gender, recipient_gender):
    """Return 0 when the donor is coded 0 and the recipient is coded 1, else 1.

    With the file's gender encoding (female = 0, male = 1) the only
    combination reported as a mismatch is a female donor for a male recipient.
    """
    is_female_to_male = donor_gender == 0 and recipient_gender == 1
    return 0 if is_female_to_male else 1


def get_ABO_match(donor_ABO, recipient_ABO):
    """Look up ABO compatibility (1 = compatible, 0 = not) for a donor/recipient pair.

    Both arguments are integer blood-type codes used as indices into a 4x4
    table: the recipient selects the row and the donor selects the column.
    With the file's encoding (O = 0, A = 1, B = 2, AB = 3) the rows read as
    annotated below.
    """
    compatibility_table = np.array([
        [1, 0, 0, 0],  # recipient O
        [1, 1, 0, 0],  # recipient A
        [1, 0, 1, 0],  # recipient B
        [1, 1, 1, 1],  # recipient AB
    ])

    return compatibility_table[recipient_ABO, donor_ABO]

## Read Latin-1 Encoded Datasets

In [30]:
def read_df(df_path, sep=";", encoding="latin1"):
    """Load a delimited text file into a DataFrame.

    Args:
        df_path: Path or file-like object accepted by ``pandas.read_csv``.
        sep: Field separator. Defaults to ``";"``, the previous hard-coded value.
        encoding: Text encoding of the file. Defaults to ``"latin1"``, the
            previous hard-coded value; pass e.g. ``"utf-8"`` for files that
            are not Latin-1 encoded, otherwise accented characters are
            decoded incorrectly.

    Returns:
        The parsed DataFrame.
    """
    return pd.read_csv(df_path, sep=sep, encoding=encoding)

## General Required Functions

In [31]:
def get_row_by_id(id: str, data: DataFrame, id_col_name: str):
    """Return the rows of ``data`` whose ``id_col_name`` value equals ``id``.

    The result is a DataFrame that keeps the original index labels; it is
    empty when no row matches and has several rows when the id is repeated.
    """
    matches_id = data[id_col_name] == id
    return data.loc[matches_id]

In [32]:
def join_row_to_data(row: DataFrame, data: DataFrame):
    """Broadcast the first record of ``row`` onto every record of ``data``.

    A copy of ``data`` is returned with one extra (or overwritten) column per
    column of ``row``; each of these columns holds the value from the first
    record of ``row``. ``data`` itself is not modified.

    Args:
        row: DataFrame whose first record supplies the values.
        data: DataFrame to extend.

    Returns:
        A new DataFrame with the columns of ``data`` plus those of ``row``.

    Raises:
        IndexError: If ``row`` has columns but no records.
    """
    if len(row.columns) > 0 and len(row.index) == 0:
        raise IndexError("join_row_to_data: 'row' has no records to join")

    data_joined = data.copy()

    for position, col in enumerate(row.columns):
        # Read each cell positionally instead of going through row.iloc[0]:
        # materialising the whole record as a Series forces one common dtype,
        # which silently turns ints into floats when the record is all-numeric.
        data_joined[col] = row.iloc[0, position]

    return data_joined

## Encoding

In [None]:
# Columns grouped by the value vocabulary they share. Each key names an entry
# of VALUE_MAPPERS; each value lists the columns encoded with that mapping.
ATTRIB_GROUPS = {
    "gender": ["donor_gender", "recipient_gender"],
    "blood_type": ["donor_ABO", "recipient_ABO"],
    "presence": ["donor_CMV", "recipient_CMV"],
    "match": ["ABO_match", "gender_match", "HLA_mismatch"],
    "donor_age_group": ["donor_age_group"],
    "yes_no": ["donor_age_below_35", "recipient_age_below_10", "tx_post_relapse"],
    "disease": ["disease"],
    "malignant": ["disease_group"],
    "level": ["risk_group"],
    # "stem_cell_source": ["stem_cell_source"],
}

# Label -> integer code for every vocabulary. Each mapping must be one-to-one:
# decoding inverts the dict, so two labels sharing a code would collapse into
# a single label on the way back.
VALUE_MAPPERS = {
    "gender": {"female": 0, "male": 1},
    "blood_type": {"O": 0, "A": 1, "B": 2, "AB": 3},
    "presence": {"absent": 0, "present": 1},
    "match": {"mismatched": 0, "matched": 1},
    "donor_age_group": {"18-35": 2, "35-50": 1, "50-60": 0},
    "yes_no": {"no": 0, "yes": 1},
    "disease": {"chronic": 1, "AML": 3, "ALL": 4, "nonmalignant": 0, "lymphoma": 2},
    "malignant": {"nonmalignant": 0, "malignant": 1},
    # "low" previously shared code 1 with "high", which made the two levels
    # indistinguishable once encoded and decoded every value as "high".
    "level": {"low": 0, "high": 1},
    # "stem_cell_source": {"pheripheral blood": 0, "bone marrow": 1},
}

def encode_data(data: DataFrame, mode: Literal["encode", "decode"] = "encode"):
    """Translate categorical columns between labels and integer codes.

    For every group in ``ATTRIB_GROUPS`` the matching ``VALUE_MAPPERS`` entry
    is applied, via ``Series.map``, to each listed column that is present in
    ``data``. With ``mode="decode"`` the mapping is inverted (code -> label);
    any other value of ``mode`` applies the mapping as-is (label -> code).

    Values that are not keys of the applied mapping become NaN, as with any
    ``Series.map`` call. The input frame is not modified.

    Returns:
        A copy of ``data`` with the known columns translated.
    """
    translated = data.copy()
    decoding = mode == "decode"

    for group_name, columns in ATTRIB_GROUPS.items():
        lookup = VALUE_MAPPERS[group_name]
        if decoding:
            lookup = {code: label for label, code in lookup.items()}

        for column in columns:
            if column in translated.columns:
                translated[column] = translated[column].map(lookup)

    return translated

## Aggregate Data

In [34]:
def add_match_features(data_encoded: DataFrame):
    """Add donor/recipient compatibility columns to an encoded frame.

    Adds, on a copy of ``data_encoded``:

    * ``HLA_match``, ``allel``, ``antigen`` - the three counts returned by
      ``get_HLA_match`` for the parsed ``donor_tissue_type`` and
      ``recipient_tissue_type`` cells (string literals parsed with
      ``ast.literal_eval``);
    * ``CMV_serostatus`` - ``get_CMV_serostatus(donor_CMV, recipient_CMV)``;
    * ``gender_match`` - ``get_gender_match(donor_gender, recipient_gender)``;
    * ``ABO_match`` - ``get_ABO_match(donor_ABO, recipient_ABO)``.

    The CMV, gender and ABO columns must already hold integer codes.

    Results are assigned positionally, so the frame may carry any index
    (filtered, shuffled, non-unique) and may be empty.

    Returns:
        A new DataFrame with the extra columns.
    """
    data_added = data_encoded.copy()

    hla_results = [
        get_HLA_match(ast.literal_eval(donor), ast.literal_eval(recipient))
        for donor, recipient in zip(
            data_added["donor_tissue_type"], data_added["recipient_tissue_type"]
        )
    ]
    # Assign plain lists rather than a freshly built DataFrame: a new frame
    # gets a 0..n-1 index and would be aligned against data_added's index,
    # yielding NaN / misplaced values whenever that index is not 0..n-1.
    for position, column in enumerate(["HLA_match", "allel", "antigen"]):
        data_added[column] = [result[position] for result in hla_results]

    data_added["CMV_serostatus"] = [
        get_CMV_serostatus(donor, recipient)
        for donor, recipient in zip(data_added["donor_CMV"], data_added["recipient_CMV"])
    ]

    data_added["gender_match"] = [
        get_gender_match(donor, recipient)
        for donor, recipient in zip(data_added["donor_gender"], data_added["recipient_gender"])
    ]

    data_added["ABO_match"] = [
        get_ABO_match(donor, recipient)
        for donor, recipient in zip(data_added["donor_ABO"], data_added["recipient_ABO"])
    ]

    return data_added

In [None]:
def add_abstracted_features(data_encoded: DataFrame):
    """Add coarse 0/1 indicator columns derived from existing columns.

    Adds, on a copy of ``data_encoded``:

    * ``disease_group`` - 0 when ``disease`` is nonmalignant, else 1;
    * ``donor_age_below_35`` - 1 when ``donor_age`` < 35;
    * ``recipient_age_below_10`` - 1 when ``recipient_age`` < 10;
    * ``HLA_mismatch`` - 1 when ``HLA_match`` > 8.

    ``disease`` may hold either the raw label (``"nonmalignant"``) or its
    integer code (0 in the file's ``VALUE_MAPPERS["disease"]``). The encoded
    form is what the aggregation pipeline actually passes in; comparing only
    against the label marked every encoded row as malignant.

    Returns:
        A new DataFrame with the extra columns.
    """
    # Integer code of "nonmalignant" in VALUE_MAPPERS["disease"].
    encoded_nonmalignant = 0

    data_added = data_encoded.copy()

    disease = data_added["disease"]
    is_nonmalignant = (disease == "nonmalignant") | (disease == encoded_nonmalignant)
    data_added["disease_group"] = (~is_nonmalignant).astype("int")

    data_added["donor_age_below_35"] = (data_added["donor_age"] < 35).astype("int")

    data_added["recipient_age_below_10"] = (data_added["recipient_age"] < 10).astype("int")

    data_added["HLA_mismatch"] = (data_added["HLA_match"] > 8).astype("int")

    return data_added

In [36]:
def aggregate_data(recipient_id: str, recipient_waiting_list: DataFrame, donor_list: DataFrame):
    """Build the donor-by-recipient feature table for one recipient.

    Steps, in order:

    1. select the recipient's record from ``recipient_waiting_list`` (the
       first one if the id is repeated) and broadcast it onto every donor;
    2. bucket ``donor_age`` into ``donor_age_group``;
    3. encode categorical columns to integer codes;
    4. add the match features and the abstracted indicator features;
    5. decode the categorical columns back to labels.

    Args:
        recipient_id: Value to look up in the ``recipient_id`` column.
        recipient_waiting_list: Recipients table.
        donor_list: Donors table; one output row is produced per donor.

    Returns:
        A new DataFrame with donor columns, recipient columns and derived
        features.

    Raises:
        IndexError: If ``recipient_id`` is not present in the waiting list.
    """
    recipient_row = get_row_by_id(recipient_id, recipient_waiting_list, "recipient_id")
    if recipient_row.empty:
        raise IndexError(f"recipient_id {recipient_id!r} not found in recipient_waiting_list")

    data_aggregated = join_row_to_data(recipient_row, donor_list)

    # include_lowest=True closes the first interval on the left so that an
    # 18-year-old donor lands in "18-35" instead of getting NaN. Ages outside
    # 18..60 still produce NaN.
    data_aggregated["donor_age_group"] = pd.cut(
        data_aggregated["donor_age"],
        bins=[18, 35, 50, 60],
        labels=["18-35", "35-50", "50-60"],
        include_lowest=True,
    )

    data_aggregated = encode_data(data_aggregated)

    data_aggregated = add_match_features(data_aggregated)

    data_aggregated = add_abstracted_features(data_aggregated)

    data_aggregated = encode_data(data_aggregated, mode="decode")

    return data_aggregated

## Testing

In [None]:
# donors_CSV_PATH = "../../../datasets/raw/donor_list_raw.csv"
# recipient_CSV_PATH = "../../../datasets/raw/recipient_waiting_list_raw.csv"

# df_recipients = read_df(recipient_CSV_PATH)
# df_donors = read_df(donors_CSV_PATH)

# data_aggregated = aggregate_data("IR002", df_recipients, df_donors)
# data_aggregated

   donor_id  donor_age  donor_ABO  donor_CMV  donor_gender  \
0     ID001         32          1          0             1   
1     ID002         45          2          1             0   
2     ID003         28          0          0             1   
3     ID004         51          3          1             0   
4     ID005         39          1          0             0   
5     ID006         22          0          1             1   
6     ID007         48          2          0             1   
7     ID008         35          1          1             0   
8     ID009         41          0          0             1   
9     ID010         30          3          1             0   
10    ID011         56          1          0             1   
11    ID012         27          2          1             0   
12    ID013         44          0          0             0   
13    ID014         38          1          1             1   
14    ID015         52          3          0             0   

       

Unnamed: 0,donor_id,donor_age,donor_ABO,donor_CMV,donor_gender,donor_tissue_type,donor_name,recipient_id,recipient_age,recipient_ABO,...,HLA_match,allel,antigen,CMV_serostatus,gender_match,ABO_match,disease_group,donor_age_below_35,recipient_age_below_10,HLA_mismatch
0,ID001,32,A,absent,male,"[['A*01:01','A*02:01'],['B*08:01','B*35:01'],[...",Afonso Miguel Torres Lima,IR002,12.5,O,...,6,4,3,0,matched,mismatched,malignant,yes,no,mismatched
1,ID002,45,B,present,female,"[['A*24:02','A*03:01'],['B*07:02','B*44:02'],[...",Ana Margarida Lousada Pinto,IR002,12.5,O,...,7,1,2,1,mismatched,mismatched,malignant,no,no,mismatched
2,ID003,28,O,absent,male,"[['A*02:01','A*11:01'],['B*15:01','B*51:01'],[...",Gonçalo Tiago Miguéis Póvoas,IR002,12.5,O,...,1,9,5,0,matched,matched,malignant,yes,no,mismatched
3,ID004,51,AB,present,female,"[['A*29:02','A*30:01'],['B*44:03','B*18:01'],[...",Maria Joana Calheiros Rocha,IR002,12.5,O,...,0,7,9,1,mismatched,mismatched,malignant,no,no,mismatched
4,ID005,39,A,absent,female,"[['A*01:01','A*26:01'],['B*57:01','B*38:01'],[...",Beatriz Inês Valença Ribeiro,IR002,12.5,O,...,1,8,7,0,mismatched,mismatched,malignant,no,no,mismatched
5,ID006,22,O,present,male,"[['A*24:02','A*02:01'],['B*07:02','B*44:02'],[...",Daniel Rui Espinheira Barroso,IR002,12.5,O,...,10,0,0,1,matched,matched,malignant,yes,no,matched
6,ID007,48,B,absent,male,"[['A*03:01','A*68:01'],['B*35:01','B*53:01'],[...",Miguel Afonso Norberto Veiga,IR002,12.5,O,...,1,8,6,0,matched,mismatched,malignant,no,no,mismatched
7,ID008,35,A,present,female,"[['A*11:01','A*23:01'],['B*40:01','B*49:01'],[...",Rita Sofia Castanheira Lopes,IR002,12.5,O,...,0,7,8,1,mismatched,mismatched,malignant,no,no,mismatched
8,ID009,41,O,absent,male,"[['A*02:01','A*32:01'],['B*13:02','B*27:05'],[...",Luís Filipe Antão Barata,IR002,12.5,O,...,1,8,5,0,matched,matched,malignant,no,no,mismatched
9,ID010,30,AB,present,female,"[['A*01:01','A*29:02'],['B*08:01','B*44:02'],[...",Liliana Inês Gouveia Monteiro,IR002,12.5,O,...,2,8,5,1,mismatched,mismatched,malignant,yes,no,mismatched


In [38]:
# Keep only the model inputs: drop identifiers, names, raw tissue typings and
# donor_gender from the aggregated table.
data_survival_prediction = data_aggregated.drop(
    columns=[
        "donor_id",
        "donor_gender",
        "donor_tissue_type",
        "donor_name",
        "recipient_id",
        "recipient_tissue_type",
        "recipient_name",
    ]
).copy()
data_survival_prediction

Unnamed: 0,donor_age,donor_ABO,donor_CMV,recipient_age,recipient_ABO,recipient_CMV,recipient_gender,disease,tx_post_relapse,risk_group,...,HLA_match,allel,antigen,CMV_serostatus,gender_match,ABO_match,disease_group,donor_age_below_35,recipient_age_below_10,HLA_mismatch
0,32,A,absent,12.5,O,absent,male,AML,yes,high,...,6,4,3,0,matched,mismatched,malignant,yes,no,mismatched
1,45,B,present,12.5,O,absent,male,AML,yes,high,...,7,1,2,1,mismatched,mismatched,malignant,no,no,mismatched
2,28,O,absent,12.5,O,absent,male,AML,yes,high,...,1,9,5,0,matched,matched,malignant,yes,no,mismatched
3,51,AB,present,12.5,O,absent,male,AML,yes,high,...,0,7,9,1,mismatched,mismatched,malignant,no,no,mismatched
4,39,A,absent,12.5,O,absent,male,AML,yes,high,...,1,8,7,0,mismatched,mismatched,malignant,no,no,mismatched
5,22,O,present,12.5,O,absent,male,AML,yes,high,...,10,0,0,1,matched,matched,malignant,yes,no,matched
6,48,B,absent,12.5,O,absent,male,AML,yes,high,...,1,8,6,0,matched,mismatched,malignant,no,no,mismatched
7,35,A,present,12.5,O,absent,male,AML,yes,high,...,0,7,8,1,mismatched,mismatched,malignant,no,no,mismatched
8,41,O,absent,12.5,O,absent,male,AML,yes,high,...,1,8,5,0,matched,matched,malignant,no,no,mismatched
9,30,AB,present,12.5,O,absent,male,AML,yes,high,...,2,8,5,1,mismatched,mismatched,malignant,yes,no,mismatched


In [39]:
# Select the five criteria columns and display them with categorical labels
# translated to integer codes (data_criteria itself keeps the labels).
data_criteria = data_aggregated.loc[
    :, ["HLA_match", "donor_age_group", "CMV_serostatus", "gender_match", "ABO_match"]
].copy()
encode_data(data_criteria)

Unnamed: 0,HLA_match,donor_age_group,CMV_serostatus,gender_match,ABO_match
0,6,2,0,1,0
1,7,1,1,0,0
2,1,2,0,1,1
3,0,0,1,0,0
4,1,1,0,0,0
5,10,2,1,1,1
6,1,1,0,1,0
7,0,2,1,0,0
8,1,1,0,1,1
9,2,2,1,0,0
