The mediated schema attributes are defined based on our competency questions.

The dataset columns are those of the Framingham study dataset (spelled `farmingham` in the variable and file names below), after dropping some irrelevant columns that are not used in the mediated schema.

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import jaccard_score

# Matcher inputs: the mediated (target) schema attributes, followed by the
# column names of the source dataset. Both are plain lists of strings.
mediated_schema = [
    "Patient_identfication_number",
    "First_Last_Names",
    "Age",
    "Patient_gender",
    "diabetes",
    "heart_disease_status",
    "Systolic_BP",
    "Diaolistic_BP",
    "bmi",
    "cholesterol",
    "smoker",
    "medications",
]
dataset_columns = [
    "gender",
    "age",
    "currentSmoker",
    "BPMeds",
    "diabetes",
    "totChol",
    "sysBP",
    "diaBP",
    "BMI",
    "glucose",
    "TenYearCHD",
    "patient_id",
    "name",
]


Matchers:

The first matcher implemented is Edit Distance. This section contains the implementation of the algorithm and its execution against **mediated_schema** and **dataset_columns**.

The output is a matrix of the similarity ratios between each attribute of the mediated schema and each column of the dataset:
Output file: **farmingham_edit_distance_matrix**

In [4]:
# 1. Normalized Edit Distance Matcher
def editDistance(s1: str, s2: str, *, case_sensitive: bool = True) -> float:
    """Return a normalized Levenshtein similarity between two strings.

    The similarity is ``1 - distance / max(len(s1), len(s2))``, where
    ``distance`` is the minimum number of single-character insertions,
    deletions and substitutions turning ``s1`` into ``s2``.

    Args:
        s1: First string.
        s2: Second string.
        case_sensitive: When True (the default, matching the historical
            behavior) characters are compared exactly, so ``"bmi"`` and
            ``"BMI"`` score 0.0. Pass False to lowercase both strings before
            comparing.

    Returns:
        A value in [0, 1]; 1.0 means the strings are identical. For two empty
        strings the result is 0, which is kept for backward compatibility.
    """
    if not case_sensitive:
        s1, s2 = s1.lower(), s2.lower()

    m, n = len(s1), len(s2)
    max_len = max(m, n)
    if max_len == 0:
        return 0

    # Only the previous DP row is needed to fill the current one, so two rows
    # replace the full (m + 1) x (n + 1) table.
    # previous[j] is the distance between the empty prefix of s1 and s2[:j].
    previous = list(range(n + 1))
    for i in range(1, m + 1):
        # current[0] is the distance between s1[:i] and the empty string.
        current = [i] + [0] * n
        for j in range(1, n + 1):
            if s1[i - 1] == s2[j - 1]:
                current[j] = previous[j - 1]
            else:
                # insertion, deletion, substitution
                current[j] = 1 + min(current[j - 1], previous[j], previous[j - 1])
        previous = current

    return 1 - previous[n] / max_len

# Score every (mediated attribute, dataset column) pair, rounded to three
# decimals: one row per mediated attribute, one column per dataset column.
edit_scores = [
    [round(editDistance(schema_attr, column_name), 3) for column_name in dataset_columns]
    for schema_attr in mediated_schema
]
farmingham_edit_distance_matrix = pd.DataFrame(
    edit_scores,
    index=mediated_schema,
    columns=dataset_columns,
)

# Persist the matrix, then leave it as the cell's last expression so the
# notebook renders it.
farmingham_edit_distance_matrix.to_csv("farmingham_edit_distance_matrix.csv")
farmingham_edit_distance_matrix


Unnamed: 0,gender,age,currentSmoker,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,glucose,TenYearCHD,patient_id,name
Patient_identfication_number,0.179,0.071,0.214,0.071,0.143,0.107,0.0,0.107,0.0,0.107,0.143,0.321,0.143
First_Last_Names,0.062,0.125,0.188,0.062,0.25,0.125,0.125,0.125,0.0,0.125,0.062,0.188,0.188
Age,0.167,0.667,0.077,0.167,0.125,0.0,0.0,0.0,0.0,0.143,0.1,0.1,0.25
Patient_gender,0.429,0.214,0.357,0.143,0.214,0.143,0.0,0.071,0.0,0.071,0.143,0.571,0.143
diabetes,0.125,0.25,0.231,0.25,1.0,0.0,0.0,0.375,0.0,0.125,0.1,0.2,0.25
heart_disease_status,0.15,0.1,0.1,0.1,0.3,0.1,0.1,0.15,0.0,0.1,0.15,0.25,0.1
Systolic_BP,0.0,0.0,0.0,0.0,0.0,0.182,0.364,0.273,0.0,0.182,0.0,0.182,0.0
Diaolistic_BP,0.0,0.077,0.0,0.077,0.231,0.154,0.231,0.308,0.0,0.154,0.0,0.231,0.077
bmi,0.0,0.0,0.077,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.25
cholesterol,0.273,0.091,0.308,0.091,0.273,0.364,0.091,0.0,0.0,0.182,0.091,0.182,0.091


The second string matcher is the **Jaccard matcher**.

It is implemented using MultiLabelBinarizer; the output has the same layout as the Edit Distance output: **farmingham_jaccard_matrix**

In [5]:
# 2. Jaccard similarity over lowercase character sets, scored with
#    sklearn.metrics.jaccard_score

mlb = MultiLabelBinarizer()

# Each name becomes the set of its lowercase characters. Schema and dataset
# names are binarized together so both share one character vocabulary.
schema_sets = [set(name.lower()) for name in mediated_schema]
dataset_sets = [set(name.lower()) for name in dataset_columns]
all_sets = schema_sets + dataset_sets
binary_matrix = mlb.fit_transform(all_sets)

# The first len(mediated_schema) rows belong to the schema, the rest to the dataset.
schema_count = len(mediated_schema)
schema_bin = binary_matrix[:schema_count]
dataset_bin = binary_matrix[schema_count:]

# One Jaccard score per (schema attribute, dataset column) pair, rounded to 3 decimals.
farmingham_jaccard_matrix = pd.DataFrame(
    [
        [round(jaccard_score(s_bin, d_bin), 3) for d_bin in dataset_bin]
        for s_bin in schema_bin
    ],
    index=mediated_schema,
    columns=dataset_columns,
).astype(float)

farmingham_jaccard_matrix.to_csv("farmingham_jaccard_matrix.csv")
farmingham_jaccard_matrix




Unnamed: 0,gender,age,currentSmoker,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,glucose,TenYearCHD,patient_id,name
Patient_identfication_number,0.25,0.125,0.471,0.312,0.375,0.176,0.118,0.333,0.2,0.222,0.412,0.533,0.267
First_Last_Names,0.231,0.167,0.4,0.214,0.385,0.143,0.071,0.143,0.167,0.2,0.333,0.462,0.364
Age,0.333,1.0,0.083,0.125,0.25,0.0,0.0,0.143,0.0,0.25,0.2,0.222,0.4
Patient_gender,0.5,0.3,0.25,0.231,0.417,0.071,0.077,0.364,0.083,0.133,0.462,0.8,0.273
diabetes,0.2,0.25,0.214,0.444,1.0,0.091,0.222,0.5,0.25,0.167,0.333,0.5,0.222
heart_disease_status,0.25,0.182,0.333,0.231,0.545,0.154,0.077,0.25,0.083,0.214,0.462,0.5,0.167
Systolic_BP,0.0,0.0,0.25,0.231,0.308,0.364,0.4,0.25,0.182,0.308,0.188,0.286,0.0
Diaolistic_BP,0.067,0.077,0.235,0.308,0.5,0.333,0.25,0.455,0.167,0.286,0.25,0.462,0.071
bmi,0.0,0.0,0.083,0.286,0.25,0.0,0.167,0.333,1.0,0.0,0.0,0.1,0.167
cholesterol,0.182,0.1,0.5,0.167,0.25,0.625,0.091,0.0,0.0,0.5,0.417,0.143,0.091


The third matcher is the semantic matcher.

Here we use a pre-trained transformer model: **all-MiniLM-L12-v2**.

The **compute_similarity_matrix** function is the entry point for running the algorithm: it computes embeddings for the mediated attributes and for the source attributes, then compares them with cosine similarity.

**source_data** and **schema_expectations** provide sample values that help the model perform the schema matching.

The similarity output matrix is **farmingham_semantic_matrix**.

In [10]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Configuration parameters
# Name of the pre-trained model handed to SentenceTransformer below.
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
# Multiplier for the attribute/column *name* embedding (used on both the
# mediated and the source side).
NAME_WEIGHT = 0.5
# Multiplier for the mean embedding of a source column's sampled values.
VALUE_WEIGHT = 0.3
# Multiplier for the mean embedding of a mediated attribute's expected values.
EXPECTATION_WEIGHT = 0.2
# NOTE(review): each combined embedding uses only two of the three weights and
# is compared with cosine similarity, so only the ratio between the two
# weights in a sum affects the score.
# Upper bound on how many leading values of a source column are embedded.
MAX_SAMPLE_VALUES = 10

# Load the transformer model
model = SentenceTransformer(MODEL_NAME)

def embed_text(text: str) -> np.ndarray:
    """Encode one value (coerced to ``str``) with the loaded model and return its vector."""
    encoded_batch = model.encode([str(text)])
    return encoded_batch[0]

def embed_text_list(texts: list[str]) -> np.ndarray:
    """Return the element-wise mean of the embeddings of ``texts``.

    Every item is coerced to ``str`` before encoding. An empty input yields a
    zero vector whose length is the model's sentence-embedding dimension.
    """
    if texts:
        encoded = model.encode([str(item) for item in texts])
        return np.mean(encoded, axis=0)
    return np.zeros(model.get_sentence_embedding_dimension())

def compute_mediated_embedding(col_name: str, expectations: list[str]) -> np.ndarray:
    """Embed a mediated attribute from its name and, if given, its expected values.

    With no expectations the plain name embedding is returned. Otherwise the
    result is ``NAME_WEIGHT * name + EXPECTATION_WEIGHT * mean(expectations)``.
    """
    name_embedding = embed_text(col_name)
    if not expectations:
        return name_embedding
    expectation_embedding = embed_text_list(expectations)
    return NAME_WEIGHT * name_embedding + EXPECTATION_WEIGHT * expectation_embedding

def compute_source_embedding(col_name: str, values: list[str]) -> np.ndarray:
    """Embed a source column from its name and a sample of its values.

    Only the first ``MAX_SAMPLE_VALUES`` values are used. With no values the
    plain name embedding is returned. Otherwise the result is
    ``NAME_WEIGHT * name + VALUE_WEIGHT * mean(sampled values)``.
    """
    name_embedding = embed_text(col_name)
    leading_values = values[:MAX_SAMPLE_VALUES]
    if not leading_values:
        return name_embedding
    value_embedding = embed_text_list(leading_values)
    return NAME_WEIGHT * name_embedding + VALUE_WEIGHT * value_embedding

def compute_similarity_matrix(mediated_schema: list[str], schema_expectations: dict, source_df: pd.DataFrame) -> pd.DataFrame:
    """Build the cosine-similarity matrix between mediated attributes and source columns.

    Args:
        mediated_schema: Names of the mediated (target) attributes; they become
            the row labels of the result.
        schema_expectations: Maps an attribute name to a list of example
            values. Attributes without an entry are embedded from their name only.
        source_df: Source data. Every column is embedded from its name plus its
            values converted to strings; the columns become the column labels
            of the result.

    Returns:
        A float DataFrame of shape ``(len(mediated_schema), len(source_df.columns))``
        holding the cosine similarity of each attribute/column embedding pair.
        If either side is empty, an empty frame with the same labels is returned.
    """
    source_columns = list(source_df.columns)
    if not mediated_schema or not source_columns:
        # Nothing to compare; np.vstack below would reject an empty list.
        return pd.DataFrame(index=mediated_schema, columns=source_df.columns, dtype=float)

    # One embedding per row; row order follows the label order used for the result.
    mediated_vectors = np.vstack([
        compute_mediated_embedding(col, schema_expectations.get(col, []))
        for col in mediated_schema
    ])
    source_vectors = np.vstack([
        compute_source_embedding(col, source_df[col].astype(str).tolist())
        for col in source_columns
    ])

    # A single pairwise call replaces one 1x1 cosine_similarity call (and one
    # .loc write) per matrix cell.
    scores = cosine_similarity(mediated_vectors, source_vectors)
    return pd.DataFrame(scores, index=mediated_schema, columns=source_df.columns, dtype=float)



# Example values for each mediated attribute. compute_similarity_matrix looks
# them up by attribute name and blends their mean embedding with the
# attribute-name embedding.
schema_expectations = {
    "Patient_identfication_number": ["508", "819", "453"],
    "First_Last_Names": ["AMANDA MUNOZ", "XAVIER JORDAN", "E. Lucero"],
    "Age": ["39", "46", "48"],
    "Patient_gender": ["male", "female", "1", "0"],
    "diabetes": ["1", "0", "yes", "no"],
    "heart_disease_status": ["0", "1", "low risk", "high risk"],
    "Systolic_BP": ["106.0", "121.0", "127.5"],
    "Diaolistic_BP": ["70.0", "81.0", "80.0"],
    "bmi": ["26.97", "28.73", "25.34"],
    "cholesterol": ["195.0", "250.0", "245.0"],
    "smoker": ["1", "0", "yes", "no"],
    "medications": ["0.0", "yes", "no"]
}

# Three sample rows of the source dataset, keyed by column name. The values are
# converted to strings and embedded together with the column name.
source_data = {
    "gender": [1, 0, 1],
    "age": [39, 46, 48],
    "currentSmoker": [0, 0, 1],
    "BPMeds": [0.0, 0.0, 0.0],
    "diabetes": [0, 0, 0],
    "totChol": [195.0, 250.0, 245.0],
    "sysBP": [106.0, 121.0, 127.5],
    "diaBP": [70.0, 81.0, 80.0],
    "BMI": [26.97, 28.73, 25.34],
    "glucose": [77.0, 76.0, 70.0],
    "TenYearCHD": [0, 0, 0],
    "patient_id": [508, 819, 453],
    "name": ["AMANDA MUNOZ", "XAVIER JORDAN", "E. Lucero"]
}

# Column order of this frame determines the column order of the semantic matrix.
df = pd.DataFrame(source_data)

# Run the semantic matcher, save the matrix as CSV, and display it in the notebook.
farmingham_semantic_matrix = compute_similarity_matrix(mediated_schema, schema_expectations, df)
farmingham_semantic_matrix.to_csv("farmingham_semantic_matrix.csv", index=True)
farmingham_semantic_matrix


Unnamed: 0,gender,age,currentSmoker,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,glucose,TenYearCHD,patient_id,name
Patient_identfication_number,0.2937,0.323283,0.201935,0.380004,0.335273,0.211915,0.228585,0.195674,0.169312,0.214316,0.279965,0.856643,0.203313
First_Last_Names,0.321851,0.271553,0.267801,0.163706,0.194045,0.232194,0.176443,0.204684,0.123555,0.193372,0.211044,0.205408,0.622842
Age,0.567044,0.997145,0.286869,0.213169,0.297588,0.278754,0.221892,0.23244,0.304382,0.230556,0.544098,0.295737,0.299638
Patient_gender,0.739448,0.390051,0.272486,0.41461,0.406021,0.242119,0.274248,0.30883,0.241145,0.249307,0.383348,0.629182,0.314139
diabetes,0.353506,0.256239,0.293,0.33841,0.937296,0.273776,0.201728,0.397017,0.274681,0.629036,0.282105,0.368332,0.306229
heart_disease_status,0.354105,0.316758,0.338494,0.548794,0.514491,0.244141,0.32179,0.303735,0.252082,0.336563,0.387647,0.476151,0.251447
Systolic_BP,0.26039,0.246789,0.272478,0.56289,0.278856,0.330726,0.623232,0.417802,0.31678,0.310174,0.243548,0.401487,0.21597
Diaolistic_BP,0.343422,0.286804,0.277745,0.599332,0.445768,0.356174,0.437103,0.70158,0.378902,0.492297,0.33037,0.387826,0.299585
bmi,0.326074,0.288077,0.158575,0.306423,0.282873,0.267266,0.277023,0.31418,0.99154,0.367485,0.17833,0.150258,0.20485
cholesterol,0.277949,0.259808,0.266197,0.354302,0.457293,0.352194,0.241726,0.256168,0.344053,0.467869,0.2309,0.294218,0.174661


In [11]:
# Export the three matcher matrices as LaTeX tables (three decimals, index kept).
for _frame, _tex_path in (
    (farmingham_jaccard_matrix, "farmingham_jaccard_matrix.tex"),
    (farmingham_edit_distance_matrix, "farmingham_edit_distance_matrix.tex"),
    (farmingham_semantic_matrix, "farmingham_semantic_matrix.tex"),
):
    _frame.to_latex(_tex_path, index=True, float_format="%.3f")

In the next cell we implement two combiner algorithms that merge the results of the previous three matchers into a single new similarity matrix.

In [12]:
import pandas as pd
import numpy as np

def _stack_aligned(matrices) -> np.ndarray:
    """Stack the matrices' values on a new leading axis, aligned to the first matrix's labels."""
    if not matrices:
        raise ValueError("At least one similarity matrix is required")

    reference = matrices[0]
    layers = []
    for position, matrix in enumerate(matrices):
        same_order = matrix.index.equals(reference.index) and matrix.columns.equals(reference.columns)
        if not same_order:
            # Same labels in a different order can be realigned safely;
            # anything else cannot be combined cell by cell.
            same_labels = (
                matrix.shape == reference.shape
                and set(matrix.index) == set(reference.index)
                and set(matrix.columns) == set(reference.columns)
            )
            if not same_labels:
                raise ValueError(
                    f"Matrix {position} does not have the same row/column labels as matrix 0"
                )
            matrix = matrix.reindex(index=reference.index, columns=reference.columns)
        layers.append(matrix.values)
    return np.stack(layers, axis=0)


def combine_maximum(*matrices: pd.DataFrame) -> pd.DataFrame:
    """
    Combine multiple similarity matrices by taking the maximum value for each cell.

    Matrices are aligned on their row/column labels (using the first matrix's
    order) before combining, so a matrix whose rows or columns are ordered
    differently is still combined cell-for-cell correctly.

    Raises:
        ValueError: If no matrix is given or the matrices do not share the
            same row/column labels.
    """
    stacked = _stack_aligned(matrices)
    combined = np.max(stacked, axis=0)
    return pd.DataFrame(combined, index=matrices[0].index, columns=matrices[0].columns)


def combine_weighted_average(matrices: list[pd.DataFrame], weights: list[float]) -> pd.DataFrame:
    """
    Combine multiple similarity matrices using a weighted average.
    Weights must match the number of matrices and sum to 1.

    Matrices are aligned on their row/column labels (using the first matrix's
    order) before combining.

    Raises:
        ValueError: If the number of weights differs from the number of
            matrices, the weights do not sum to 1 (within floating-point
            tolerance), or the matrices do not share the same labels.
    """
    if len(matrices) != len(weights):
        raise ValueError("Number of weights must match number of matrices")
    if not np.isclose(sum(weights), 1.0):
        raise ValueError("Weights must sum to 1")

    stacked = _stack_aligned(matrices)
    # Contract the weight vector against the leading (matrix) axis.
    weighted = np.tensordot(weights, stacked, axes=([0], [0]))

    return pd.DataFrame(weighted, index=matrices[0].index, columns=matrices[0].columns)



# The three matcher outputs, in the order the weights below refer to:
# edit distance, Jaccard, semantic.
similarity_matrices = [
    farmingham_edit_distance_matrix,
    farmingham_jaccard_matrix,
    farmingham_semantic_matrix,
]

# Cell-wise maximum across the three matchers.
max_combined = combine_maximum(*similarity_matrices)
print("Maximum Combined Matrix:")

# Provisional weights, to be tuned later; they have to add up to 1.
weights = [0.2, 0.2, 0.6]
weighted_combined = combine_weighted_average(similarity_matrices, weights)
print("\nWeighted Average Combined Matrix:")

# Write both combined matrices to CSV.
max_combined.to_csv("combined_maximum_matrix.csv")
weighted_combined.to_csv("combined_weighted_average_matrix.csv")


Maximum Combined Matrix:

Weighted Average Combined Matrix:


In [13]:
# Export both combined matrices as LaTeX tables (three decimals, index kept).
for _frame, _tex_path in (
    (max_combined, "max_combined.tex"),
    (weighted_combined, "weighted_combined.tex"),
):
    _frame.to_latex(_tex_path, index=True, float_format="%.3f")


```
Combiner strategy : Weighted Average
Similarity Matrix:
```




In [20]:
# Notebook display: show the weighted-average combined similarity matrix.
weighted_combined

Unnamed: 0,gender,age,currentSmoker,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,glucose,TenYearCHD,patient_id,name
Patient_identfication_number,0.26202,0.23317,0.258161,0.304602,0.304764,0.183749,0.160751,0.205404,0.141587,0.194389,0.278979,0.684786,0.203988
First_Last_Names,0.251711,0.221332,0.27828,0.153424,0.243427,0.192916,0.145066,0.17641,0.107533,0.181023,0.205626,0.253245,0.484105
Age,0.440226,0.931687,0.204122,0.186301,0.253553,0.167252,0.133135,0.168064,0.182629,0.216934,0.386459,0.241842,0.309783
Patient_gender,0.629469,0.336831,0.284892,0.323566,0.369812,0.188071,0.179949,0.272298,0.161287,0.190384,0.351009,0.651709,0.271683
diabetes,0.277103,0.253744,0.2648,0.341846,0.962377,0.182466,0.165437,0.41321,0.214809,0.435822,0.255863,0.360999,0.278137
heart_disease_status,0.292463,0.246455,0.289697,0.395476,0.477695,0.197284,0.228474,0.262241,0.167849,0.264738,0.354988,0.435691,0.204268
Systolic_BP,0.156234,0.148073,0.213487,0.383934,0.228914,0.307635,0.526739,0.355281,0.226468,0.284105,0.183729,0.334492,0.129582
Diaolistic_BP,0.219453,0.202882,0.213647,0.436599,0.413661,0.311104,0.358462,0.573548,0.260741,0.383378,0.248222,0.371296,0.209351
bmi,0.195645,0.172846,0.127145,0.241054,0.244724,0.16036,0.199614,0.255108,0.794924,0.220491,0.106998,0.130155,0.20631
cholesterol,0.257769,0.194085,0.321318,0.264181,0.378976,0.409116,0.181436,0.153701,0.206432,0.417121,0.24014,0.241531,0.141197





```
Combiner strategy : Maximum
Similarity Matrix:
```




In [21]:
# Notebook display: show the maximum-combined similarity matrix.
max_combined

Unnamed: 0,gender,age,currentSmoker,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,glucose,TenYearCHD,patient_id,name
Patient_identfication_number,0.2937,0.323283,0.471,0.380004,0.375,0.211915,0.228585,0.333,0.2,0.222,0.412,0.856643,0.267
First_Last_Names,0.321851,0.271553,0.4,0.214,0.385,0.232194,0.176443,0.204684,0.167,0.2,0.333,0.462,0.622842
Age,0.567044,1.0,0.286869,0.213169,0.297588,0.278754,0.221892,0.23244,0.304382,0.25,0.544098,0.295737,0.4
Patient_gender,0.739448,0.390051,0.357,0.41461,0.417,0.242119,0.274248,0.364,0.241145,0.249307,0.462,0.8,0.314139
diabetes,0.353506,0.256239,0.293,0.444,1.0,0.273776,0.222,0.5,0.274681,0.629036,0.333,0.5,0.306229
heart_disease_status,0.354105,0.316758,0.338494,0.548794,0.545,0.244141,0.32179,0.303735,0.252082,0.336563,0.462,0.5,0.251447
Systolic_BP,0.26039,0.246789,0.272478,0.56289,0.308,0.364,0.623232,0.417802,0.31678,0.310174,0.243548,0.401487,0.21597
Diaolistic_BP,0.343422,0.286804,0.277745,0.599332,0.5,0.356174,0.437103,0.70158,0.378902,0.492297,0.33037,0.462,0.299585
bmi,0.326074,0.288077,0.158575,0.306423,0.282873,0.267266,0.277023,0.333,1.0,0.367485,0.17833,0.150258,0.25
cholesterol,0.277949,0.259808,0.5,0.354302,0.457293,0.625,0.241726,0.256168,0.344053,0.5,0.417,0.294218,0.174661




```
Ground-truth matrix used as the reference for computing TP, FP, and FN.

```

```
compute_quality_metrics is used to compute the quality measures

```



In [15]:

# Source columns, in the order used by the similarity matrices.
_ground_truth_columns = [
    "gender", "age", "currentSmoker", "BPMeds", "diabetes",
    "totChol", "sysBP", "diaBP", "BMI", "glucose",
    "TenYearCHD", "patient_id", "name",
]

# The single correct source column for every mediated attribute; key order
# gives the row order of the ground-truth matrix.
_expected_matches = {
    "Patient_identfication_number": "patient_id",
    "First_Last_Names": "name",
    "Age": "age",
    "Patient_gender": "gender",
    "diabetes": "diabetes",
    "heart_disease_status": "TenYearCHD",
    "Systolic_BP": "sysBP",
    "Diaolistic_BP": "diaBP",
    "bmi": "BMI",
    "cholesterol": "totChol",
    "smoker": "currentSmoker",
    "medications": "BPMeds",
}

# Binary matrix: 1 where the column is the attribute's correct match, else 0.
ground_truth = pd.DataFrame(
    [
        [int(column == target) for column in _ground_truth_columns]
        for target in _expected_matches.values()
    ],
    index=list(_expected_matches),
    columns=_ground_truth_columns,
)


def compute_quality_metrics(predicted: pd.DataFrame, ground_truth: pd.DataFrame) -> dict[str, float]:
    """Calculate TP, FP, FN, precision, recall, and F1, print them, and return them.

    Both matrices are compared cell by cell (by position): a cell counts as a
    predicted/true match when its value equals 1 and as a non-match when it
    equals 0.

    Args:
        predicted: Binary selection matrix produced by a selector.
        ground_truth: Binary reference matrix of the same shape.

    Returns:
        A dict with keys ``"TP"``, ``"FP"``, ``"FN"``, ``"precision"``,
        ``"recall"`` and ``"f1"``. A ratio whose denominator is zero is
        reported as 0.

    Raises:
        ValueError: If the two matrices do not have the same shape.
    """
    if predicted.shape != ground_truth.shape:
        raise ValueError("Matrices must be the same shape")

    predicted_values = predicted.values
    truth_values = ground_truth.values

    TP = int(np.sum((predicted_values == 1) & (truth_values == 1)))
    FP = int(np.sum((predicted_values == 1) & (truth_values == 0)))
    FN = int(np.sum((predicted_values == 0) & (truth_values == 1)))

    # Guard each ratio against an empty denominator instead of dividing by zero.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    print(f"TP: {TP}")
    print(f"FP: {FP}")
    print(f"FN: {FN}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return {
        "TP": TP,
        "FP": FP,
        "FN": FN,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }



In [16]:
def one_to_one_selector(similarity_matrix: pd.DataFrame) -> pd.DataFrame:
    """
    Greedily pick a one-to-one matching from a similarity matrix.

    All cells are visited from the highest score to the lowest. A cell is
    selected when neither its row nor its column has been matched yet, so every
    row and every column is used at most once. Ties keep row-major order, and
    NaN scores are never selected.

    Args:
        similarity_matrix: Scores with mediated attributes as rows and source
            columns as columns.

    Returns:
        A DataFrame with the same labels holding 1 for each selected pair and
        0 elsewhere. When there are more rows than columns (or some scores are
        NaN), some rows may stay all-zero.
    """
    selected = pd.DataFrame(0, index=similarity_matrix.index, columns=similarity_matrix.columns)
    scores = similarity_matrix.to_numpy(dtype=float)

    # Row-major list of (row position, column position, score). NaN is dropped
    # because it has no defined sort order.
    candidates = [
        (row_pos, col_pos, score)
        for (row_pos, col_pos), score in np.ndenumerate(scores)
        if not np.isnan(score)
    ]
    # list.sort is stable, so equal scores keep their row-major order.
    candidates.sort(key=lambda entry: entry[2], reverse=True)

    used_rows = set()
    used_columns = set()
    max_matches = min(scores.shape)
    for row_pos, col_pos, _ in candidates:
        if len(used_rows) == max_matches:
            break  # every row or every column is already matched
        if row_pos in used_rows or col_pos in used_columns:
            continue
        selected.iloc[row_pos, col_pos] = 1
        used_rows.add(row_pos)
        used_columns.add(col_pos)

    return selected

# A previously saved combined matrix could be loaded here instead, e.g. with
# pd.read_csv(<path>, index_col=0).

# Apply the one-to-one selector to both combined matrices.
max_one_to_one_selected = one_to_one_selector(max_combined)
weighted_one_to_one_selected = one_to_one_selector(weighted_combined)

# Write both selections to CSV.
for _selection, _csv_path in (
    (max_one_to_one_selected, "max_one_to_one_selected.csv"),
    (weighted_one_to_one_selected, "weighted_one_to_one_selected.csv"),
):
    _selection.to_csv(_csv_path)

# Report quality against the ground truth, one section per combiner.
for _banner, _selection in (
    ("Results for max combinors with one to one selector:", max_one_to_one_selected),
    ("Results for weighted average combinors with one to one selector:", weighted_one_to_one_selected),
):
    print(_banner)
    compute_quality_metrics(_selection, ground_truth)
    print("\n\n\n")

Results for max combinors with one to one selector:
TP: 10
FP: 2
FN: 2
Precision: 0.8333
Recall: 0.8333
F1 Score: 0.8333




Results for weighted average combinors with one to one selector:
TP: 9
FP: 3
FN: 3
Precision: 0.7500
Recall: 0.7500
F1 Score: 0.7500






In [24]:
# Notebook display: show the one-to-one selection from the weighted-average matrix.
weighted_one_to_one_selected

Unnamed: 0,gender,age,currentSmoker,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,glucose,TenYearCHD,patient_id,name
Patient_identfication_number,0,0,0,0,0,0,0,0,0,0,0,1,0
First_Last_Names,0,0,0,0,0,0,0,0,0,0,0,0,1
Age,0,1,0,0,0,0,0,0,0,0,0,0,0
Patient_gender,1,0,0,0,0,0,0,0,0,0,0,0,0
diabetes,0,0,0,0,1,0,0,0,0,0,0,0,0
heart_disease_status,0,0,0,1,0,0,0,0,0,0,0,0,0
Systolic_BP,0,0,0,0,0,0,1,0,0,0,0,0,0
Diaolistic_BP,0,0,0,0,0,0,0,1,0,0,0,0,0
bmi,0,0,0,0,0,0,0,0,1,0,0,0,0
cholesterol,0,0,0,0,0,0,0,0,0,1,0,0,0


In [25]:
# Notebook display: show the one-to-one selection from the maximum-combined matrix.
max_one_to_one_selected

Unnamed: 0,gender,age,currentSmoker,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,glucose,TenYearCHD,patient_id,name
Patient_identfication_number,0,0,0,0,0,0,0,0,0,0,0,1,0
First_Last_Names,0,0,0,0,0,0,0,0,0,0,0,0,1
Age,0,1,0,0,0,0,0,0,0,0,0,0,0
Patient_gender,1,0,0,0,0,0,0,0,0,0,0,0,0
diabetes,0,0,0,0,1,0,0,0,0,0,0,0,0
heart_disease_status,0,0,0,1,0,0,0,0,0,0,0,0,0
Systolic_BP,0,0,0,0,0,0,1,0,0,0,0,0,0
Diaolistic_BP,0,0,0,0,0,0,0,1,0,0,0,0,0
bmi,0,0,0,0,0,0,0,0,1,0,0,0,0
cholesterol,0,0,0,0,0,1,0,0,0,0,0,0,0


In [17]:
# Export both selections and the ground truth as LaTeX tables.
for _frame, _tex_path in (
    (weighted_one_to_one_selected, "weighted_one_to_one_selected.tex"),
    (max_one_to_one_selected, "max_one_to_one_selected.tex"),
    (ground_truth, "ground_truth.tex"),
):
    _frame.to_latex(_tex_path, index=True, float_format="%.3f")