# Distance Matrices
Pre-calculate all of the distance matrices for the different datasets. Then drop in replacements.

In [1]:
import os
import json
import numpy as np
import gower
from datasets import load_from_disk
import tqdm
from sentence_transformers import SentenceTransformer

import sys
sys.path.insert(0, "../../src/")
sys.path.insert(0, "../..")
from config import REPO_ROOT

In [5]:
# MAD
def MAD(data: list) -> float:
    """
    Compute the Median Absolute Deviation (MAD) of a list of numbers.
    """
    med = np.nanmedian(data)
    deviations_list = [np.abs((x-med)) for x in data]
    return float(np.nanmedian(deviations_list))

# std
def std(data: list) -> float:
    return np.nanstd(data)

# L1
def l1(x_1: float, x_2: float) -> float:
    """
    Scalar L1 distance
    """
    return np.abs(x_1-x_2)

# L2_squared
def l2_squared(x_1: float, x_2: float) -> float:
    """
    Scalar L1 distance
    """
    return (x_1-x_2)**2

# Gower

In [6]:
# 1. Define your datasets and the columns you want for each
datasets = ['income', 'house_prices', 'heart_disease']
columns  = [
    ['AGEP', 'SCHL'],
    ['area', 'bedrooms', 'bathrooms', 'floors'],
    ['AGEP', 'SEX', 'systolic_bp', 'total_cholesterol']
]

# 2. Load the datasets metadata once
with open(REPO_ROOT / 'src/models_datasets/datasets.json', 'r') as f:
    datasets_dict = json.load(f)

# 3. Loop through each (name, cols) pair
for d_name, cols in zip(datasets, columns):
    ds_info = datasets_dict[d_name]
    ds  = load_from_disk(ds_info['filepath'])
    
    # 4. Pull out just the columns you need and convert to pandas
    df = ds.select_columns(cols).to_pandas()
    
    # 5. Convert dtype: everything but SEX → float64, SEX → object
    num_cols = [c for c in cols if c != 'SEX']
    df[num_cols] = df[num_cols].astype('float64')
    if 'SEX' in cols:
        df['SEX'] = df['SEX'].astype('object')
    
    # 6. Compute Gower on exactly these columns
    print(f"calculating Gower matrix for {d_name}…")
    gower_mat = gower.gower_matrix(df)
    print("Done.")
    
    # 7. Ensure the output directory exists, then save
    out_dir = REPO_ROOT / f'src/distance_matrices/{d_name}'
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f'{d_name}_gower.npy')
    np.save(out_path, gower_mat)
    print(f"Saved to {out_path}\n")

calculating Gower matrix for income…
Done.
Saved to REPO_ROOT/src/distance_matrices/income/income_gower.npy

calculating Gower matrix for house_prices…
Done.
Saved to REPO_ROOT/src/distance_matrices/house_prices/house_prices_gower.npy

calculating Gower matrix for heart_disease…
Done.
Saved to REPO_ROOT/src/distance_matrices/heart_disease/heart_disease_gower.npy



# L1 weighed by MAD

In [7]:
#### FOLKTEXTS

datasets = ['income']
columns  = [['AGEP', 'SCHL']]

with open(REPO_ROOT / 'src/models_datasets/datasets.json', 'r') as f:
    datasets_dict = json.load(f)

# 3. Loop through each (name, cols) pair
for d_name, cols in zip(datasets, columns):
    ds_info = datasets_dict[d_name]
    print(d_name, cols)
    ds  = load_from_disk(ds_info['filepath'])

    # 4. Pull out just the columns you need and convert to pandas
    df = ds.select_columns(cols).to_pandas()

    # get MADs
    MADs = {}
    for col in cols:
        MADs.update({col:MAD(df[col])})

    # calculate matrix in a really terrible way

    # init
    l1_MAD = np.zeros((len(df), len(df)))

    for index_1, row_1 in tqdm.tqdm(df.iterrows()):
        for index_2, row_2 in df.iterrows():

            age_dist = l1(row_1['AGEP'], row_2['AGEP'])/MADs['AGEP']
            sch_dist = l1(row_1['SCHL'], row_2['SCHL'])/MADs['SCHL']
            l1_MAD[index_1, index_2] = age_dist+sch_dist # sum across all features

    # distance matriz
    l1_MAD

    # 7. Ensure the output directory exists, then save
    out_dir = REPO_ROOT / f'src/distance_matrices/{d_name}'
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f'{d_name}_l1.npy')
    np.save(out_path, l1_MAD)
    print(f"Saved to {out_path}\n")

income ['AGEP', 'SCHL']


1920it [01:01, 31.17it/s]

Saved to REPO_ROOT/src/distance_matrices/income/income_l1.npy






In [8]:
#### house prices

datasets = ['house_prices']
columns  = [['area', 'bedrooms', 'bathrooms', 'floors']]

with open(REPO_ROOT / 'src/models_datasets/datasets.json', 'r') as f:
    datasets_dict = json.load(f)

# 3. Loop through each (name, cols) pair
for d_name, cols in zip(datasets, columns):
    ds_info = datasets_dict[d_name]
    print(d_name, cols)
    ds  = load_from_disk(ds_info['filepath'])

    # 4. Pull out just the columns you need and convert to pandas
    df = ds.select_columns(cols).to_pandas()

    # get MADs
    MADs = {}
    for col in cols:
        MADs.update({col:MAD(df[col])})

    # calculate matrix in a really terrible way

    # init
    l1_MAD = np.zeros((len(df), len(df)))

    for index_1, row_1 in tqdm.tqdm(df.iterrows()):
        for index_2, row_2 in df.iterrows():

            dist_1 = l1(row_1['area'], row_2['area'])/MADs['area']
            dist_2 = l1(row_1['bedrooms'], row_2['bedrooms'])/MADs['bedrooms']
            dist_3 = l1(row_1['bathrooms'], row_2['bathrooms'])/MADs['bathrooms']
            dist_4 = l1(row_1['floors'], row_2['floors'])/MADs['floors']

            l1_MAD[index_1, index_2] = dist_1+dist_2+dist_3+dist_4 # sum across all features

    # distance matriz
    l1_MAD

    # 7. Ensure the output directory exists, then save
    out_dir = REPO_ROOT / f'src/distance_matrices/{d_name}'
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f'{d_name}_l1.npy')
    np.save(out_path, l1_MAD)
    print(f"Saved to {out_path}\n")

house_prices ['area', 'bedrooms', 'bathrooms', 'floors']


39it [00:01, 26.13it/s]

1600it [01:01, 25.88it/s]

Saved to REPO_ROOT/src/distance_matrices/house_prices/house_prices_l1.npy






In [9]:
#### heart disease

datasets = ['heart_disease']
columns  = [['AGEP', 'SEX', 'systolic_bp', 'total_cholesterol']]

with open(REPO_ROOT / 'src/models_datasets/datasets.json', 'r') as f:
    datasets_dict = json.load(f)

# 3. Loop through each (name, cols) pair
for d_name, cols in zip(datasets, columns):
    ds_info = datasets_dict[d_name]
    print(d_name, cols)
    ds  = load_from_disk(ds_info['filepath'])

    # 4. Pull out just the columns you need and convert to pandas
    df = ds.select_columns(cols).to_pandas()

    # get MADs
    MADs = {}
    for col in cols:
        MADs.update({col:MAD(df[col])})

    # calculate matrix in a really terrible way

    # init
    l1_MAD = np.zeros((len(df), len(df)))

    for index_1, row_1 in tqdm.tqdm(df.iterrows()):
        for index_2, row_2 in df.iterrows():

            dist_1 = l1(row_1['AGEP'], row_2['AGEP'])/MADs['AGEP']
            dist_2 = l1(row_1['SEX'], row_2['SEX'])/MADs['SEX']
            dist_3 = l1(row_1['systolic_bp'], row_2['systolic_bp'])/MADs['systolic_bp']
            dist_4 = l1(row_1['total_cholesterol'], row_2['total_cholesterol'])/MADs['total_cholesterol']

            l1_MAD[index_1, index_2] = dist_1+dist_2+dist_3+dist_4 # sum across all features

    # distance matriz
    l1_MAD

    # 7. Ensure the output directory exists, then save
    out_dir = REPO_ROOT / f'src/distance_matrices/{d_name}'
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f'{d_name}_l1.npy')
    np.save(out_path, l1_MAD)
    print(f"Saved to {out_path}\n")

heart_disease ['AGEP', 'SEX', 'systolic_bp', 'total_cholesterol']


0it [00:00, ?it/s]

1936it [01:29, 21.54it/s]


Saved to REPO_ROOT/src/distance_matrices/heart_disease/heart_disease_l1.npy



# L2 squared weighted by standard deviations

In [10]:
#### FOLKTEXTS

datasets = ['income']
columns  = [['AGEP', 'SCHL']]

with open(REPO_ROOT / 'src/models_datasets/datasets.json', 'r') as f:
    datasets_dict = json.load(f)

# 3. Loop through each (name, cols) pair
for d_name, cols in zip(datasets, columns):
    ds_info = datasets_dict[d_name]
    print(d_name, cols)
    ds  = load_from_disk(ds_info['filepath'])

    # 4. Pull out just the columns you need and convert to pandas
    df = ds.select_columns(cols).to_pandas()

    # get MADs
    STDs = {}
    for col in cols:
        STDs.update({col:std(df[col])})

    # calculate matrix in a really terrible way

    # init
    l2_STD = np.zeros((len(df), len(df)))

    for index_1, row_1 in tqdm.tqdm(df.iterrows()):
        for index_2, row_2 in df.iterrows():

            age_dist = l2_squared(row_1['AGEP'], row_2['AGEP'])/STDs['AGEP']
            sch_dist = l2_squared(row_1['SCHL'], row_2['SCHL'])/STDs['SCHL']

            l2_STD[index_1, index_2] = age_dist+sch_dist # sum across all features

    # distance matriz
    l2_STD

    # 7. Ensure the output directory exists, then save
    out_dir = REPO_ROOT / f'src/distance_matrices/{d_name}'
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f'{d_name}_l2.npy')
    np.save(out_path, l2_STD)
    print(f"Saved to {out_path}\n")

income ['AGEP', 'SCHL']


0it [00:00, ?it/s]

1920it [00:47, 40.15it/s]

Saved to REPO_ROOT/src/distance_matrices/income/income_l2.npy






In [11]:
#### house prices

datasets = ['house_prices']
columns  = [['area', 'bedrooms', 'bathrooms', 'floors']]

with open(REPO_ROOT / 'src/models_datasets/datasets.json', 'r') as f:
    datasets_dict = json.load(f)

# 3. Loop through each (name, cols) pair
for d_name, cols in zip(datasets, columns):
    ds_info = datasets_dict[d_name]
    print(d_name, cols)
    ds  = load_from_disk(ds_info['filepath'])

    # 4. Pull out just the columns you need and convert to pandas
    df = ds.select_columns(cols).to_pandas()

    # get MADs
    STDs = {}
    for col in cols:
        STDs.update({col:std(df[col])})

    # calculate matrix in a really terrible way

    # init
    l2_STD = np.zeros((len(df), len(df)))

    for index_1, row_1 in tqdm.tqdm(df.iterrows()):
        for index_2, row_2 in df.iterrows():

            dist_1 = l2_squared(row_1['area'], row_2['area'])/STDs['area']
            dist_2 = l2_squared(row_1['bedrooms'], row_2['bedrooms'])/STDs['bedrooms']
            dist_3 = l2_squared(row_1['bathrooms'], row_2['bathrooms'])/STDs['bathrooms']
            dist_4 = l2_squared(row_1['floors'], row_2['floors'])/STDs['floors']

            l2_STD[index_1, index_2] = dist_1+dist_2+dist_3+dist_4 # sum across all features

    # distance matriz
    l2_STD

    # 7. Ensure the output directory exists, then save
    out_dir = REPO_ROOT / f'src/distance_matrices/{d_name}'
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f'{d_name}_l2.npy')
    np.save(out_path, l2_STD)
    print(f"Saved to {out_path}\n")

house_prices ['area', 'bedrooms', 'bathrooms', 'floors']


1600it [00:44, 36.22it/s]

Saved to REPO_ROOT/src/distance_matrices/house_prices/house_prices_l2.npy






In [12]:
#### heart disease

datasets = ['heart_disease']
columns  = [['AGEP', 'SEX', 'systolic_bp', 'total_cholesterol']]

with open(REPO_ROOT / 'src/models_datasets/datasets.json', 'r') as f:
    datasets_dict = json.load(f)

# 3. Loop through each (name, cols) pair
for d_name, cols in zip(datasets, columns):
    ds_info = datasets_dict[d_name]
    print(d_name, cols)
    ds  = load_from_disk(ds_info['filepath'])

    # 4. Pull out just the columns you need and convert to pandas
    df = ds.select_columns(cols).to_pandas()

    # get MADs
    STDs = {}
    for col in cols:
        STDs.update({col:std(df[col])})

    # calculate matrix in a really terrible way

    # init
    l2_STD = np.zeros((len(df), len(df)))

    for index_1, row_1 in tqdm.tqdm(df.iterrows()):
        for index_2, row_2 in df.iterrows():

            dist_1 = l2_squared(row_1['AGEP'], row_2['AGEP'])/STDs['AGEP']
            dist_2 = l2_squared(row_1['SEX'], row_2['SEX'])/STDs['SEX']
            dist_3 = l2_squared(row_1['systolic_bp'], row_2['systolic_bp'])/STDs['systolic_bp']
            dist_4 = l2_squared(row_1['total_cholesterol'], row_2['total_cholesterol'])/STDs['total_cholesterol']

            l2_STD[index_1, index_2] = dist_1+dist_2+dist_3+dist_4 # sum across all features

    # distance matriz
    l2_STD

    # 7. Ensure the output directory exists, then save
    out_dir = REPO_ROOT / f'src/distance_matrices/{d_name}'
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f'{d_name}_l2.npy')
    np.save(out_path, l2_STD)
    print(f"Saved to {out_path}\n")

heart_disease ['AGEP', 'SEX', 'systolic_bp', 'total_cholesterol']


1936it [01:02, 30.74it/s]

Saved to REPO_ROOT/src/distance_matrices/heart_disease/heart_disease_l2.npy






# Semantic distance

In [56]:
# init model
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

education = {
        'N/A - no schooling completed': 1,
        'Nursery school / preschool': 2,
        'Kindergarten': 3,
        '1st grade only': 4,
        '2nd grade': 5,
        '3rd grade': 6,
        '4th grade': 7,
        '5th grade': 8,
        '6th grade': 9,
        '7th grade': 10,
        '8th grade': 11,
        '9th grade': 12,
        '10th grade': 13,
        '11th grade': 14,
        '12th grade, no diploma': 15,
        'Regular high school diploma': 16,
        'GED or alternative credential': 17,
        'Some college, less than 1 year': 18,
        'Some college, 1 or more years, no degree': 19,
        "Associate's degree": 20,
        "Bachelor's degree": 21,
        "Master's degree": 22,
        "Professional degree beyond a bachelor's degree": 23,
        'Doctorate degree': 24
    }

num2edu = {v:k for k,v in education.items()}

def cos_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [57]:
datasets = ['income']
columns  = [['AGEP', 'SCHL']]

with open(REPO_ROOT / 'src/models_datasets/datasets.json', 'r') as f:
    datasets_dict = json.load(f)

# 3. Loop through each (name, cols) pair
for d_name, cols in zip(datasets, columns):
    ds_info = datasets_dict[d_name]
    print(d_name, cols)
    ds  = load_from_disk(ds_info['filepath'])

    # 4. Pull out just the columns you need and convert to pandas
    df = ds.select_columns(cols).to_pandas()

    prompts = []
    for index_1, row_1 in tqdm.tqdm(df.iterrows()):

        prompt = (f"""You will be provided data corresponding to a survey respondent. The survey was conducted among US residents in 2018.

The respondent data is:
The age is: {row_1['AGEP']} years old.
The highest educational attainment is: {num2edu[row_1['SCHL']]}.
""")

        prompts.append(prompt)

    # append prompts
    df['prompts'] = prompts

    # get embeddings
    embeddings = model.encode(prompts) #1920, 768
    #df['embeddings'] = embeddings

    # init
    semantic = np.zeros((len(df), len(df)))

    for index_1, row_1 in tqdm.tqdm(df.iterrows()):
        for index_2, row_2 in df.iterrows():
            semantic[index_1, index_2] = 1 - cos_sim(embeddings[index_1], embeddings[index_2]) # 1 - cossim

    # 7. Ensure the output directory exists, then save
    out_dir = REPO_ROOT / f'src/distance_matrices/{d_name}'
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f'{d_name}_semantic.npy')
    np.save(out_path, semantic)
    print(f"Saved to {out_path}\n")

income ['AGEP', 'SCHL']


1920it [00:00, 90447.28it/s]


1920it [01:05, 29.36it/s]

Saved to REPO_ROOT/src/distance_matrices/income/income_semantic.npy






In [58]:
datasets = ['house_prices']
columns  = [['area', 'bedrooms', 'bathrooms', 'floors']]

with open(REPO_ROOT / 'src/models_datasets/datasets.json', 'r') as f:
    datasets_dict = json.load(f)

# 3. Loop through each (name, cols) pair
for d_name, cols in zip(datasets, columns):
    ds_info = datasets_dict[d_name]
    print(d_name, cols)
    ds  = load_from_disk(ds_info['filepath'])

    # 4. Pull out just the columns you need and convert to pandas
    df = ds.select_columns(cols).to_pandas()

    prompts = []
    for index_1, row_1 in tqdm.tqdm(df.iterrows()):

        prompt = (f"""You will be provided with data about a house. The data was collected in 2015 from across the United States.

The house data is:
The size of the house (sq ft) is: {row_1['area']}.
The number of bedrooms is: {row_1['bedrooms']}.
The number of bathrooms is: {row_1['bathrooms']}.
The number of floors is: {row_1['floors']}.
""")

        prompts.append(prompt)

    # append prompts
    df['prompts'] = prompts

    # get embeddings
    embeddings = model.encode(prompts) 

    # init
    semantic = np.zeros((len(df), len(df)))

    for index_1, row_1 in tqdm.tqdm(df.iterrows()):
        for index_2, row_2 in df.iterrows():
            semantic[index_1, index_2] = 1 - cos_sim(embeddings[index_1], embeddings[index_2]) # 1 - cossim

    # 7. Ensure the output directory exists, then save
    out_dir = REPO_ROOT / f'src/distance_matrices/{d_name}'
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f'{d_name}_semantic.npy')
    np.save(out_path, semantic)
    print(f"Saved to {out_path}\n")

house_prices ['area', 'bedrooms', 'bathrooms', 'floors']


1600it [00:00, 75030.59it/s]


1600it [00:45, 34.82it/s]

Saved to REPO_ROOT/src/distance_matrices/house_prices/house_prices_semantic.npy






In [59]:
sex_d = {1:"Male", 2:"Female"}

datasets = ['heart_disease']
columns  = [['AGEP', 'SEX', 'systolic_bp', 'total_cholesterol']]

with open(REPO_ROOT / 'src/models_datasets/datasets.json', 'r') as f:
    datasets_dict = json.load(f)

# 3. Loop through each (name, cols) pair
for d_name, cols in zip(datasets, columns):
    ds_info = datasets_dict[d_name]
    print(d_name, cols)
    ds  = load_from_disk(ds_info['filepath'])

    # 4. Pull out just the columns you need and convert to pandas
    df = ds.select_columns(cols).to_pandas()

    prompts = []
    for index_1, row_1 in tqdm.tqdm(df.iterrows()):

        prompt = (f"""You will be provided with clinical data for an individual. The data was collected in 2015 from across the United States.

The patient data is:
The age is: {row_1['AGEP']} years.
The sex is: {sex_d[row_1['SEX']]}.
The systolic blood pressure (mmHg) is: {row_1['systolic_bp']}.
The total cholesterol (mg/dL) is: {row_1['total_cholesterol']}.
""")
        prompts.append(prompt)

    # append prompts
    df['prompts'] = prompts

    # get embeddings
    embeddings = model.encode(prompts) 

    # init
    semantic = np.zeros((len(df), len(df)))

    for index_1, row_1 in tqdm.tqdm(df.iterrows()):
        for index_2, row_2 in df.iterrows():
            semantic[index_1, index_2] = 1 - cos_sim(embeddings[index_1], embeddings[index_2]) # 1 - cossim

    # 7. Ensure the output directory exists, then save
    out_dir = REPO_ROOT / f'src/distance_matrices/{d_name}'
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f'{d_name}_semantic.npy')
    np.save(out_path, semantic)
    print(f"Saved to {out_path}\n")

heart_disease ['AGEP', 'SEX', 'systolic_bp', 'total_cholesterol']


1936it [00:00, 75560.39it/s]


1936it [01:05, 29.73it/s]

Saved to REPO_ROOT/src/distance_matrices/heart_disease/heart_disease_semantic.npy






: 