In [None]:
import pandas as pd
import missingno as msno
from pathlib import Path
import json
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Root directory of the phenotype tables; later cells build file paths from it.
path_data = Path('/Datasets/BrainBlocks/DerivativeDatasets/HBN/brainblocks-0.1.0/phenotype/')

# Read target_dictionary.txt into a DataFrame and show it as the cell output.
variables_file = path_data.joinpath('target_dictionary.txt')
df_dic = pd.read_csv(filepath_or_buffer=variables_file)
df_dic

In [None]:
# One comma-separated phenotype file per split, all under path_data.
training_file = path_data.joinpath("phenotype_measures_separate_training.txt")
testing_file = path_data.joinpath("phenotype_measures_separate_testing.txt")
validation_file = path_data.joinpath("phenotype_measures_separate_validation.txt")

# Read the three files (training, testing, validation order) into DataFrames.
training_data, testing_data, validation_data = (
    pd.read_csv(split_file, sep=',')
    for split_file in (training_file, testing_file, validation_file)
)

# Counts per sex_at_birth code for each split.
# 1: for females, 2 for males
print(
    f"train sex ratio: {training_data['sex_at_birth'].value_counts()}",
    f"val sex ratio: {validation_data['sex_at_birth'].value_counts()}",
    f"test sex ratio: {testing_data['sex_at_birth'].value_counts()}",
    sep="\n",
)

# Mean +- standard deviation of age_at_scan for each split.
print(
    f"train age: {training_data['age_at_scan'].mean():.2f} +- {training_data['age_at_scan'].std():.2f}",
    f"val age: {validation_data['age_at_scan'].mean():.2f} +- {validation_data['age_at_scan'].std():.2f}",
    f"test age: {testing_data['age_at_scan'].mean():.2f} +- {testing_data['age_at_scan'].std():.2f}",
    sep="\n",
)

## After imputation (also invalid WIAT)

In [None]:
# Re-point path_data at the phenotype-WIAT folder; the loading cell that
# follows builds its file paths from this variable.
path_data = Path('/Datasets/BrainBlocks/DerivativeDatasets/HBN/brainblocks-0.1.0/phenotype-WIAT/')


In [None]:

# Imputed, z-scored numeric phenotype files: one per split, under path_data.
training_file = path_data.joinpath("phenotype_measures_imputation-imputed_numeric_training_zscored.txt")
testing_file = path_data.joinpath("phenotype_measures_imputation-imputed_numeric_testing_zscored.txt")
validation_file = path_data.joinpath("phenotype_measures_imputation-imputed_numeric_validation_zscored.txt")

# Read the three files (training, testing, validation order) into DataFrames.
training_data, testing_data, validation_data = (
    pd.read_csv(split_file, sep=',')
    for split_file in (training_file, testing_file, validation_file)
)

# Stack the splits row-wise (training, then testing, then validation) and
# renumber the index from 0.
combined_data = pd.concat(
    (training_data, testing_data, validation_data),
    ignore_index=True,
)


In [None]:
# Show the concatenated training + testing + validation table as the cell output.
combined_data

In [None]:
# Report the DataFrame shape of each split, in training, testing, validation order.
print(f"Data shape: {training_data.shape}, {testing_data.shape}, {validation_data.shape}")

In [None]:
# Pairwise correlation between the columns of the combined table.
pairwise_correlation = combined_data.corr()

# Explicit figure/axes so the heatmap and its title are tied to this figure.
fig, ax = plt.subplots(figsize=(12, 8))

# Correlations lie in [-1, 1]. Pin the colour scale to that range and centre
# the diverging RdBu_r map on 0 so that white means "no correlation"; if the
# limits are inferred from the data instead, the midpoint drifts away from
# zero and heatmaps of different datasets are not comparable.
sns.heatmap(
    pairwise_correlation,
    cmap="RdBu_r",
    vmin=-1,
    vmax=1,
    center=0,
    annot=False,
    linewidths=.5,
    ax=ax,
)

ax.set_title("Pairwise Correlation Heatmap")
plt.show()


## After imputation (select subset of variables)

In [None]:
# Point path_data back at the main phenotype folder (an earlier cell set it to
# phenotype-WIAT); the loading cells below build their file paths from it.
path_data = Path('/Datasets/BrainBlocks/DerivativeDatasets/HBN/brainblocks-0.1.0/phenotype/')


In [None]:
# Check demographics of the data
# One comma-separated phenotype file per split, all under path_data.
training_file = path_data.joinpath("phenotype_measures_separate_training.txt")
testing_file = path_data.joinpath("phenotype_measures_separate_testing.txt")
validation_file = path_data.joinpath("phenotype_measures_separate_validation.txt")

# Read the three files (training, testing, validation order) into DataFrames.
training_data, testing_data, validation_data = (
    pd.read_csv(split_file, sep=',')
    for split_file in (training_file, testing_file, validation_file)
)

# Training split summary: table shape, age_at_scan mean +- std, and
# sex_at_birth counts.
print(
    f"Training size: {training_data.shape}",
    f"Training age: {training_data['age_at_scan'].mean():.2f} +- {training_data['age_at_scan'].std():.2f}",
    f"Training sex: {training_data['sex_at_birth'].value_counts()}",
    sep="\n",
)

In [None]:
# Testing split summary: table shape, age_at_scan mean +- std, and
# sex_at_birth counts.
print(
    f"Testing size: {testing_data.shape}",
    f"Testing age: {testing_data['age_at_scan'].mean():.2f} +- {testing_data['age_at_scan'].std():.2f}",
    f"Testing sex: {testing_data['sex_at_birth'].value_counts()}",
    sep="\n",
)

In [None]:
# Validation split summary: table shape, age_at_scan mean +- std, and
# sex_at_birth counts.
print(f"Validation size: {validation_data.shape}")
# Both the mean and the standard deviation are computed on validation_data so
# the reported spread describes this split rather than the training split.
print(f"Validation age: {validation_data['age_at_scan'].mean():.2f} +- {validation_data['age_at_scan'].std():.2f}")
print(f"Validation sex: {validation_data['sex_at_birth'].value_counts()}")

In [None]:
# Imputed, z-scored numeric phenotype files: one per split, under path_data.
training_file = path_data.joinpath("phenotype_measures_imputation-imputed_numeric_training_zscored.txt")
testing_file = path_data.joinpath("phenotype_measures_imputation-imputed_numeric_testing_zscored.txt")
validation_file = path_data.joinpath("phenotype_measures_imputation-imputed_numeric_validation_zscored.txt")

# Read the three files (training, testing, validation order) into DataFrames.
training_data, testing_data, validation_data = (
    pd.read_csv(split_file, sep=',')
    for split_file in (training_file, testing_file, validation_file)
)

# Stack the splits row-wise (training, then testing, then validation) and
# renumber the index from 0.
combined_data = pd.concat(
    (training_data, testing_data, validation_data),
    ignore_index=True,
)


In [None]:
# Report the DataFrame shape of each split, in training, testing, validation order.
print(f"Data shape: {training_data.shape}, {testing_data.shape}, {validation_data.shape}")

In [None]:
# Pairwise correlation between the columns of the combined table.
pairwise_correlation = combined_data.corr()

# Explicit figure/axes so the heatmap and its title are tied to this figure.
fig, ax = plt.subplots(figsize=(12, 8))

# Correlations lie in [-1, 1]. Pin the colour scale to that range and centre
# the diverging RdBu_r map on 0 so that white means "no correlation"; if the
# limits are inferred from the data instead, the midpoint drifts away from
# zero and heatmaps of different datasets are not comparable.
sns.heatmap(
    pairwise_correlation,
    cmap="RdBu_r",
    vmin=-1,
    vmax=1,
    center=0,
    annot=False,
    linewidths=.5,
    ax=ax,
)

ax.set_title("Pairwise Correlation Heatmap")
plt.show()
