## Simulation

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow.feather as feather

In [None]:
data_path = "C:/Users/gard_/Documents/ProjectThesis/MyPipeline/Data/Phenotypes/180k/"
SNP_matrix = pd.read_feather("C:/Users/gard_/Documents/ProjectThesis/MyPipeline/Data/SNP_180k.feather")
SNP_matrix = SNP_matrix.rename(columns={'IID': 'ringnr'})

# Remove unnecessary columns
columns_to_remove = ['FID', 'PAT', 'MAT', 'SEX', 'PHENOTYPE', 'ringnr']
M_tilde = SNP_matrix.drop(columns=columns_to_remove)

# Replace NA values with 0
M_tilde.fillna(0, inplace=True)

#Mean-center M
M_tilde = M_tilde - M_tilde.mean(axis=0)

In [None]:
SNP_matrix.shape

(3032, 182854)

: 

## Architecture Aspheim

In [5]:
def get_u_aspheim(p):
    """
    Generate effect sizes u. 'p' is number of columns (SNPs).
    Distributions given in handwritten notes.
    Returns (p x 1) vector of effect sizes u.
    """
    pi = np.random.rand(p)
    u = np.zeros(p)
    
    for i in range(p):
        if pi[i] < 0.002:
            u[i] = np.random.normal(0, np.sqrt(0.33 * 10**(-2)))
        elif pi[i] < 0.008:
            u[i] = np.random.normal(0, np.sqrt(0.33 * 10**(-3)))
        elif pi[i] < 0.04:
            u[i] = np.random.normal(0, np.sqrt(0.33 * 10**(-4)))
    
    return u

## Architecture Chiara

In [12]:
def get_u_chiara(p, arch):
    """
    Generate effect sizes u. 'p' is number of columns (SNPs).
    Distributions given in handwritten notes.
    Note that pi_1 and pi_2 are always 0 (see handwritten notes, architectures).
    Returns (p x 1) vector of effect sizes u.
    """
    pi = np.random.rand(p)
    u = np.zeros(p)

    # vector of architectures for pi_0 and pi_3
    arch_vec = np.array([0.99, 0.95, 0.9, 0.8, 0.6, 0.4, 0.2, 0.1, 0.05, 0.01])
    
    for i in range(p):
        if pi[i] < arch_vec[arch]:
            # pi_3:
            u[i] = np.random.normal(0, np.sqrt(0.33 * 10**(-2)))
    
    return u

Generate phenotypes and effect sizes for all $10$ architecture types in a for-loop.

In [13]:
# Get the number of columns (p)
p = M_tilde.shape[1]

for i in range(10):
    arch = i + 1
    # Generate u values
    u_sim = get_u_chiara(p = p, arch = arch-1)

    # Compute a as matrix multiplication
    a = np.dot(M_tilde.values, u_sim)

    # Scale a to achieve heritability of 0.33
    a = a * np.sqrt(0.33 / np.var(a))

    # Generate epsilon (error term) with variance 0.67
    epsilon = np.random.normal(0, np.sqrt(0.67), size=a.shape)

    # Simulated phenotype y
    y = a + epsilon

    # Heritability check (approximately 0.33)
    heritability = np.var(a) / np.var(y)
    print(f"Heritability: {heritability:.2f}")

    data_path = "C:/Users/gard_/Documents/MasterThesis/Code/Data/Phenotypes/180k/"
    # Create a DataFrame for the phenotype and save it
    y_df = pd.DataFrame({'Value': y})
    y_df['ringnr'] = SNP_matrix['ringnr']
    y_df.rename(columns={'Value': 'pheno'}, inplace=True)

    # Save SNP effects
    u_df = pd.DataFrame({'Value': u_sim})
    u_df['SNP'] = M_tilde.columns
    u_df.rename(columns={'Value': 'effect'}, inplace=True)


    # Save to a CSV file
    y_df.to_csv(data_path + "Sim_pheno_180k_arch_" + str(arch) + ".csv", index=False)

    u_df.to_csv(data_path + "Sim_effect_180k_arch_" + str(arch) + ".csv", index=False)


    # Display first few rows
    print(y_df.head())
    print(u_df.head())

Heritability: 0.33
      pheno   ringnr
0  0.285488  8118424
1 -1.040927  8118425
2  0.799771  8118426
3  1.306471  8118429
4  0.131240  8118430
     effect                SNP
0 -0.163251        SNPa29779_A
1  0.046591       SNPa117082_T
2 -0.034414       SNPa445460_A
3  0.015360       SNPa236639_T
4 -0.069876  SNP7008a2013145_T
Heritability: 0.34
      pheno   ringnr
0 -0.205576  8118424
1  0.056869  8118425
2 -0.235232  8118426
3 -0.032739  8118429
4 -1.118019  8118430
     effect                SNP
0 -0.034867        SNPa29779_A
1 -0.025444       SNPa117082_T
2 -0.028851       SNPa445460_A
3 -0.021038       SNPa236639_T
4  0.037395  SNP7008a2013145_T
Heritability: 0.32
      pheno   ringnr
0  2.039708  8118424
1  0.548068  8118425
2  1.918994  8118426
3 -0.349760  8118429
4  0.216051  8118430
     effect                SNP
0  0.008818        SNPa29779_A
1  0.088811       SNPa117082_T
2  0.004754       SNPa445460_A
3 -0.051152       SNPa236639_T
4  0.002789  SNP7008a2013145_T
Heritab