# Simulation Playground

This is an interactive environment to see how the data can be formualted and Phenotype Simulator Tested

$$ \textrm{Genetic Risk Score}_{p} = \Sigma^{N}_{i=1} \gamma_{ij}\beta_{i} +\epsilon_{p} \\
\textrm{p = patient index} \ \  \textrm{N = number of Causal Snps} \ \ \textrm{j = index of interacting snp partner} \\
\gamma=\textrm{Interaction Coefficient} \ \ \beta=\textrm{Effect Size} \ \ \epsilon=\textrm{Patient Specfic Bias}
$$


$$  \textrm{G} = \frac{\textrm{G} - \overline{G}}{\sigma_{G}} $$

$$ \textrm{G'}_{p} = h\textrm{G}_{k} + \kappa_{p}\sqrt{1-h^{2}}\mathcal{N}(0,1) \\ 
\textrm{h = Heritability Constnat} \ \ \kappa=\textrm{Noise Scaling Coefficient}
$$

In [1]:
import os, sys
from scripts.phenotype_simulator import PhenotypeSimulator
from scripts.genotype_simulator import GenotypeSimulator
from scripts.cmd_args import parse_args

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Define you size parameters
num_snps = 1000
num_genes = 100
people = 100

In [6]:
feature_ids = np.random.randint(num_genes, size=num_snps)
positions = np.arange(10000,10000+num_snps)
allele_1 = [np.random.choice(['A', 'C']) for _ in range (num_snps)]
allele_2 = [np.random.choice(['T', 'G']) for _ in range (num_snps)]
genotype=[]
for i in range(people):
    person = [1 if x > 0.5 else 0 for x in np.random.random(num_snps)]
    genotype.append(person)
df = pd.DataFrame(genotype)
risk_allele = [1 if df.iloc[:,idx].value_counts().to_list()[0] > df.shape[0]//2 else 0 for idx in range(0, df.shape[1])]

In [7]:
snplist = pd.DataFrame(list(zip(feature_ids, positions,allele_1,allele_2,risk_allele)), columns = ['Feature ID', 'Position', 'Allele 1', 'Allele 2', 'Risk Allele'])
snplist

Unnamed: 0,Feature ID,Position,Allele 1,Allele 2,Risk Allele
0,17,10000,C,G,1
1,77,10001,C,G,1
2,41,10002,C,T,1
3,17,10003,A,G,1
4,78,10004,A,G,1
...,...,...,...,...,...
995,67,10995,C,T,0
996,84,10996,A,T,1
997,60,10997,A,G,1
998,32,10998,C,T,1


In [8]:
matrix = pd.DataFrame(np.array(genotype).T)
matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1,1,1,1,1,1,0,0,1,0,...,0,0,1,0,1,1,0,1,1,1
1,0,1,1,0,1,1,1,0,1,0,...,0,0,1,0,1,0,0,0,1,0
2,1,1,0,0,0,1,0,1,0,1,...,0,1,1,0,0,0,1,0,0,0
3,1,1,1,0,0,0,0,1,1,0,...,0,0,1,1,1,0,0,1,1,1
4,0,1,0,1,0,1,0,1,1,0,...,0,0,1,1,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,1,0,1,0,0,...,0,1,1,1,0,0,1,1,0,1
996,1,0,0,0,0,0,1,0,1,0,...,1,0,0,1,0,1,1,0,0,0
997,0,0,1,0,0,1,0,1,1,0,...,1,0,1,1,1,0,1,1,0,0
998,1,0,0,0,1,1,1,0,0,1,...,0,0,1,0,0,0,1,0,0,1


In [9]:
df = pd.concat([snplist, matrix], axis = 1)
df

Unnamed: 0,Feature ID,Position,Allele 1,Allele 2,Risk Allele,0,1,2,3,4,...,90,91,92,93,94,95,96,97,98,99
0,17,10000,C,G,1,1,1,1,1,1,...,0,0,1,0,1,1,0,1,1,1
1,77,10001,C,G,1,0,1,1,0,1,...,0,0,1,0,1,0,0,0,1,0
2,41,10002,C,T,1,1,1,0,0,0,...,0,1,1,0,0,0,1,0,0,0
3,17,10003,A,G,1,1,1,1,0,0,...,0,0,1,1,1,0,0,1,1,1
4,78,10004,A,G,1,0,1,0,1,0,...,0,0,1,1,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,67,10995,C,T,0,0,0,0,0,0,...,0,1,1,1,0,0,1,1,0,1
996,84,10996,A,T,1,1,0,0,0,0,...,1,0,0,1,0,1,1,0,0,0
997,60,10997,A,G,1,0,0,1,0,0,...,1,0,1,1,1,0,1,1,0,0
998,32,10998,C,T,1,1,0,0,0,1,...,0,0,1,0,0,0,1,0,0,1


In [52]:
# save test data 
df.to_csv("chr{}/chr{}_genotype_{}_{}.csv".format(0, 0, "test", "exon"), sep=" ", index_label=False)   

In [None]:
!gwas-sim-public phenotype -dp ./ -chr 0 --data_identifier test --prefilter exon --phenotype_experiement_name playground_example

In [15]:
import pickle
causal_gene_name = "chr0_causal_genes_test_exon_playground_example.pkl" 
causal_snp_name = "chr0_causal_snp_idx_test_exon_playground_example.pkl" 
interactive_snp_name = "chr0_interactive_snps_test_exon_playground_example.pkl"
effect_size_name = "chr0_effect_size_test_exon_playground_example.pkl"
phenotype_name = "chr0_phenotype_test_exon_playground_example.pkl"
info_path = "./chr0/"
with open((info_path + causal_snp_name), 'rb') as f:
            causal_snps = pickle.load(f)
with open((info_path + interactive_snp_name), 'rb') as f:
            interactive_snps = pickle.load(f)
with open((info_path + causal_gene_name), 'rb') as f:
            causal_genes = pickle.load(f)
with open((info_path + effect_size_name), 'rb') as f:
            effect_sizes = pickle.load(f)
with open((info_path + phenotype_name), 'rb') as f:
            phenotype = pickle.load(f)

<img src="chr0/Chr 0 Interactive Coefficients.png">

<img src="chr0/Chr 0 Phenotype Scores with Heredity 1.0.png">

<img src="chr0/Chr 0 Phenotype Scores.png">

In [10]:
causal_snps

{599: 0.48049203946981534,
 742: 0.48049203946981534,
 473: 0.48049203946981534,
 423: 0.48049203946981534,
 660: 0.48049203946981534,
 992: 0.48049203946981534,
 828: 0.48049203946981534,
 185: 0.48049203946981534,
 498: 0.48049203946981534,
 640: 0.48049203946981534,
 64: 0.48049203946981534,
 772: 0.48049203946981534,
 0: 1.4807009876107247,
 716: 4.325512806526925,
 949: 4.325512806526925,
 669: 4.325512806526925,
 584: 4.325512806526925,
 129: 4.325512806526925,
 59: 3.2358707009377774,
 306: 3.2358707009377774,
 248: 3.2358707009377774,
 1: 3.2358707009377774,
 851: 3.2358707009377774,
 96: 3.2358707009377774,
 771: 1.8490404637417002}

In [11]:
interactive_snps

{771: [900, 1.471787793615466, 0],
 1: [971, 1.1818534510671383, 0],
 851: [87, 0.3525632802800296, 1],
 772: [832, 0.05147555116623059, 0],
 96: [688, 0.7769801927373692, 1]}

In [12]:
causal_genes

{26: 0.48049203946981534,
 86: 1.4807009876107247,
 2: 4.325512806526925,
 55: 3.2358707009377774,
 75: 1.8490404637417002}

In [13]:
effect_sizes

{96: [0, 0.7459750563627328, 1.725386494605671],
 584: [0, 1.7080121857232047, 2.676664370928459],
 669: [0, 2.033562805386352, 2.0540455193050007],
 129: [0, 1.2457087775379214, 3.097389555257261],
 306: [2.4242167260688277, 1.2408341789069965, 0],
 742: [0, 0.049425025005325286, 0.42208928448246175],
 185: [0, 0.044384918540826254, 0.1701166027321305],
 992: [0, 0.01615658975261703, 0.26514331976354366],
 64: [0, 0.1542366191717364, 0.46557841475818135],
 59: [0, 0.4571117310497689, 0.7159774401970219],
 771: [1.8195339223133833, 0.17983754064037497, 0],
 640: [0, 0.12509174325393557, 0.2580350465370709],
 0: [0, -1.3574989735412462, -1.3574989735412462],
 1: [0, -0.3857937732836941, -0.3857937732836941],
 772: [-0.21443268043701547, -0.21443268043701547, 0],
 423: [0, -1.0298652253442064, -1.0298652253442064],
 716: [0.043418490270838175, 0.043418490270838175, 0],
 498: [-0.2682215092438742, -0.2682215092438742, 0],
 851: [0, 1.5561485069260097, 1.5561485069260097],
 660: [0, -0.275

In [14]:
len(phenotype)

100

In [35]:
df.head()

Unnamed: 0,Feature ID,Position,Allele 1,Allele 2,Risk Allele,0,1,2,3,4,...,90,91,92,93,94,95,96,97,98,99
0,"[1, 17]",10000,C,G,1,1,1,1,1,1,...,0,0,1,0,1,1,0,1,1,1
1,77,10001,C,G,1,0,1,1,0,1,...,0,0,1,0,1,0,0,0,1,0
2,41,10002,C,T,1,1,1,0,0,0,...,0,1,1,0,0,0,1,0,0,0
3,17,10003,A,G,1,1,1,1,0,0,...,0,0,1,1,1,0,0,1,1,1
4,78,10004,A,G,1,0,1,0,1,0,...,0,0,1,1,0,1,1,1,1,0
