In [99]:
import numpy as np
import pandas as pd
import random
import shap
from math import sqrt
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from scipy.stats import sem, t
from scipy import mean

import warnings
warnings.filterwarnings('ignore')

import pickle
from random import randrange


In [100]:
#Generate base phenotype values
def trait_simulation_twoloci_inter(samples_n,loci_m,var_g,var_e, n_causal_SNPs,b12_event):
    """Simulate standardized genotypes and a phenotype driven by two interacting loci.

    An allele frequency is drawn per locus, 0/1/2 genotype counts are sampled
    for every individual, each locus column is standardized, and the phenotype
    is built as

        Y = b_1 * SNP1 + b_2 * SNP2 + b_12 * (SNP1 * SNP2) + e

    where SNP1/SNP2 are two distinct, randomly chosen columns of G and e is
    per-individual Gaussian noise with standard deviation sqrt(var_e).

    Parameters
    ----------
    samples_n : int
        Number of individuals (rows of G).
    loci_m : int
        Number of loci (columns of G). Two distinct causal loci are sampled
        from it, so it must be at least 2.
    var_g : float
        Genetic variance; main effects are drawn with standard deviation
        sqrt(var_g / n_causal_SNPs).
    var_e : float
        Noise variance.
    n_causal_SNPs : int
        Divisor applied to var_g when deriving the effect-size scale.
    b12_event : int
        How the interaction effect b_12 is drawn:
        0 -> no interaction (b_12 = 0);
        1 -> normal with the same scale as the main effects;
        2 -> uniform in [0, |b_1|];
        3 -> uniform in [0, |b_2|];
        4 -> uniform, always at least |b_1| + |b_2|;
        5 -> uniform in [-sigma_b, 0].

    Returns
    -------
    (G, loci) : (numpy.ndarray, list of int)
        G has shape (samples_n, loci_m + 1): the standardized genotype columns
        followed by the phenotype as the last column. loci holds the two
        column indices of the causal loci.

    Raises
    ------
    ValueError
        If b12_event is not one of 0-5.
    """
    # Per-locus allele frequencies, then genotype counts (0, 1 or 2) for every
    # individual (rows) at every locus (columns).
    f_M = np.random.uniform(0, 1, loci_m)
    G = np.random.binomial(n=2, p=f_M, size=(samples_n, loci_m))
    # Standardize column-wise so each locus is on a comparable scale.
    G = preprocessing.scale(G, axis=0)

    sigma_e = sqrt(var_e)
    sigma_b = sqrt(var_g / n_causal_SNPs)
    # Main effects of the two causal loci on the phenotype.
    b_1 = np.random.normal(0, sigma_b)
    b_2 = np.random.normal(0, sigma_b)
    loci = random.sample(range(0, loci_m), 2)
    SNP1 = G[:, loci[0]]
    SNP2 = G[:, loci[1]]

    # Interaction effect, selected by b12_event.
    if b12_event == 0:
        b_12 = 0
    elif b12_event == 1:
        b_12 = np.random.normal(0, sigma_b)
    elif b12_event == 2:
        b_12 = random.uniform(0, abs(b_1))
    elif b12_event == 3:
        b_12 = random.uniform(0, abs(b_2))
    elif b12_event == 4:
        lower = abs(b_1) + abs(b_2)
        # The upper bound used to be the constant 1. random.uniform accepts
        # inverted bounds without complaint, so whenever lower exceeded 1 the
        # draw fell *below* |b_1| + |b_2| and violated this event's contract.
        # Keep 1 as the cap when it is valid, otherwise extend by one sigma_b.
        upper = 1 if lower < 1 else lower + sigma_b
        b_12 = random.uniform(lower, upper)
    elif b12_event == 5:
        b_12 = random.uniform(-1 * sigma_b, 0)
    else:
        raise ValueError(
            "b12_event must be an integer from 0 to 5, got {!r}".format(b12_event)
        )

    # One independent noise term per individual, then the phenotype vector.
    noise = np.random.normal(0, sigma_e, size=len(SNP1))
    Y_n = (b_1 * SNP1) + (b_2 * SNP2) + (b_12 * (SNP1 * SNP2)) + noise

    # Append the phenotype as the last column of G.
    G = np.append(G, Y_n.reshape(-1, 1), axis=1)
    return G, loci

In [101]:
def shap_NN_train(G):
    """Fit a small dense network on G and return SHAP values for the held-out rows.

    Parameters
    ----------
    G : numpy.ndarray
        2-D matrix whose last column is the phenotype (regression target) and
        whose remaining columns are the features, i.e. the layout returned by
        trait_simulation_twoloci_inter.

    Returns
    -------
    The value returned by shap.DeepExplainer(...).shap_values(x_test), computed
    on a random 30% test split with the 70% training split as background.
    NOTE(review): downstream code indexes this as result[0][sample, feature],
    i.e. it presumes a list with one 2-D array per model output - confirm this
    for the installed shap version.
    """
    # Every column except the last is a feature. The previous slice stopped at
    # len(G[0]) - 2 and silently dropped the final locus, so a causal locus in
    # that column could never be ranked.
    X = G[:, :-1]
    y = G[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=X.shape[1]))
    model.add(Dropout(0.25))
    # Linear output: the phenotype is real-valued and can be negative, which a
    # ReLU output unit cannot represent. Dropout is not applied after the
    # output unit, since that would zero out predictions during training.
    model.add(Dense(1))
    model.compile(loss='mean_absolute_error', optimizer='rmsprop')
    model.fit(x_train, y_train, epochs=50, batch_size=16, verbose=0)

    explainer = shap.DeepExplainer(model, x_train)
    shap_values = explainer.shap_values(x_test)
    return shap_values

In [102]:
def mean_shap_values(shap_values1):
    """Return the indices of the two features with the largest mean |SHAP| value.

    Parameters
    ----------
    shap_values1 : array-like
        Indexable as shap_values1[0][sample][feature]; only the first entry
        along the leading axis is used.
        NOTE(review): this presumes the explainer returns one 2-D array per
        model output - confirm for the installed shap version.

    Returns
    -------
    numpy.ndarray
        Two feature indices, most important first.

    Raises
    ------
    ValueError
        If fewer than two features are present.
    """
    per_sample = np.array(np.asarray(shap_values1)[0, :, :], dtype=float)
    if per_sample.shape[1] < 2:
        raise ValueError(
            "need at least two features to rank, got {}".format(per_sample.shape[1])
        )
    # Mean absolute attribution per feature (column) across all samples.
    avg_shap = np.abs(per_sample).mean(axis=0)
    # argsort is ascending: keep the last two and flip so the largest is first.
    indices = avg_shap.argsort()[-2:][::-1]
    return indices

In [103]:
def accuracy_count(actual_loci,predicted_loci):
    """Score how many of the two causal loci were recovered, ignoring order.

    The predicted loci arrive ordered by importance while the actual loci are
    in the order they were sampled, so the two pairs are compared as sets
    rather than position by position.

    Parameters
    ----------
    actual_loci : sequence of int
        The true causal loci; only the first two entries are considered.
    predicted_loci : sequence of int
        The predicted loci; only the first two entries are considered.

    Returns
    -------
    1 if both loci match, .5 if exactly one matches, 0 otherwise.
    """
    # int() normalizes numpy integer scalars so set membership is reliable.
    actual = {int(locus) for locus in actual_loci[:2]}
    predicted = {int(locus) for locus in predicted_loci[:2]}
    if actual == predicted:
        return 1
    if actual & predicted:
        return .5
    return 0
        

In [104]:
def percents_varg(zeros,halfs,ones):
    """Convert three outcome buckets into percentages of the combined total.

    Parameters
    ----------
    zeros, halfs, ones : sized collections
        The runs that scored 0, .5 and 1 respectively; only their lengths
        are used.

    Returns
    -------
    list of float
        [percent scoring 0, percent scoring .5, percent scoring 1].
    """
    bucket_sizes = (len(zeros), len(halfs), len(ones))
    total = sum(bucket_sizes)
    # Divide first, then scale to percent.
    return [(size / total) * 100 for size in bucket_sizes]

In [105]:
# Inspect the [perc_0, perc_half, perc_1] entry for the sixth var_g value.
# NOTE(review): var_g_percents_lists is only assigned in the simulation cell
# further down (In [109]), so this raises NameError on a fresh
# Restart & Run All; it only works with leftover kernel state.
var_g_percents_lists[5]

In [109]:
# --- Experiment configuration -------------------------------------------------
SEED = 0              # seeds the `random` and NumPy generators used below
N_REPLICATES = 70     # simulations per var_g value
N_SAMPLES = 1000      # individuals per simulated data set
N_LOCI = 20           # loci per simulated data set

# Seed the Python and NumPy RNGs that drive the simulation and the train/test
# split. NOTE(review): the Keras weight initialisation is not seeded here, so
# results are not guaranteed to be bit-for-bit reproducible.
random.seed(SEED)
np.random.seed(SEED)

var_g_list =[0,.1,.2,.3,.4,.5,.6,.7,.8,.9]
e_list = [0,.1,.2,.3,.4,.5,.6,.7,.8,.9]
# The float conversion used to be evaluated and discarded; keep its result.
e_list = [float(m) for m in e_list]
# List of lists: one [perc_0, perc_half, perc_1] entry per var_g value.
var_g_percents_lists = []
acc_list_0 = []
b12_event =2
n_causal_SNPs =2

# i is the genetic variance passed to the simulation for this batch of runs.
for i in var_g_list:
    num_one = []
    num_zero = []
    num_half = []
    for j in range(0, N_REPLICATES):
        # Each replicate uses a noise variance picked at random from e_list.
        var_e = e_list[randrange(len(e_list))]
        G,loci = trait_simulation_twoloci_inter(N_SAMPLES,N_LOCI,i,var_e, n_causal_SNPs,b12_event)
        shap_values2 = shap_NN_train(G)
        predicted_loci = mean_shap_values(shap_values2)
        num_correct = accuracy_count(loci,predicted_loci)
        # Bucket the replicate by its score (1, .5 or 0).
        if num_correct == 1:
            num_one.append(num_correct)
        elif num_correct == .5:
            num_half.append(num_correct)
        elif num_correct == 0:
            num_zero.append(num_correct)
    var_g_percents_lists.append(percents_varg(num_zero,num_half,num_one))
    

In [107]:
# Inspect the [perc_0, perc_half, perc_1] entry for the last var_g value
# (index 9 of var_g_list, i.e. var_g = .9).
var_g_percents_lists[9]

In [110]:
# Persist the per-var_g percentage lists with pickle.
# NOTE(review): the destination is a hard-coded absolute path inside one
# user's home directory, so this cell fails on any other machine; consider a
# configurable, project-relative output directory.
# NOTE(review): the "_2" suffix presumably mirrors b12_event = 2 - confirm, and
# keep the two in sync when the event is changed.
with open('/Users/kevin/Downloads/NN_two_loci_bar_b12_2', 'wb') as fp:
    pickle.dump(var_g_percents_lists, fp)