In [None]:
# !pip install tensorflow
import numpy as np
import pandas as pd
import random
import shap
from math import sqrt
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.linear_model import LinearRegression
import warnings
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from random import randrange
from sklearn import linear_model
import pickle
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


In [None]:
# Simulate a standardized genotype matrix and a phenotype driven by one randomly chosen locus; return the genotype matrix with the phenotype appended as its last column.

In [None]:
def trait_simulation_oneloci(samples_n, loci_m,var_g,var_e):
    """Simulate standardized genotypes and a phenotype driven by a single locus.

    Allele frequencies are drawn uniformly on [0, 1] for every locus, genotypes
    are drawn as Binomial(2, frequency) allele counts, and each genotype column
    is standardized with ``preprocessing.scale``. One locus is then chosen at
    random as the causal locus and the phenotype of every sample is
    ``b * genotype_at_causal_locus + noise`` where ``b ~ N(0, var_g)`` is drawn
    once per call and ``noise ~ N(0, var_e)`` is drawn once per sample.

    Parameters
    ----------
    samples_n : int
        Number of individuals (rows of the genotype matrix).
    loci_m : int
        Number of loci (genotype columns).
    var_g : float
        Variance of the causal effect size; must be non-negative.
    var_e : float
        Variance of the per-sample noise; must be non-negative.

    Returns
    -------
    tuple
        ``(G, samples_n, loci_m)``. ``G`` has shape ``(samples_n, loci_m + 1)``:
        the standardized genotypes followed by the phenotype as the last column.

    Raises
    ------
    ValueError
        If ``var_g`` or ``var_e`` is negative (raised by ``math.sqrt``).
    """
    sigma_b = sqrt(var_g)
    sigma_e = sqrt(var_e)

    allele_freqs = np.random.uniform(0, 1, loci_m)
    genotypes = np.random.binomial(n=2, p=allele_freqs, size=(samples_n, loci_m))
    genotypes = preprocessing.scale(genotypes, axis=0)

    # The causal locus comes from the stdlib RNG while everything else uses
    # numpy's, so a reproducible run has to seed both generators.
    causal_locus = random.randint(0, loci_m - 1)
    causal_snp = genotypes[:, causal_locus]

    # A single effect size for the causal locus, shared by all individuals.
    effect = np.random.normal(0, sigma_b)
    # One noise value per individual, drawn in one vectorized call instead of
    # a Python loop.
    noise = np.random.normal(0, sigma_e, size=causal_snp.shape[0])
    phenotype = (effect * causal_snp + noise).reshape(-1, 1)

    # The implied heritability var_g / (var_g + var_e) is deliberately not
    # computed here: it was never used, and evaluating it raised
    # ZeroDivisionError whenever both variances were 0.
    return np.append(genotypes, phenotype, axis=1), samples_n, loci_m

In [None]:
# Run a randomized hyperparameter search for the random forest regressor on datasets simulated with randomly chosen var_g and var_e values.

In [None]:
def training_RFR_simulation(samples,loci,iterations):
    """Tune a random forest regressor on repeatedly simulated single-locus traits.

    Each iteration picks ``var_g`` and ``var_e`` at random from
    ``[0, 0.2, 0.4, 0.6, 0.8, 1]``, simulates a dataset with
    ``trait_simulation_oneloci``, keeps 70% of the samples for training and
    runs a ``RandomizedSearchCV`` (10 sampled candidates, 10-fold CV) over the
    random forest hyperparameters.

    Note: both variance lists contain 0, so the degenerate pair
    ``var_g == var_e == 0`` can be drawn.

    Parameters
    ----------
    samples : int
        Number of individuals simulated per iteration.
    loci : int
        Number of loci simulated per iteration.
    iterations : int
        Number of simulate-and-search rounds.

    Returns
    -------
    list of dict
        ``best_params_`` of the randomized search, one entry per iteration.
    """
    var_g_list = [0,0.2,0.4,0.6,0.8,1]
    var_e_list = [0,0.2,0.4,0.6,0.8,1]

    # The search space does not depend on the simulated data, so build it once.
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    random_grid = {
        # Number of trees in the forest.
        'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 2000, num = 40)],
        # Features considered at each split. 1.0 (all features) is what 'auto'
        # meant for regressors; the string 'auto' was removed in scikit-learn 1.3.
        'max_features': [1.0, 'sqrt'],
        # Maximum tree depth (None = grow until leaves are pure).
        'max_depth': max_depth,
        # Minimum samples required to split a node. Integer values must be
        # >= 2, so the previous entry 1 could never be fitted.
        'min_samples_split': [2, 5, 10],
        # Minimum samples required at a leaf.
        'min_samples_leaf': [1, 2, 4],
        # Whether each tree is trained on a bootstrap sample.
        'bootstrap': [True, False],
    }

    best_params_list = []
    for _ in range(iterations):
        var_g = random.choice(var_g_list)
        var_e = random.choice(var_e_list)
        simulated = trait_simulation_oneloci(samples, loci, var_g, var_e)[0]

        # Every column but the last is a genotype feature; the last column is
        # the simulated phenotype.
        X = simulated[:, :-1]
        y = simulated[:, -1]

        # Only the 70% training split is used for the search.
        x_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=7)

        # Randomized search: 10 sampled parameter combinations, 10-fold
        # cross-validation, all available cores.
        rf_random = RandomizedSearchCV(
            estimator=RandomForestRegressor(),
            param_distributions=random_grid,
            n_iter=10,
            cv=10,
            verbose=0,
            random_state=42,
            n_jobs=-1,
        )
        rf_random.fit(x_train, y_train)
        best_params_list.append(rf_random.best_params_)
    return best_params_list

In [None]:
# Run 10 simulate-and-search rounds on datasets of 1000 samples x 20 loci and
# keep the best random forest hyperparameters found in each round.
best_list = training_RFR_simulation(1000,20,10)

In [None]:
# Show the best hyperparameter set from every round.
best_list

In [None]:
# Simulate the dataset used by the cells below: 1000 samples, 20 loci,
# var_g = var_e = 0.5.
# NOTE(review): no RNG seed is set in the visible cells, so this dataset
# presumably differs on every run — confirm whether that is intended.
G_oneloci, samples_n,loci_m = trait_simulation_oneloci(1000,20,.5,.5)

In [None]:
# Features: every column except the last one (the standardized genotypes).
X = G_oneloci[:, :-1]

# Target: the final column, i.e. the simulated phenotype appended to the matrix.
y = G_oneloci[:, -1]

# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=7
)

In [None]:
# Baseline random forest with fixed hyperparameters (50 trees, depth capped at
# 10, random_state=0), fitted on the training split.
forReg = RandomForestRegressor(max_depth=10, random_state=0, n_estimators=50)
forReg.fit(x_train, y_train)

In [None]:
# `K` was referenced below without ever being imported, which made the loss
# raise NameError on first use.
from keras import backend as K
from keras.losses import mean_squared_error

def root_mean_squared_error(y_true, y_pred):
    """Root-mean-squared-error loss built from Keras primitives.

    Parameters
    ----------
    y_true, y_pred
        Target and predicted values, passed straight to
        ``keras.losses.mean_squared_error``.

    Returns
    -------
    The element-wise square root (``K.sqrt``) of the Keras mean squared error.
    """
    return K.sqrt(mean_squared_error(y_true, y_pred))

In [None]:
# Model factory for the neural-network hyperparameter search.
def create_model(optimizer='adam',activation='relu', input_dim=20):
    """Build and compile a one-hidden-layer regression network.

    Architecture: Dense(256, ``activation``) -> Dropout(0.25) -> Dense(1, linear),
    compiled with mean absolute error as the loss.

    Parameters
    ----------
    optimizer : str
        Optimizer passed to ``model.compile``.
    activation : str
        Activation of the hidden layer.
    input_dim : int, default 20
        Number of input features. The default keeps the previously hard-coded
        value.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(256, activation=activation, input_dim=input_dim))
    model.add(Dropout(0.25))
    # The output unit stays linear and is not followed by dropout: a bounded or
    # non-negative activation cannot represent a signed continuous target, and
    # dropout on the single output would randomly zero the prediction during
    # training.
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_absolute_error', optimizer=optimizer)
    return model

In [None]:
# The target is a continuous phenotype, so the network is wrapped as a
# scikit-learn regressor. KerasClassifier would re-encode every distinct target
# value as a class label before fitting.
from keras.wrappers.scikit_learn import KerasRegressor

model = KerasRegressor(build_fn=create_model, verbose=0)


In [None]:
# Candidate values for each searched hyperparameter. Hidden-layer width is not
# searched (a `neurons` axis was left disabled).
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 25, 50]
optimizer = [
    'SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'adam', 'Adamax', 'Nadam',
]
activation = [
    'softmax', 'softplus', 'softsign', 'relu',
    'tanh', 'sigmoid', 'hard_sigmoid', 'linear',
]
param_grid = {
    'epochs': epochs,
    'batch_size': batch_size,
    'optimizer': optimizer,
    'activation': activation,
}

# Exhaustive search over the full grid with 10-fold cross-validation, scored by
# negated mean squared error and run on all available cores.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_result = grid.fit(x_train, y_train)

In [None]:
# Report the best cross-validated score and the hyperparameters that produced
# it. The search is scored with 'neg_mean_squared_error', so the score is a
# negated MSE and values closer to 0 are better.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))


In [None]:
from pathlib import Path

# Save only the winning hyperparameters, not the fitted search object.
# The location is resolved from the current user's home directory instead of a
# hard-coded /Users/kevin/... path, so the cell also runs on other machines;
# for the original author it resolves to the same file as before.
best_params_path = Path.home() / "Downloads" / "GridSearchCv_Initial"
best_params_path.parent.mkdir(parents=True, exist_ok=True)
with open(best_params_path, 'wb') as fp:
    pickle.dump(grid_result.best_params_, fp)