# dRFEtools simulations with phenotype ~ genotypes

In [None]:
import os,errno
import functools
import dRFEtools
import numpy as np
import pandas as pd
from time import time
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import explained_variance_score as evar

## Functions

In [None]:
@functools.lru_cache()
def get_y_var():
    """
    Build the simulated phenotype table by summing its component tables.

    Five CSV files (correlated background, genetic background, genetic
    fixed, noise background, noise fixed) are read with their first column
    as the index and added together element-wise, in that order.

    The result is memoised by ``functools.lru_cache``, so every call returns
    the same DataFrame object; callers should not modify it in place.

    Returns:
        pandas.DataFrame: the sum of the five component tables.
    """
    component_files = (
        # Correlated component
        "../../_m/genotype_simulation/Y_correlatedBg_genotype_simulation.csv",
        # Genetic component
        "../../_m/genotype_simulation/Y_genBg_genotype_simulation.csv",
        "../../_m/genotype_simulation/Y_genFixed_genotype_simulation.csv",
        # Noise component
        "../../_m/genotype_simulation/Y_noiseBg_genotype_simulation.csv",
        "../../_m/genotype_simulation/Y_noiseFixed_genotype_simulation.csv",
    )
    total = None
    for csv_path in component_files:
        part = pd.read_csv(csv_path, index_col=0)
        # Accumulate left to right so the summation order is unchanged.
        total = part if total is None else total + part
    return total


@functools.lru_cache()
def get_X_var():
    """
    Load the simulated genotypes and one-hot encode them.

    The genotype CSV is read with its first column as the index and then
    transposed (presumably to samples x SNPs -- confirm against the
    simulation output). Every column is expanded into indicator columns
    with ``pd.get_dummies``, including a NaN indicator per column
    (``dummy_na=True``). A ``.<digits>`` suffix is then stripped from the
    generated column names, so e.g. ``<snp>_0.0`` becomes ``<snp>_0``.

    The result is memoised by ``functools.lru_cache``, so every call returns
    the same DataFrame object; callers should not modify it in place.

    Returns:
        pandas.DataFrame: the indicator-encoded genotype table.
    """
    snp_df = pd.read_csv("../../_m/genotype_simulation/Genotypes_genotype_simulation.csv",
                         index_col=0).T
    r = pd.get_dummies(snp_df, columns=snp_df.columns, dummy_na=True)
    # Raw string: '\.' and '\d' are not valid escapes in a normal literal.
    r.columns = r.columns.str.replace(r'\.\d+', '', regex=True)
    return r

In [None]:
def mkdir_p(directory):
    """
    Create ``directory`` and any missing parents, like ``mkdir -p``.

    An already existing directory is not an error.

    Args:
        directory: path of the directory to create.

    Raises:
        OSError: if the directory cannot be created, including when the
            path already exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)

            
def load_data(simu):
    """
    Return the feature table and one simulated phenotype.

    Args:
        simu: integer column position of the phenotype to select from the
            table returned by ``get_y_var``.

    Returns:
        tuple: ``(X, Y)`` where ``X`` is the result of ``get_X_var()`` and
        ``Y`` is the ``simu``-th column of ``get_y_var()``.
    """
    genotypes = get_X_var()
    phenotypes = get_y_var()
    # Positional column selection: `simu` is an offset, not a label.
    phenotype = phenotypes.iloc[:, simu]
    return genotypes, phenotype


def run_regr_oob(estimator, x_train, x_test, y_train, y_test, fold, outdir,
                 frac, step, simu):
    """
    Run dynamic RFE with out-of-bag scoring on one CV fold and evaluate it.

    Feature elimination is done by ``dRFEtools.rf_rfe`` on the training
    fold. The feature-set sizes are then chosen from the lowess-smoothed
    curve; if that raises ``ValueError`` both sizes fall back to the size
    with the highest R2. The estimator is refit on all features, on the
    "redundant" set and on the "max" set, and each model predicts the test
    fold.

    Side effects: appends the test-fold predictions to
    ``<outdir>/test_predictions.txt`` (tab separated, header only when
    ``fold == 0``). The dRFEtools calls also receive ``outdir`` and
    presumably write their own files there.

    Args:
        estimator: regressor passed to dRFEtools; it must support the
            ``dRFEtools.oob_score_*`` helpers (presumably fitted with
            ``oob_score=True`` -- confirm).
        x_train, x_test: pandas DataFrames of features.
        y_train, y_test: pandas Series of targets.
        fold: fold number, used as a label and to decide header writing.
        outdir: directory for outputs.
        frac: lowess smoothing fraction.
        step: step size passed to the lowess "redundant" selection.
        simu: simulation number, used as a label.

    Returns:
        tuple: ``(df_elim, metrics_df)`` -- the per-step elimination scores
        and a one-row DataFrame of train (OOB) and test metrics.
    """
    features = x_train.columns
    d, _ = dRFEtools.rf_rfe(estimator, x_train.values, y_train.values, features,
                            fold, outdir, elimination_rate=0.1, RANK=True)
    # One row per elimination step; d[k][1..3] are the scores labelled below.
    df_elim = pd.DataFrame([{'fold': fold, "simulation": simu,
                             'n features': k, 'R2 Score': d[k][1],
                             'Mean Square Error': d[k][2],
                             'Explained Variance': d[k][3]} for k in d])
    n_features_max = max(d, key=lambda x: d[x][1])
    try:
        ## Max features from lowess curve
        n_features, _ = dRFEtools.extract_max_lowess(d, frac=frac, multi=False)
        n_redundant, _ = dRFEtools.extract_redundant_lowess(d, frac=frac,
                                                            step_size=step,
                                                            multi=False)
        dRFEtools.plot_with_lowess_vline(d, fold, outdir, frac=frac,
                                         step_size=step, classify=False)
    except ValueError:
        ## For errors in lowess estimate
        n_features = n_features_max
        n_redundant = n_features
    ## Fit model
    # d[k][4] is used as the column indexer for the features kept at size k.
    redundant_cols = d[n_redundant][4]
    max_cols = d[n_features][4]
    estimator.fit(x_train, y_train)
    all_fts = estimator.predict(x_test)
    estimator.fit(x_train.values[:, redundant_cols], y_train)
    labels_pred_redundant = estimator.predict(x_test.values[:, redundant_cols])
    # This fit must stay last: the OOB scores below read the fitted estimator.
    estimator.fit(x_train.values[:, max_cols], y_train)
    labels_pred = estimator.predict(x_test.values[:, max_cols])
    ## Output test predictions
    pd.DataFrame({'fold': fold, "simulation": simu, 'real': y_test,
                  'predict_all': all_fts, 'predict_max': labels_pred,
                  'predict_redundant': labels_pred_redundant})\
      .to_csv("%s/test_predictions.txt" % outdir, sep='\t', mode='a', index=True,
              header=(fold == 0))
    output = {
        'simulation': simu,
        # Recorded so the per-fold rows stay identifiable, as in run_regr_dev.
        'fold': fold,
        'n_features': n_features,
        'n_redundant': n_redundant,
        'n_max': n_features_max,
        'train_r2': dRFEtools.oob_score_r2(estimator, y_train),
        'train_mse': dRFEtools.oob_score_mse(estimator, y_train),
        'train_evar': dRFEtools.oob_score_evar(estimator, y_train),
        'test_r2': r2_score(y_test, labels_pred),
        'test_mse': mean_squared_error(y_test, labels_pred),
        'test_evar': evar(y_test, labels_pred, multioutput='uniform_average'),
    }
    metrics_df = pd.DataFrame.from_records(output, index=[simu]).reset_index().drop('index', axis=1)
    return df_elim, metrics_df


def run_regr_dev(estimator, x_train, x_test, y_train, y_test, fold, outdir,
                 frac, step, simu):
    """
    Run dynamic RFE with development-set scoring on one CV fold and evaluate it.

    Feature elimination is done by ``dRFEtools.dev_rfe`` on the training
    fold. The feature-set sizes are then chosen from the lowess-smoothed
    curve; if that raises ``ValueError`` both sizes fall back to the size
    with the highest R2. The estimator is refit on all features, on the
    "redundant" set and on the "max" set, and each model predicts the
    held-out test fold supplied by the caller.

    Side effects: appends the test-fold predictions to
    ``<outdir>/test_predictions.txt`` (tab separated, header only when
    ``fold == 0``). The dRFEtools calls also receive ``outdir`` and
    presumably write their own files there.

    Args:
        estimator: regressor passed to dRFEtools and refit here.
        x_train, x_test: pandas DataFrames of features.
        y_train, y_test: pandas Series of targets.
        fold: fold number, used as a label and to decide header writing.
        outdir: directory for outputs.
        frac: lowess smoothing fraction.
        step: step size passed to the lowess "redundant" selection.
        simu: simulation number, used as a label.

    Returns:
        tuple: ``(df_elim, metrics_df)`` -- the per-step elimination scores
        and a one-row DataFrame of train (development split) and test metrics.
    """
    features = x_train.columns
    d, _ = dRFEtools.dev_rfe(estimator, x_train.values, y_train.values, features,
                             fold, outdir, elimination_rate=0.1, RANK=True)
    # One row per elimination step; d[k][1..3] are the scores labelled below.
    df_elim = pd.DataFrame([{'fold': fold, "simulation": simu,
                             'n features': k, 'R2 Score': d[k][1],
                             'Mean Square Error': d[k][2],
                             'Explained Variance': d[k][3]} for k in d])
    n_features_max = max(d, key=lambda x: d[x][1])
    try:
        ## Max features from lowess curve
        ### multiple classification is False by default
        n_features, _ = dRFEtools.extract_max_lowess(d, frac=frac)
        n_redundant, _ = dRFEtools.extract_redundant_lowess(d, frac=frac,
                                                            step_size=step)
        dRFEtools.plot_with_lowess_vline(d, fold, outdir, frac=frac,
                                         step_size=step, classify=False)
    except ValueError:
        ## For errors in lowess estimate
        n_features = n_features_max
        n_redundant = n_features
    ## Fit model
    # Carve a development split out of the training fold for the "train_*"
    # scores only. x_test / y_test must NOT be rebound here: they are the
    # held-out CV fold, and replacing them with training rows would make
    # every "test" prediction and metric leak training data.
    # NOTE(review): the split is unseeded, so train_* metrics vary between runs.
    x_dev, _, y_dev, _ = train_test_split(x_train, y_train)
    # d[k][4] is used as the column indexer for the features kept at size k.
    redundant_cols = d[n_redundant][4]
    max_cols = d[n_features][4]
    estimator.fit(x_train, y_train)
    all_fts = estimator.predict(x_test)
    estimator.fit(x_train.values[:, redundant_cols], y_train)
    labels_pred_redundant = estimator.predict(x_test.values[:, redundant_cols])
    # This fit must stay last: the dev scores below use the fitted estimator.
    estimator.fit(x_train.values[:, max_cols], y_train)
    labels_pred = estimator.predict(x_test.values[:, max_cols])
    ## Output test predictions
    pd.DataFrame({'fold': fold, "simulation": simu, 'real': y_test,
                  'predict_all': all_fts, 'predict_max': labels_pred,
                  'predict_redundant': labels_pred_redundant})\
      .to_csv("%s/test_predictions.txt" % outdir, sep='\t', mode='a', index=True,
              header=(fold == 0))
    x_dev_max = x_dev.values[:, max_cols]
    output = {
        'simulation': simu,
        'fold': fold,
        'n_features': n_features,
        'n_redundant': n_redundant,
        'n_max': n_features_max,
        'train_r2': dRFEtools.dev_score_r2(estimator, x_dev_max, y_dev),
        'train_mse': dRFEtools.dev_score_mse(estimator, x_dev_max, y_dev),
        'train_evar': dRFEtools.dev_score_evar(estimator, x_dev_max, y_dev),
        'test_r2': r2_score(y_test, labels_pred),
        'test_mse': mean_squared_error(y_test, labels_pred),
        'test_evar': evar(y_test, labels_pred, multioutput='uniform_average'),
    }
    metrics_df = pd.DataFrame.from_records(output, index=[simu]).reset_index().drop('index', axis=1)
    return df_elim, metrics_df

## Generate 10-fold cross-validation

In [None]:
# Shared splitter reused by every model section below: 10 shuffled folds,
# seeded with random_state=13.
cv = KFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)

### Ridge

#### Initialize

In [None]:
# Ridge regression: outputs for this section are written under `outdir`.
outdir = 'ridge/'
mkdir_p(outdir)
regr = dRFEtools.Ridge(
    random_state=13,
)

#### Optimize

In [None]:
X, y = load_data(0)
# Tune on the last CV split only; `fold` is that split's 1-based number and is
# passed to dRFEtools as the fold label.
splits = list(cv.split(X, y))
train_index, test_index = splits[-1]
fold = len(splits)
X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
# KFold yields row positions, so select with .iloc; plain y[...] is label-based.
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

features = X_train.columns
d, pfirst = dRFEtools.dev_rfe(regr, X_train.values, y_train.values, features,
                              fold, outdir, elimination_rate=0.1, RANK=False)

# Inspect the lowess fit for several smoothing fractions at a fixed step size.
for frac in [0.2, 0.25, 0.3, 0.35]:
    dRFEtools.optimize_lowess_plot(d, fold, outdir, frac=frac, step_size=0.05,
                                   classify=False, save_plot=False)

In [None]:
# Second tuning pass: hold the lowess fraction at 0.3 and vary the step size.
for step_size in (0.01, 0.02, 0.03, 0.04):
    dRFEtools.optimize_lowess_plot(
        d, fold, outdir,
        frac=0.3, step_size=step_size,
        classify=False, save_plot=False,
    )

#### Dynamic run

In [None]:
# Run dynamic RFE with 10-fold CV on each of the 15 simulated phenotypes and
# record the elapsed time per simulation.
# NOTE(review): every file is written in append mode, so re-running this cell
# without clearing `outdir` adds duplicate rows and a second header.
cpu_lt = []; simu_lt = []
for simu in range(15):
    X, y = load_data(simu)
    simu_out = "%s/simulate_%d" % (outdir, simu)
    mkdir_p(simu_out)
    frac = 0.3; step = 0.04; fold = 0
    elim_frames = []; metric_frames = []
    start = time()
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        # KFold yields row positions, so select with .iloc (not label lookup).
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        df_elim, metrics_df = run_regr_dev(regr, X_train, X_test,
                                           y_train, y_test, fold,
                                           simu_out, frac, step, simu)
        elim_frames.append(df_elim)
        metric_frames.append(metrics_df)
        fold += 1
    # Concatenate once per simulation instead of growing a frame every fold.
    df_dict = pd.concat(elim_frames, axis=0)
    output = pd.concat(metric_frames, axis=0)
    end = time()
    df_dict.to_csv("%s/dRFE_simulation_elimination.txt" % outdir,
                   sep='\t', mode='a', index=False,
                   header=(simu == 0))
    output.to_csv("%s/dRFE_simulation_metrics.txt" % outdir,
                  sep='\t', mode='a', index=False,
                  header=(simu == 0))
    # `time()` is wall-clock time, although the output column says "CPU Time".
    cpu_lt.append(end - start)
    simu_lt.append(simu)
pd.DataFrame({"Simulation": simu_lt, "CPU Time": cpu_lt})\
  .to_csv("%s/simulation_time.csv" % outdir, index=False)

### Elastic Net

#### Initialize

In [None]:
# Elastic net: outputs for this section are written under `outdir`.
outdir = 'enet/'
mkdir_p(outdir)
regr = dRFEtools.ElasticNet(
    alpha=0.01,
    random_state=13,
)

#### Optimize

In [None]:
X, y = load_data(0)
# Tune on the last CV split only; `fold` is that split's 1-based number and is
# passed to dRFEtools as the fold label.
splits = list(cv.split(X, y))
train_index, test_index = splits[-1]
fold = len(splits)
X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
# KFold yields row positions, so select with .iloc; plain y[...] is label-based.
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

features = X_train.columns
d, pfirst = dRFEtools.dev_rfe(regr, X_train.values, y_train.values, features,
                              fold, outdir, elimination_rate=0.1, RANK=False)

# Inspect the lowess fit for several smoothing fractions at a fixed step size.
for frac in [0.2, 0.25, 0.3, 0.35]:
    dRFEtools.optimize_lowess_plot(d, fold, outdir, frac=frac, step_size=0.05,
                                   classify=False, save_plot=False)

In [None]:
# Second tuning pass: hold the lowess fraction at 0.3 and vary the step size.
for step_size in (0.01, 0.02, 0.03, 0.04):
    dRFEtools.optimize_lowess_plot(
        d, fold, outdir,
        frac=0.3, step_size=step_size,
        classify=False, save_plot=False,
    )

#### Dynamic run

In [None]:
# Run dynamic RFE with 10-fold CV on each of the 15 simulated phenotypes and
# record the elapsed time per simulation.
# NOTE(review): every file is written in append mode, so re-running this cell
# without clearing `outdir` adds duplicate rows and a second header.
cpu_lt = []; simu_lt = []
for simu in range(15):
    X, y = load_data(simu)
    simu_out = "%s/simulate_%d" % (outdir, simu)
    mkdir_p(simu_out)
    frac = 0.3; step = 0.01; fold = 0
    elim_frames = []; metric_frames = []
    start = time()
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        # KFold yields row positions, so select with .iloc (not label lookup).
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        df_elim, metrics_df = run_regr_dev(regr, X_train, X_test,
                                           y_train, y_test, fold,
                                           simu_out, frac, step, simu)
        elim_frames.append(df_elim)
        metric_frames.append(metrics_df)
        fold += 1
    # Concatenate once per simulation instead of growing a frame every fold.
    df_dict = pd.concat(elim_frames, axis=0)
    output = pd.concat(metric_frames, axis=0)
    end = time()
    df_dict.to_csv("%s/dRFE_simulation_elimination.txt" % outdir,
                   sep='\t', mode='a', index=False,
                   header=(simu == 0))
    output.to_csv("%s/dRFE_simulation_metrics.txt" % outdir,
                  sep='\t', mode='a', index=False,
                  header=(simu == 0))
    # `time()` is wall-clock time, although the output column says "CPU Time".
    cpu_lt.append(end - start)
    simu_lt.append(simu)
pd.DataFrame({"Simulation": simu_lt, "CPU Time": cpu_lt})\
  .to_csv("%s/simulation_time.csv" % outdir, index=False)

### SVR linear kernel

#### Initialize

In [None]:
# Linear-kernel SVR: outputs for this section are written under `outdir`.
outdir = 'svr/'
mkdir_p(outdir)
regr = dRFEtools.LinearSVR(
    random_state=13,
    max_iter=10000,
)

#### Optimize

In [None]:
X, y = load_data(0)
# Tune on the last CV split only; `fold` is that split's 1-based number and is
# passed to dRFEtools as the fold label.
splits = list(cv.split(X, y))
train_index, test_index = splits[-1]
fold = len(splits)
X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
# KFold yields row positions, so select with .iloc; plain y[...] is label-based.
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

features = X_train.columns
d, pfirst = dRFEtools.dev_rfe(regr, X_train.values, y_train.values, features,
                              fold, outdir, elimination_rate=0.1, RANK=False)

# Inspect the lowess fit for several smoothing fractions at a fixed step size.
for frac in [0.2, 0.25, 0.3, 0.35]:
    dRFEtools.optimize_lowess_plot(d, fold, outdir, frac=frac, step_size=0.05,
                                   classify=False, save_plot=False)

In [None]:
# Second tuning pass: hold the lowess fraction at 0.25 and vary the step size.
for step_size in (0.01, 0.02, 0.03, 0.04):
    dRFEtools.optimize_lowess_plot(
        d, fold, outdir,
        frac=0.25, step_size=step_size,
        classify=False, save_plot=False,
    )

#### Dynamic run

In [None]:
# Run dynamic RFE with 10-fold CV on each of the 15 simulated phenotypes and
# record the elapsed time per simulation.
# NOTE(review): every file is written in append mode, so re-running this cell
# without clearing `outdir` adds duplicate rows and a second header.
# NOTE(review): the step-size sweep in the tuning cell used frac=0.25, while
# this run uses frac=0.20 -- confirm that is intended.
cpu_lt = []; simu_lt = []
for simu in range(15):
    simu_out = "%s/simulate_%d" % (outdir, simu)
    mkdir_p(simu_out)
    X, y = load_data(simu)
    frac = 0.20; step = 0.03; fold = 0
    elim_frames = []; metric_frames = []
    start = time()
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        # KFold yields row positions, so select with .iloc (not label lookup).
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        df_elim, metrics_df = run_regr_dev(regr, X_train, X_test,
                                           y_train, y_test, fold,
                                           simu_out, frac, step, simu)
        elim_frames.append(df_elim)
        metric_frames.append(metrics_df)
        fold += 1
    # Concatenate once per simulation instead of growing a frame every fold.
    df_dict = pd.concat(elim_frames, axis=0)
    output = pd.concat(metric_frames, axis=0)
    end = time()
    df_dict.to_csv("%s/dRFE_simulation_elimination.txt" % outdir,
                   sep='\t', mode='a', index=False,
                   header=(simu == 0))
    output.to_csv("%s/dRFE_simulation_metrics.txt" % outdir,
                  sep='\t', mode='a', index=False,
                  header=(simu == 0))
    # `time()` is wall-clock time, although the output column says "CPU Time".
    cpu_lt.append(end - start)
    simu_lt.append(simu)
pd.DataFrame({"Simulation": simu_lt, "CPU Time": cpu_lt})\
  .to_csv("%s/simulation_time.csv" % outdir, index=False)

### Random forest

#### Initialize

In [None]:
# Random forest: outputs for this section are written under `outdir`.
# oob_score=True is set because this section scores with out-of-bag helpers.
outdir = 'rf/'
mkdir_p(outdir)
regr = dRFEtools.RandomForestRegressor(
    n_estimators=100,
    oob_score=True,
    n_jobs=-1,
    random_state=13,
)

#### Optimize

In [None]:
X, y = load_data(0)
# Tune on the last CV split only; `fold` is that split's 1-based number and is
# passed to dRFEtools as the fold label.
splits = list(cv.split(X, y))
train_index, test_index = splits[-1]
fold = len(splits)
X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
# KFold yields row positions, so select with .iloc; plain y[...] is label-based.
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

features = X_train.columns
# NOTE(review): tuning uses elimination_rate=0.2, whereas run_regr_oob uses
# 0.1 for the actual runs -- confirm that is intended.
d, pfirst = dRFEtools.rf_rfe(regr, X_train.values, y_train.values, features,
                             fold, outdir, elimination_rate=0.2, RANK=False)

# Inspect the lowess fit for several smoothing fractions at a fixed step size.
for frac in [0.2, 0.25, 0.3, 0.35]:
    dRFEtools.optimize_lowess_plot(d, fold, outdir, frac=frac, step_size=0.05,
                                   classify=False, save_plot=False)

In [None]:
# Second tuning pass: hold the lowess fraction at 0.3 and vary the step size.
for step_size in (0.01, 0.02, 0.03, 0.04):
    dRFEtools.optimize_lowess_plot(
        d, fold, outdir,
        frac=0.3, step_size=step_size,
        classify=False, save_plot=False,
    )

#### Dynamic run

In [None]:
# Run dynamic RFE with 10-fold CV on each of the 15 simulated phenotypes and
# record the elapsed time per simulation.
# NOTE(review): every file is written in append mode, so re-running this cell
# without clearing `outdir` adds duplicate rows and a second header.
cpu_lt = []; simu_lt = []
for simu in range(15):
    X, y = load_data(simu)
    simu_out = "%s/simulate_%d" % (outdir, simu)
    mkdir_p(simu_out)
    frac = 0.3; step = 0.04; fold = 0
    elim_frames = []; metric_frames = []
    start = time()
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        # KFold yields row positions, so select with .iloc (not label lookup).
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        df_elim, metrics_df = run_regr_oob(regr, X_train, X_test,
                                           y_train, y_test, fold,
                                           simu_out, frac, step, simu)
        elim_frames.append(df_elim)
        metric_frames.append(metrics_df)
        fold += 1
    # Concatenate once per simulation instead of growing a frame every fold.
    df_dict = pd.concat(elim_frames, axis=0)
    output = pd.concat(metric_frames, axis=0)
    end = time()
    df_dict.to_csv("%s/dRFE_simulation_elimination.txt" % outdir,
                   sep='\t', mode='a', index=False,
                   header=(simu == 0))
    output.to_csv("%s/dRFE_simulation_metrics.txt" % outdir,
                  sep='\t', mode='a', index=False,
                  header=(simu == 0))
    # `time()` is wall-clock time, not CPU time.
    cpu_lt.append(end - start)
    simu_lt.append(simu)
pd.DataFrame({"Simulation": simu_lt, "CPU Time": cpu_lt})\
