## ModelsGetCorrs

This file gets the 10 correlations from the 10 folds by importing 10 fitted XGBoost models.

In [10]:
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy import stats
import matplotlib.pyplot as plt
import time

Change to whatever trait of interest.

In [11]:
data = pd.read_feather("C:/Users/gard_/Documents/MasterThesis/ProjectThesis/MyPipeline/Data/Processed/wingBV.feather")
data_models = pd.read_csv("C:/Users/gard_/Documents/MasterThesis/ProjectThesis/MyPipeline/Models/wing_180k_params.csv")
df = pd.read_csv('C:/Users/gard_/Documents/MasterThesis/ProjectThesis/MyPipeline/Data/CVfolds/cv_folds_wing_180k.csv')
file_name = "C:/Users/gard_/Documents/MasterThesis/ProjectThesis/MyPipeline/Results/corr_XGB_wing.csv"

In [12]:
mods = np.empty(10, dtype=object)
# Loop through each row to create and store the model
for i in range(len(data_models)):
    row = data_models.iloc[i]
    
    # Initialize the model with parameters from the current row
    model = xgb.XGBRegressor(
        n_estimators=600,
        learning_rate=row['learning_rate'],
        max_depth=int(row['max_depth']),  
        subsample=row['subsample'],
        colsample_bytree=row['colsample_bytree'],
        min_child_weight=int(row['min_child_weight'])
    )
    
    # Store the model in the numpy array
    mods[i] = model

In [13]:
# X is ringnrs + all SNPs
X_CV = data.drop([
            "ID",
            "mean_pheno",
            "FID",
            "MAT",
            "PAT",
            "SEX",
            "PHENOTYPE",
            "hatchisland"
        ], axis = 1)

# Some of the SNPS have NA-values. Set to 0
X_CV = X_CV.fillna(0)
# Change from float to int64 for all columns not 'ringnr' (i.e. all SNPs)
X_temp = X_CV.drop(['ringnr'], axis = 1)
X_temp = X_temp.T.astype('int64').T
X_temp.insert(0, 'ringnr', X_CV['ringnr'])
X_CV = X_temp

# y is ringnrs + pseudo phenotype
y_CV = data[['ID', 'ringnr']]
y_CV_mean = data[['mean_pheno', 'ringnr']]

In [14]:
def time_convert(seconds):
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds = seconds % 60
    return f"{hours} hours, {minutes} minutes and {seconds} seconds"

In [17]:
corr_arr = np.zeros(10)
start_time = time.time()
def get_corrs():
    """Function to run through CV-folds getting correlations
    on predefined models."""
    for i in range(1,11):
        print("Starting run ", i, "at time ", time_convert(time.time() - start_time), "\n")
        test_idx = df[(df['Fold'] == i) & (df['Set'] == 'test')]['ringnr'].values
        train_val_idx = df[(df['Fold'] == i) & (df['Set'] == 'train')]['ringnr'].values
        X_train_val = X_CV[X_CV["ringnr"].isin(train_val_idx)].drop(["ringnr",],axis = 1)
        y_train_val = y_CV[y_CV['ringnr'].isin(train_val_idx)].drop(["ringnr",], axis = 1)
        X_test = X_CV[X_CV["ringnr"].isin(test_idx)].drop(["ringnr",], axis = 1)
        y_mean_test = y_CV_mean[y_CV_mean["ringnr"].isin(test_idx)].drop(["ringnr",], axis = 1)

        model = mods[i-1]
        model.fit(X_train_val, y_train_val, verbose = False)
        predictions = model.predict(X_test)
        res = stats.pearsonr(y_mean_test.iloc[:,0], predictions)[0]
        corr_arr[i-1] = res
        
    return(corr_arr)

In [20]:
corr_arr = get_corrs()

Starting run  1 at time  0.0 hours, 0.0 minutes and 25.360984086990356 seconds 

Starting run  2 at time  0.0 hours, 36.0 minutes and 18.03554892539978 seconds 

Starting run  3 at time  1.0 hours, 5.0 minutes and 33.264954805374146 seconds 

Starting run  4 at time  1.0 hours, 29.0 minutes and 10.655850172042847 seconds 

Starting run  5 at time  2.0 hours, 1.0 minutes and 16.320937633514404 seconds 

Starting run  6 at time  2.0 hours, 41.0 minutes and 24.546459674835205 seconds 

Starting run  7 at time  3.0 hours, 14.0 minutes and 43.190431118011475 seconds 

Starting run  8 at time  3.0 hours, 44.0 minutes and 56.523322105407715 seconds 

Starting run  9 at time  4.0 hours, 20.0 minutes and 59.74850153923035 seconds 

Starting run  10 at time  4.0 hours, 52.0 minutes and 57.83616375923157 seconds 



In [21]:
np.savetxt(file_name, corr_arr, delimiter=",")

Plotting the result if we want.

In [None]:
plt.boxplot(corr_arr)
plt.show()