In [24]:
import numpy as np
import pandas as pd
import os
from pathlib import Path

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold

In [25]:
# Root directory for the CV split tree, taken from the kernel's current working directory.
home = os.getcwd()
home

'/users/qdb16186/dev'

In [26]:
def path_fold(home,resample,i_fold):
    """Return the directory of one inner fold, creating it if it is missing.

    Parameters
    ----------
    home : str or path-like
        Root directory under which the ``CV`` tree is kept.
    resample : int or str
        Identifier of the outer resample; becomes a sub-directory name.
    i_fold : int or str
        Identifier of the inner fold; becomes a sub-directory name.

    Returns
    -------
    pathlib.Path
        ``<home>/CV/<resample>/<i_fold>``. The directory exists on return.
    """
    # The location is derived from `home` only (never from the current working
    # directory) and the fold directory is named by the bare fold identifier.
    directory_path = Path(f"{home}/CV/{resample}/{i_fold}")

    # parents=True builds the intermediate levels; exist_ok=True makes repeated calls harmless.
    directory_path.mkdir(parents=True, exist_ok=True)

    return directory_path

def path_resample(home,resample):
    """Return the directory of one outer resample, creating it if it is missing.

    Parameters
    ----------
    home : str or path-like
        Root directory under which the ``CV`` tree is kept.
    resample : int or str
        Identifier of the outer resample; becomes a sub-directory name.

    Returns
    -------
    pathlib.Path
        ``<home>/CV/<resample>``. The directory exists on return.
    """
    # The location is derived from `home` only, never from the current working directory.
    directory_path = Path(f"{home}/CV/{resample}")

    # parents=True builds the intermediate levels; exist_ok=True makes repeated calls harmless.
    directory_path.mkdir(parents=True, exist_ok=True)

    return directory_path

def cv_hp(df,home):
    """Write the ID lists for 50 shuffled resamples, each with 5 inner folds.

    For every resample, the rows of ``df`` are shuffled and split 70/30 into a
    train+val part and a test part, and the train+val part is split 70/30 again
    into train and val. The resample's train set is then split in the same
    two-step way five times to form the inner folds. Only the ``ID`` column is
    saved (with the row index it carries), as ``train.csv``, ``val.csv`` and
    ``test.csv`` inside the directories returned by ``path_resample`` and
    ``path_fold``. Every splitter uses ``random_state=1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; must contain an ``ID`` column.
    home : str or path-like
        Root directory passed on to ``path_resample`` / ``path_fold``.

    Returns
    -------
    None
        A confirmation message is printed when all files are written.
    """
    resample_split  = ShuffleSplit(50, test_size=0.3, random_state=1)
    fold_split      = ShuffleSplit(5 , test_size=0.3, random_state=1)
    train_val_split = ShuffleSplit(1 , test_size=0.3, random_state=1)

    for resample, (train_val_index, test_index) in enumerate(resample_split.split(df)):
        train_val = pd.DataFrame(df['ID'].iloc[train_val_index])
        test = pd.DataFrame(df['ID'].iloc[test_index])

        # train_val_split yields exactly one split. Its positions are relative to
        # `train_val`, so they must index `train_val` (indexing `df` with them would
        # pick unrelated rows and could put test IDs into train/val).
        train_index, val_index = next(train_val_split.split(train_val))
        train = pd.DataFrame(train_val['ID'].iloc[train_index])
        val   = pd.DataFrame(train_val['ID'].iloc[val_index])

        resample_path = path_resample(home,resample)
        train.to_csv(f'{resample_path}/train.csv')
        val.to_csv(f'{resample_path}/val.csv')
        test.to_csv(f'{resample_path}/test.csv')

        # Inner folds are drawn from the resample's train set only.
        for i_fold, (train_val_fold_index, test_fold_index) in enumerate(fold_split.split(train)):
            train_val_fold = pd.DataFrame(train['ID'].iloc[train_val_fold_index])
            test_fold = pd.DataFrame(train['ID'].iloc[test_fold_index])

            # Same single-split pattern as above, relative to `train_val_fold`.
            train_fold_index, val_fold_index = next(train_val_split.split(train_val_fold))
            train_fold = pd.DataFrame(train_val_fold['ID'].iloc[train_fold_index])
            val_fold   = pd.DataFrame(train_val_fold['ID'].iloc[val_fold_index])

            i_fold_path = path_fold(home,resample,i_fold)
            train_fold.to_csv(f'{i_fold_path}/train.csv')
            val_fold.to_csv(f'{i_fold_path}/val.csv')
            test_fold.to_csv(f'{i_fold_path}/test.csv')

    print("data organised into 50 CV with 5-fold inner CV")
    return None



In [27]:
def access_fold_csv(df,home,resample,fold):
    """Load the train/val/test rows of ``df`` belonging to one inner fold.

    The fold directory is obtained from ``path_fold`` (which creates it if it
    does not exist). The three CSV files found there supply ``ID`` values, and
    ``df`` is filtered down to the rows whose ``ID`` appears in each file.

    Parameters
    ----------
    df : pandas.DataFrame
        Full table; must contain an ``ID`` column.
    home, resample, fold
        Forwarded to ``path_fold`` to locate the fold directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df, test_df)``, each a row subset of ``df`` in
        ``df``'s own row order.
    """
    fold_dir = path_fold(home,resample,fold)

    # Read all three ID files first, then filter df against each in turn.
    id_tables = [
        pd.read_csv(f'{fold_dir}/{file_name}')
        for file_name in ('train.csv', 'val.csv', 'test.csv')
    ]
    train_df, val_df, test_df = (
        df[df["ID"].isin(id_table['ID'])] for id_table in id_tables
    )
    return train_df, val_df, test_df


def access_resample_csv(df,home,resample):
    """Load the train/val/test rows of ``df`` belonging to one outer resample.

    The resample directory is obtained from ``path_resample`` (which creates
    it if it does not exist). The three CSV files found there supply ``ID``
    values, and ``df`` is filtered down to the rows whose ``ID`` appears in
    each file.

    Parameters
    ----------
    df : pandas.DataFrame
        Full table; must contain an ``ID`` column.
    home, resample
        Forwarded to ``path_resample`` to locate the resample directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df, test_df)``, each a row subset of ``df`` in
        ``df``'s own row order.
    """
    resample_dir = path_resample(home,resample)

    # Read all three ID files up front, keeping the original read order.
    train_ids = pd.read_csv(f'{resample_dir}/train.csv')
    val_ids = pd.read_csv(f'{resample_dir}/val.csv')
    test_ids = pd.read_csv(f'{resample_dir}/test.csv')

    # Boolean masks select the rows of df whose ID is listed in each file.
    in_train = df["ID"].isin(train_ids['ID'])
    train_df = df[in_train]
    in_val = df["ID"].isin(val_ids['ID'])
    val_df = df[in_val]
    in_test = df["ID"].isin(test_ids['ID'])
    test_df = df[in_test]

    return train_df, val_df, test_df

In [28]:
def padding(X_descr_train_scaled, n_features=250):
    """Right-pad a feature table with zero columns up to a fixed width.

    Used by ``load_xy`` so that the feature matrix always has the same number
    of columns regardless of which descriptor family was selected.
    NOTE(review): the zeros are appended to whatever values are passed in;
    if the features are scaled, confirm the padding does not affect the scaling.

    Parameters
    ----------
    X_descr_train_scaled : pandas.DataFrame
        Feature table with at most ``n_features`` columns.
    n_features : int, default 250
        Width of the returned array.

    Returns
    -------
    numpy.ndarray
        Array with the same rows as the input and ``n_features`` columns: the
        input values followed by float zeros.

    Raises
    ------
    ValueError
        If the input has more than ``n_features`` columns.
    """
    values = X_descr_train_scaled.to_numpy()
    n_rows, n_cols = values.shape

    n_missing = n_features - n_cols
    if n_missing < 0:
        # Fail with an explicit message instead of numpy's "negative dimensions" error.
        raise ValueError(
            f"cannot pad {n_cols} feature columns to a width of {n_features}"
        )

    zero_block = np.zeros((n_rows, n_missing))
    return np.concatenate((values, zero_block), axis=1)


def load_xy(file,desc):
    """Split a data table into target arrays and descriptor features.

    Works on any table that has the four target columns (presumably the full
    set or a train / val / test subset -- confirm against callers).

    Parameters
    ----------
    file : pandas.DataFrame
        Table containing the columns ``dH``, ``dS``, ``dG`` and ``Tm`` plus
        the descriptor columns.
    desc : str
        Descriptor family; every column whose name contains ``f'{desc}_'``
        is taken as a feature.

    Returns
    -------
    tuple
        ``(y_1, y_2, y_3, y_4, Y, X_padded, X)`` where ``y_1..y_4`` are 1-D
        arrays of ``dH``, ``dS``, ``dG`` and ``Tm``, ``Y`` is a copy of the
        four target columns as a DataFrame, ``X`` is the selected feature
        DataFrame and ``X_padded`` is ``padding(X)``.
    """
    def as_vector(column):
        # Copy the single column out of the table and flatten (n, 1) -> (n,).
        values = file[[column]].copy().to_numpy()
        return values.reshape(values.shape[0])

    y_1, y_2, y_3, y_4 = (as_vector(column) for column in ('dH', 'dS', 'dG', 'Tm'))

    # All four targets together, kept as a DataFrame.
    Y = file[['dH','dS','dG','Tm']].copy()

    # Feature columns are matched by substring on the descriptor tag.
    tag = f'{desc}_'
    feature_columns = [name for name in file.columns if tag in name]
    X = file[feature_columns]

    return y_1, y_2, y_3, y_4, Y, padding(X), X

    

    

In [29]:
# Load the full dataset from the working directory.
df=pd.read_csv("Lomzov_dataset_IY.csv")
# Split generation is left disabled below; the access_* calls that follow read
# train/val/test CSVs from under <home>/CV, so those files must already exist.
# home=os.getcwd()
# pathlib.Path("Lomzov_dataset_IY.csv").parent.absolute()
# cv_hp(df,home)

# Select one outer resample and one of its inner folds.
resample=1
fold=1
train, val, test = access_resample_csv(df,home,resample)
train_fold, val_fold, test_fold = access_fold_csv(df,home,resample,fold)

# Descriptor family used as features; only the inner-fold frames are converted to X/y here.
desc='CountDNAp'
y_1_train, y_2_train, y_3_train, y_4_train, Y_train, X_padded_train, X_train = load_xy(train_fold,desc)
y_1_val,   y_2_val,   y_3_val,   y_4_val,   Y_val,   X_padded_val,   X_val   = load_xy(val_fold,desc)
y_1_test,  y_2_test,  y_3_test,  y_4_test,  Y_test,  X_padded_test,  X_test  = load_xy(test_fold,desc)


In [30]:
# Display the loaded dataset.
df

Unnamed: 0,ID,Branch,DNA,dH,dS,dG,Tm,RF-Score_C----C,RF-Score_N----C,RF-Score_O----C,...,CountDNA_pos_A,CountDNA_pos_G,CountDNA_pos_C,CountDNA_pos_T,CountDNA_pos_Tot,CountDNAp_pos_A,CountDNAp_pos_G,CountDNAp_pos_C,CountDNAp_pos_T,CountDNAp_pos_Tot
0,1,8DD,aaaaaaaa,-46.3,-134.0,-4.7,16.9,2801,1752,1162,...,8,0,0,0,8,1.000,0.0000,0.0000,0.000,8
1,2,8DD,aaaaaaaaa,-59.8,-173.0,-6.1,27.9,3281,2019,1424,...,9,0,0,0,9,1.000,0.0000,0.0000,0.000,9
2,3,8DD,aaaaaaaaaa,-78.3,-231.0,-6.7,32.0,3753,2309,1601,...,10,0,0,0,10,1.000,0.0000,0.0000,0.000,10
3,4,1A,aactggac,-59.8,-168.0,-7.8,35.7,2656,1372,1441,...,3,2,2,1,8,0.375,0.2500,0.2500,0.125,8
4,5,5AA,aagcgtag,-55.4,-153.0,-8.0,37.0,2646,1418,1441,...,3,3,1,1,8,0.375,0.3750,0.1250,0.125,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,301,5AA,ttcattcc,-50.0,-140.0,-6.5,28.7,2669,986,1715,...,1,0,3,4,8,0.125,0.0000,0.3750,0.500,8
301,302,1A,ttctggac,-56.2,-158.0,-7.3,32.9,2621,1172,1584,...,1,2,2,3,8,0.125,0.2500,0.2500,0.375,8
302,303,5AA,ttgaagatacgctggc,-119.0,-322.0,-19.0,69.2,5775,2985,3074,...,4,5,3,4,16,0.250,0.3125,0.1875,0.250,16
303,304,1A,ttgtggac,-55.5,-154.0,-7.8,35.8,2610,1236,1551,...,1,3,1,3,8,0.125,0.3750,0.1250,0.375,8


In [38]:
# Inspect a 50-name window of the column index, shifted by n (here positions 300-349).
n=100
df.columns[200+n:250+n]

Index(['OHEP_28_pos_7_T', 'OHEP_29_pos_8_A', 'OHEP_30_pos_8_C',
       'OHEP_31_pos_8_G', 'OHEP_32_pos_8_T', 'OHEP_33_pos_9_A',
       'OHEP_34_pos_9_C', 'OHEP_35_pos_9_G', 'OHEP_36_pos_9_T',
       'OHEP_37_pos_10_A', 'OHEP_38_pos_10_C', 'OHEP_39_pos_10_G',
       'OHEP_40_pos_10_T', 'OHEP_41_pos_11_A', 'OHEP_42_pos_11_C',
       'OHEP_43_pos_11_G', 'OHEP_44_pos_11_T', 'OHEP_45_pos_12_A',
       'OHEP_46_pos_12_C', 'OHEP_47_pos_12_G', 'OHEP_48_pos_12_T',
       'OHEP_49_pos_13_A', 'OHEP_50_pos_13_C', 'OHEP_51_pos_13_G',
       'OHEP_52_pos_13_T', 'OHEP_53_pos_14_A', 'OHEP_54_pos_14_C',
       'OHEP_55_pos_14_G', 'OHEP_56_pos_14_T', 'OHEP_57_pos_15_A',
       'OHEP_58_pos_15_C', 'OHEP_59_pos_15_G', 'OHEP_60_pos_15_T',
       'OHEP_61_pos_16_A', 'OHEP_62_pos_16_C', 'OHEP_63_pos_16_G',
       'OHEP_64_pos_16_T', 'OHEP_65_pos_17_A', 'OHEP_66_pos_17_C',
       'OHEP_67_pos_17_G', 'OHEP_68_pos_17_T', 'OHEP_69_pos_18_A',
       'OHEP_70_pos_18_C', 'OHEP_71_pos_18_G', 'OHEP_72_pos_18_T',
    