Cross-validation (CV) splits for the Atlas and DE Shaw datasets.

In [1]:
"""
ENV
"""
import numpy as np
from numpy.random import RandomState

"""
Splitting seeds by protein dataset
"""
# Fixed integer seeds, one per protein dataset. A seed is meant to be passed
# as `seed` to get_cv_idx_l so that the fold assignment for that dataset is
# reproducible across runs.
# NOTE(review): the suffixes look like protein names (GB3, BPTI, UBIQ) and
# PDB-style IDs (1bxy, 1bx7, 1ptq); which ones belong to the Atlas vs. the
# DE Shaw collection is not visible here -- confirm before relying on it.
CV_SEED_GB3 = 243858
CV_SEED_BPTI = 647899
CV_SEED_UBIQ = 187349
CV_SEED_1bxy = 133538
CV_SEED_1bx7 = 988573
CV_SEED_1ptq = 781593

"""
Splitting function
"""
def get_cv_idx_l(seed, dataset_size, k):
    """
    Generates a list of validation-set index arrays, one per fold,
    for k-fold cross-validation.

    The indices 0 .. dataset_size - 1 are shuffled with a RandomState
    seeded by `seed` and then cut into k disjoint, nearly equal parts
    (np.array_split gives the first `dataset_size % k` folds one extra
    index). The same (seed, dataset_size, k) always yields the same folds.

    Args:
        seed: seed handed to numpy's RandomState.
        dataset_size: number of samples to index.
        k: number of folds; must satisfy 1 <= k <= dataset_size.

    Returns:
        A list of k 1-d integer index arrays. Together they contain every
        index in range(dataset_size) exactly once; entry i holds the
        validation indices of fold i.

    Raises:
        ValueError: if k < 1, or if k > dataset_size (which would
            otherwise silently produce empty validation folds).
    """
    if k < 1:
        raise ValueError(
            'k must be at least 1, got k={}'.format(k))
    if k > dataset_size:
        # np.array_split would happily return empty arrays here, giving
        # folds with nothing to validate on; fail loudly instead.
        raise ValueError(
            'k={} folds requested but dataset_size is only {}'.format(
                k, dataset_size))

    rs = RandomState(seed)
    idx = np.arange(dataset_size)
    rs.shuffle(idx)  # in-place, deterministic for a given seed
    return np.array_split(idx, k)



In [7]:
"""
Example usage
"""
# Validation index arrays for a 5-fold split of 100 samples, made
# reproducible by the GB3 seed.
idx_l = get_cv_idx_l(seed=CV_SEED_GB3, dataset_size=100, k=5)

# Toy dataset in which sample i simply holds the value i.
ex_data = np.arange(100)

for fold_i, idx in enumerate(idx_l):
    # Start with every sample marked as training, then switch off the
    # ones that belong to this fold's validation set.
    train_mask = np.ones(len(ex_data), dtype=bool)
    train_mask[idx] = False

    valid = ex_data[idx]          # keeps the shuffled order of idx
    train = ex_data[train_mask]   # remaining samples, in original order

    print('fold', fold_i + 1)
    print('train', train)
    print('valid', valid, '\n')

fold 1
train [ 0  1  2  3  4  6  8  9 10 11 12 13 15 16 17 18 19 20 21 22 23 24 25 27
 28 29 30 31 32 33 35 36 37 38 39 40 42 43 45 46 47 48 50 51 52 53 54 56
 57 58 59 61 62 64 66 67 69 70 71 72 73 75 76 77 81 82 83 85 86 87 88 89
 90 91 92 93 95 97 98 99]
valid [65 74  7 68 96 34 79 78 14 63 60 84 55 49 41 80 44 94  5 26] 

fold 2
train [ 3  4  5  6  7  8  9 10 12 13 14 16 17 18 19 20 21 22 23 24 26 27 28 29
 31 33 34 35 36 37 38 40 41 42 44 45 46 47 48 49 50 51 53 54 55 56 57 58
 60 63 64 65 66 67 68 69 72 74 75 76 77 78 79 80 82 83 84 85 86 87 88 90
 91 93 94 95 96 97 98 99]
valid [ 2 11 32 43 89 92 81  0 73 71 30 52 61 25 15 59 39 70  1 62] 

fold 3
train [ 0  1  2  4  5  6  7 10 11 12 13 14 15 16 17 18 19 20 21 22 25 26 28 29
 30 31 32 33 34 35 36 37 39 40 41 42 43 44 46 48 49 51 52 53 55 56 59 60
 61 62 63 65 68 70 71 72 73 74 75 76 78 79 80 81 83 84 85 86 87 88 89 90
 91 92 93 94 96 97 98 99]
valid [67 27  3 54 69 82 23 24 64 47 77 95 45  8 66 50  9 58 38 57] 

fold 4
train [ 0