In [8]:
import random
from training_utils.file_utils import open_json, write_json
from training_utils.model_utils import load_dataset, compute_feature_list
from sklearn.model_selection import KFold

In [2]:
# Read the notebook configuration from the JSON file next to the notebook
config = open_json("simple_nn_config.json")
# Directory holding the dataset, taken from the "dataset_dir" config entry
DIR_PATH = config["dataset_dir"]
# features.json lives inside the dataset directory
features_dict = open_json(f"{DIR_PATH}/features.json")
# compute_feature_list is a project helper; presumably it selects the features
# enabled in the config — TODO confirm against training_utils.model_utils
features, features_infos = compute_feature_list(config, features_dict)

# Load the dataset for the selected features (the "training on N data" line in
# the cell output appears to be printed by this call — verify)
df = load_dataset(config, features)

training on 6012 data


In [9]:
# Helpers to pack weighted groups into a fixed number of folds using first fit

def shuffle_dict(d: dict):
    """Return a new dict with the entries of ``d`` in a random key order.

    The input dict is not modified. The order is drawn from the module-level
    ``random`` generator, so seed it beforehand for reproducible results.
    """
    shuffled_keys = list(d.keys())
    random.shuffle(shuffled_keys)
    shuffled = {}
    for key in shuffled_keys:
        shuffled[key] = d[key]
    return shuffled



def GroupFit(weight: dict, max_weight: int, ksplit: int):
    """Pack weighted items into at most ``ksplit`` folds with the first-fit heuristic.

    Items are processed in the iteration order of ``weight``. Each item goes
    into the first already-opened fold that still has enough remaining
    capacity; if none has, the next unused fold is opened for it. First fit is
    a heuristic: feeding the items by decreasing weight ("first fit
    decreasing") usually packs tighter, but the result is not guaranteed to be
    optimal.

    Args:
        weight: mapping ``item key -> item weight``.
        max_weight: capacity of each fold.
        ksplit: number of folds available.

    Returns:
        A list of ``ksplit`` lists of item keys, one per fold. Folds that were
        never opened stay empty.

    Raises:
        IndexError: if an item fits in no opened fold and all ``ksplit`` folds
            are already in use.

    Note:
        An item heavier than ``max_weight`` is not rejected: it opens a new
        fold whose remaining capacity becomes negative.
    """
    folds = [[] for _ in range(ksplit)]
    # Remaining capacity of each fold opened so far (one entry per opened fold)
    remaining_space = []

    for key, item_weight in weight.items():
        # Look for the first opened fold that can accommodate the item
        for j, space in enumerate(remaining_space):
            if space >= item_weight:
                remaining_space[j] = space - item_weight
                folds[j].append(key)
                break
        else:
            # No opened fold fits: open the next one, if any is left
            opened = len(remaining_space)
            if opened >= ksplit:
                raise IndexError(
                    f"cannot place item {key!r} (weight {item_weight}): "
                    f"all ksplit={ksplit} folds of capacity {max_weight} are in use"
                )
            folds[opened].append(key)
            remaining_space.append(max_weight - item_weight)
    return folds


# Test program: pack a small toy example and print the resulting folds
weight = dict(a=2, b=5, c=7, d=1, e=3, f=8, g=4)
print(shuffle_dict(weight))
print(f"total weight = {sum(weight.values())}")
# we want 3 folds of 10, so we put max_weight to 12 (some tolerance)
ksplit = 3
max_weight = 12
# Feed the items to the packer by increasing weight
weight = dict(sorted(weight.items(), key=lambda entry: entry[1]))
folds = GroupFit(weight, max_weight, ksplit)
for fold in folds:
    print(fold)
    print([weight[name] for name in fold])


{'c': 7, 'd': 1, 'f': 8, 'b': 5, 'a': 2, 'e': 3, 'g': 4}
total weight = 30
['d', 'a', 'e', 'g']
[1, 2, 3, 4]
['b', 'c']
[5, 7]
['f']
[8]


In [6]:
# Apply the packing to our dataset. value_counts(sort=True) lists the groups by
# decreasing size, so first fit is fed the largest groups first.
weight = df.alphafold_path.value_counts(sort=True).to_dict()

print(f"total weight = {sum(weight.values())}")
# we want 5 folds of 6012/5, so we put max_weight to 1203 (exact would be 1202.4)
ksplit = 5
max_weight = 1203
folds = GroupFit(weight, max_weight, ksplit)

# Map every group key to the index of the fold it was assigned to
sorted_fixed_ksplit = {}
for k, fold in enumerate(folds):
    sorted_fixed_ksplit.update(dict.fromkeys(fold, k))

write_json("./ksplit/sorted_fixed_ksplit.json", sorted_fixed_ksplit)


total weight = 6012


In [13]:
# Apply the packing to our dataset on randomly shuffled group orders to build
# 10 alternative fold assignments (unsorted input generally packs less tightly,
# so some shuffles cannot be packed and are retried).
# we want 5 folds of 6012/5, so we put max_weight to 1205 (exact would be 1202.4)
ksplit = 5
max_weight = 1205
# Group sizes do not change between attempts, so compute them once
group_sizes = df.alphafold_path.value_counts(sort=False).to_dict()
# Safety cap so that an unsatisfiable packing cannot loop forever
max_attempts = 1000
attempts = 0
x = 0
while x < 10:
    if attempts >= max_attempts:
        raise RuntimeError(
            f"only {x} valid splits found after {max_attempts} shuffles; "
            "increase max_weight or ksplit"
        )
    attempts += 1
    print(x)
    weight = shuffle_dict(group_sizes)
    print(f"total weight = {sum(weight.values())}")
    try:
        folds = GroupFit(weight, max_weight, ksplit)
    except IndexError:
        # GroupFit ran out of folds for this ordering: retry with a new shuffle.
        # Only IndexError is caught so real bugs and Ctrl-C still surface.
        continue
    # Map every group key to the index of the fold it was assigned to
    unsorted_fixed_ksplit = {}
    for k, fold in enumerate(folds):
        unsorted_fixed_ksplit.update({v: k for v in fold})

    write_json(f"./ksplit/shuffled_{x}_fixed_ksplit.json", unsorted_fixed_ksplit)
    x += 1


0
total weight = 6012
1
total weight = 6012
2
total weight = 6012
2
total weight = 6012
3
total weight = 6012
4
total weight = 6012
5
total weight = 6012
6
total weight = 6012
7
total weight = 6012
8
total weight = 6012
9
total weight = 6012


In [11]:
# Sanity check: reload the dataset, tag every row with its fold index and
# print how many rows end up in each of the 5 folds.
# NOTE(review): no cell visible here writes "shuffled_fixed_ksplit.json" (only
# "sorted_fixed_ksplit.json" and "shuffled_<x>_fixed_ksplit.json") — confirm
# that this file name is the intended one.
fixed_ksplit = open_json("./ksplit/shuffled_fixed_ksplit.json")
df = load_dataset(config, features)
df["kfold"] = df["alphafold_path"].apply(lambda path: fixed_ksplit[path])
for k in range(5):
    print(sum(df.loc[df.kfold.eq(k), "alphafold_path"].value_counts().tolist()))


training on 6012 data
1203
1203
1203
1203
1200
