In [1]:
from timeit import default_timer as timer
from datetime import timedelta

import math
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from scipy.io import loadmat

import copy
import glob

In [2]:
# Load the subject table stored under the 'Subject' key of sub_id.mat, which is
# looked up in the current working directory.
subj = loadmat(os.getcwd() + '/' + 'sub_id.mat')
subid_arr = subj['Subject']

# Keep only the first element of every row of the table, as a plain Python list.
subid_list = []
for subid in subid_arr:
    subid_list.append(subid[0])

# File names of the structure and distance inputs, and the directory the notebook
# was started from (read as a global by create_dataFolder and to_dataset).
struct_p = "structures.mat"
dist_p = "distance.csv"
cur_dir = os.getcwd()

In [3]:
def transpose_add_flatten(dti_matrix):
    """Symmetrise a matrix and return it flattened.

    The matrix is averaged with its own transpose, ``(M + M.T) / 2``, and the
    result is returned as a 1-D array in row-major order.
    """
    return ((dti_matrix + dti_matrix.transpose()) / 2).flatten()

In [4]:
def _pairwise_similarity(values):
    """Return one similarity score per ordered pair of entries of ``values``.

    For every pair (i, j), in row-major order (i outer, j inner), the absolute
    difference ``|values[i] - values[j]|`` is computed, min-max normalised over
    all pairs and inverted, so identical entries score 1 and the most different
    pair scores 0.  If every entry is equal the normalisation is 0/0 and the
    result is NaN.
    """
    values = np.asarray(values)
    # Broadcasting builds the full n x n difference table in one step; ravel()
    # keeps the same (i outer, j inner) order a nested loop would produce.
    diffs = np.abs(values[:, None] - values[None, :]).ravel()
    lowest = diffs.min()
    highest = diffs.max()
    return 1 - (diffs - lowest) / (highest - lowest)


def read_structure(PATH, head):
    """Read a per-region structure table and turn it into pairwise features.

    Parameters
    ----------
    PATH : str
        CSV file whose first four columns are read, in order, as thickness,
        myelination, curvature and sulcus depth.
    head : bool or int
        Truthy if the first line of the CSV is a header row, falsy otherwise.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n * n, 4)`` for ``n`` rows in the CSV.  Column ``k``
        holds the pairwise similarity of column ``k`` of the table.
    """
    structure_arr = pd.read_csv(PATH, header=0 if head else None).to_numpy()
    return np.column_stack(
        [_pairwise_similarity(structure_arr[:, column]) for column in range(4)]
    )

In [5]:
def compute_threshold(arr, percentile):
    """Return the value found at a fractional position of the sorted input.

    ``percentile`` is a fraction: the element at index
    ``floor(len(arr) * percentile)`` of the ascending-sorted copy is returned.
    A fraction of exactly 1 selects the last (largest) element.
    """
    ordered = np.sort(copy.deepcopy(arr))
    if percentile == 1:
        position = len(arr) - 1
    else:
        position = int(np.floor(len(ordered) * percentile))
    return ordered[position]

In [6]:
def flattened_to_matrix(flattened):
    """Rebuild a square float matrix from its row-major flattened form.

    The side length is ``int(len(flattened) ** 0.5)``; if the input is longer
    than ``side * side`` the trailing elements are ignored.  The side length is
    printed.  The result is always a new float64 array.
    """
    side_len = int(len(flattened) ** 0.5)
    print("side length: " + str(side_len))
    # np.array (not asarray) forces a copy so the result never aliases the input.
    used = np.array(flattened[:side_len * side_len], dtype=float)
    return used.reshape(side_len, side_len)

In [7]:
# COMPUTE TOPOLOGICAL SIMILARITY BASED ON ADJACENCY MATRIX
# THE EQUATION USED IS THE JACCARD INDEX OF THE TWO NEIGHBOURHOODS:
#     Tij = (number of common neighbours of i and j)
#           / (number of nodes that neighbour i or j)

def to_topo(adjacency_matrix):
    """Compute the topological similarity of every pair of nodes.

    The neighbours of node ``i`` are the column indices of the non-zero
    entries of row ``i``.  ``T[i, j]`` is the size of the intersection of the
    two neighbour sets divided by the size of their union, and 0 when both
    nodes have no neighbours.  The side length is printed.

    Parameters
    ----------
    adjacency_matrix : 2-D array-like
        Square matrix; any non-zero entry counts as a link.

    Returns
    -------
    numpy.ndarray
        Float matrix of shape ``(side_len, side_len)``.
    """
    side_len = len(adjacency_matrix[0])
    print(side_len)

    # Build every neighbour set once instead of once per (i, j) pair.
    neighbours = [
        set(np.nonzero(adjacency_matrix[i])[0].tolist()) for i in range(side_len)
    ]

    T = np.zeros((side_len, side_len))
    for i in range(side_len):
        a = neighbours[i]
        for j in range(side_len):
            b = neighbours[j]
            union_size = len(a | b)
            # Count the neighbours (len), not their index values (sum): summing
            # indices makes node 0 weightless and yields 0/0 for a union of {0}.
            if union_size != 0:
                T[i, j] = len(a & b) / union_size

    return T

In [8]:
def save_localStructures(PATH, sub_idx, subid_list, struct_matlab_path):
    """Export the first four structure columns of one subject to CSV.

    Parameters
    ----------
    PATH : str
        Directory that contains the MATLAB file.
    sub_idx : int
        Index of the subject along the first axis of the ``'structures'`` array
        and in ``subid_list``.
    subid_list : sequence
        Subject identifiers; ``subid_list[sub_idx]`` names the output file.
    struct_matlab_path : str
        File name of the ``.mat`` file holding the ``'structures'`` variable.

    The file is written to ``dp_intermediate/localstructure_<id>.csv`` relative
    to the current working directory; that directory must already exist.
    """
    struct = loadmat(PATH + '/' + struct_matlab_path)
    inter = struct['structures'][sub_idx, :, 0:4]

    # The identifier was previously read from an undefined name; derive it from
    # the arguments so the function works without notebook-level globals.
    subject_ID = subid_list[sub_idx]

    # NOTE(review): the dataset builders read "structures_csv/localStructures_<id>.csv";
    # this writes a differently named file to a different folder - confirm whether a
    # manual rename/move step is intended.
    structure_csv = "localstructure_" + str(subject_ID) + ".csv"
    np.savetxt("dp_intermediate/" + structure_csv, inter, delimiter=",", fmt="%s")

In [9]:
def backboneDTI_to_topo(flattened_dti):
    """Turn a flattened adjacency array into its topological-similarity matrix.

    The flat array is first reshaped with ``flattened_to_matrix`` and the
    resulting square matrix is passed to ``to_topo``.
    """
    return to_topo(flattened_to_matrix(flattened_dti))

In [10]:
def removeByCoverage(coverage, side_len=180):
    """List the flattened matrix indices that fall outside the requested coverage.

    The matrix is ``2 * side_len`` wide and flattened row-major, so entry
    ``(row, col)`` has index ``row * 2 * side_len + col``.  It is treated as four
    ``side_len x side_len`` quadrants.

    Parameters
    ----------
    coverage : str
        ``"left"``   keeps the top-left quadrant,
        ``"right"``  keeps the bottom-right quadrant,
        ``"contra"`` keeps the top-right quadrant,
        any other value (e.g. ``"whole"``) keeps everything.
    side_len : int, optional
        Number of rows/columns per quadrant.  Defaults to 180.

    Returns
    -------
    list of int
        Indices to remove; empty when nothing is removed.
    """
    width = 2 * side_len
    half = side_len * width  # number of flattened entries in the top (or bottom) rows

    if coverage == "left":
        top_right = [r * width + side_len + c
                     for r in range(side_len) for c in range(side_len)]
        bottom_rows = list(range(half, 2 * half))
        return top_right + bottom_rows

    if coverage == "right":
        top_rows = list(range(half))
        bottom_left = [half + r * width + c
                       for r in range(side_len) for c in range(side_len)]
        return top_rows + bottom_left

    if coverage == "contra":
        bottom_rows = list(range(half, 2 * half))
        top_left = [r * width + c
                    for r in range(side_len) for c in range(side_len)]
        return bottom_rows + top_left

    return []

In [11]:
def dti_MatToCsv(mat_path, save_to, sub_id):
    """Convert the 'DTI' variable of a MATLAB file into a one-column CSV.

    The array stored under ``'DTI'`` in ``mat_path`` is flattened and written
    to ``<save_to>/<sub_id>.csv`` (path normalised for the current OS).
    """
    flat_dti = loadmat(mat_path)['DTI'].flatten()
    target = os.path.normpath(save_to + "/" + str(sub_id) + ".csv")
    np.savetxt(target, flat_dti, delimiter = ",")

In [12]:
def chunks(l, n):
    """Split a sequence into consecutive slices of at most ``n`` items.

    A chunk size below 1 is treated as 1.  The last slice may be shorter.
    Returns a list of slices; an empty input yields an empty list.
    """
    step = max(1, n)
    pieces = []
    for start in range(0, len(l), step):
        pieces.append(l[start:start + step])
    return pieces

In [13]:
def scaling(Y, is_log, beta):
    """Rescale target weights and min-max normalise them to [0, 1].

    Parameters
    ----------
    Y : numpy.ndarray
        Weights to rescale.
    is_log : bool
        ``False`` raises ``Y`` to the power ``beta``; anything else applies a
        natural log to ``Y + 1e-09``.
    beta : float
        Exponent used when ``is_log`` is ``False``; ignored otherwise.

    Returns
    -------
    numpy.ndarray
        The transformed values, shifted and divided so the minimum maps to 0
        and the maximum to 1.  The maximum before normalisation is printed.
    """
    # Equality test (not truthiness) is kept so callers passing None still get
    # the log branch, as before.
    if is_log == False:
        Y = Y ** beta
        # Report the exponent actually applied; this used to read a global
        # dictionary that is not an argument of this function.
        print("beta:  " + str(beta) + "Y_max_beta: " + str(Y.max()))
    else:
        Y = np.log(Y + 1e-09)
        print("log:  " + "Y_max_log: " + str(Y.max()))
    return (Y - Y.min()) / (Y.max() - Y.min())

In [None]:
def to_csv_datataset_graph(sub_idx, subid_list, path_dict, scaling_params):
    """Save the known-link adjacency matrix of one subject, or of the average, as CSV.

    The symmetrised DTI weights are filtered so that only links whose weight
    lies in the "known" scale range survive, and the result is written as a
    square matrix.

    Parameters
    ----------
    sub_idx : int
        Index into ``subid_list`` of the subject to process.
    subid_list : sequence
        Subject identifiers.
    path_dict : dict
        Reads ``"PATH"`` (data root) and, for per-subject output, ``"isTrain"``.
    scaling_params : dict
        Reads ``"isAverage"``, ``"weigh_topo"`` and ``"scales_known"`` here, plus
        whatever ``process_folderName`` reads to build the folder name.

    Output
    ------
    ``<PATH>/GraphAvgData/<name>/AVG_AM.csv`` in average mode, otherwise
    ``<PATH>/GraphData/<name>/(Train|Test)/<id>_AM.csv``.

    The distance matrix, the local-structure features and, in average mode, the
    per-subject DTI are not loaded: none of them contributes to the file written.
    """
    print("graph")
    PATH = path_dict["PATH"]
    is_average = scaling_params["isAverage"]
    subject_ID = subid_list[sub_idx]

    if is_average:
        dti_avg = pd.read_csv(PATH + "/dti_998avg.csv", header=None).to_numpy().flatten()
        dti = transpose_add_flatten(flattened_to_matrix(dti_avg))
        output_folder = PATH + "/GraphAvgData/"
        print("avg!graph")
    else:
        dti_np = pd.read_csv(PATH + "/dti_csv/dti_mat/" + str(subject_ID) + "_MMP_matrix.csv",
                             header=None).to_numpy()
        dti = transpose_add_flatten(dti_np)
        print(np.shape(dti))
        output_folder = PATH + "/GraphData/"

    # Every step below builds a new array, so ``dti`` itself is never modified.
    dti_copy = dti
    if scaling_params["weigh_topo"] == True:
        # apply log scale on weight, then min-max normalise
        dti_copy = np.log(dti_copy + 1e-09)
        dti_copy = (dti_copy - dti_copy.min()) / (dti_copy.max() - dti_copy.min())

    # A scale k stands for weights in [10**-k, 10**-(k-1)).
    max_scaleKnown = 1 / 10**(min(scaling_params["scales_known"]))
    min_scaleKnown = 1 / 10**(max(scaling_params["scales_known"]))

    # Keep weights inside the known range, zero everything else.
    dti_copy = np.where(dti_copy <= 10 * max_scaleKnown, dti_copy, 0)
    dti_copy = np.where(dti_copy >= min_scaleKnown, dti_copy, 0)
    if scaling_params["weigh_topo"] == False:
        # Unweighted mode: 1 for a known link, 0 otherwise.
        dti_copy = np.where(dti_copy > 0, 1, 0)

    adj_mat = flattened_to_matrix(dti_copy)

    output_folder += process_folderName(scaling_params)
    print(output_folder)
    if not is_average:
        if path_dict["isTrain"] == True:
            output_folder += "/Train"
        else:
            output_folder += "/Test"
    print(output_folder)

    if is_average:
        output_path = output_folder + "/" + "AVG_AM.csv"
    else:
        output_path = output_folder + "/" + str(subject_ID) + "_AM.csv"

    print(output_path)
    np.savetxt(output_path, adj_mat, delimiter=",", fmt="%s")


In [14]:
def to_csv_dataset_avg(path_dict, scaling_params):
    """Build the feature/target dataset of the average DTI and save it as K folds.

    Steps: load spatial proximity, the average DTI and the structure features;
    compute the topological similarity from the "known" links; keep only the
    links whose weight lies in the "to predict" range and inside the requested
    coverage; assign each kept link a scale category; split every category into
    K = 5 consecutive folds; rescale the targets per fold; write one CSV per fold.

    Parameters
    ----------
    path_dict : dict
        Reads ``"PATH"``, ``"distance_path"`` and ``"isTrain"``.
    scaling_params : dict
        Reads ``"scales_known"``, ``"scales_toPredict"``, ``"weigh_topo"``,
        ``"coverage"``, ``"apply_log"`` and ``"beta"``, plus whatever
        ``process_folderName`` reads.

    Output
    ------
    ``X_avg-f<k>.csv`` under ``<PATH>/AvgData/<name>/(Train|Test)/X`` and
    ``Y_prob_avg-f<k>.csv`` / ``Y_categorical_avg-f<k>.csv`` under ``.../Y``,
    for k = 1..5.
    """
    print("avg!")
    PATH = path_dict["PATH"]

    ############# LOAD FEATURES

    spatial_proximity = pd.read_csv(path_dict["distance_path"], header=None).to_numpy().flatten()
    spatial_proximity = ((spatial_proximity - spatial_proximity.min())
                         / (spatial_proximity.max() - spatial_proximity.min()))

    dti_avg = pd.read_csv(PATH + "/dti_998avg.csv", header=None).to_numpy().flatten()
    dti = transpose_add_flatten(flattened_to_matrix(dti_avg))

    structures = read_structure(PATH + "/structures.csv", 0)

    ############# SCALE BOUNDS
    # A scale k stands for weights in [10**-k, 10**-(k-1)).

    scales_known = scaling_params["scales_known"]
    scales_toPredict = scaling_params["scales_toPredict"]
    max_scaleKnown = 1 / 10**(min(scales_known))
    min_scaleKnown = 1 / 10**(max(scales_known))
    max_scaleToPredict = 1 / 10**(min(scales_toPredict))
    min_scaleToPredict = 1 / 10**(max(scales_toPredict))
    if max(scales_toPredict) == 8:
        # Scale 8 is the catch-all for the weakest links, zero weight included.
        min_scaleToPredict = 0

    # Category numbers of the strongest and weakest scale to predict.
    max_scaleToPredict_c = int(-np.floor(np.log10(max_scaleToPredict)))
    if min_scaleToPredict == 0:
        min_scaleToPredict_c = 8
    else:
        min_scaleToPredict_c = int(-np.floor(np.log10(min_scaleToPredict)))

    ############# COMPUTE TOPO

    # Every step below builds a new array, so ``dti`` itself is never modified.
    dti_copy = dti
    if scaling_params["weigh_topo"] == True:
        # apply log scale on weight to compute topological similarity
        dti_copy = np.log(dti_copy + 1e-09)
        dti_copy = (dti_copy - dti_copy.min()) / (dti_copy.max() - dti_copy.min())

    # Keep weights inside the known range, zero everything else.
    dti_copy = np.where(dti_copy <= 10 * max_scaleKnown, dti_copy, 0)
    dti_copy = np.where(dti_copy >= min_scaleKnown, dti_copy, 0)
    if scaling_params["weigh_topo"] == False:
        dti_copy = np.where(dti_copy > 0, 1, 0)

    topo_arr = backboneDTI_to_topo(dti_copy).flatten()
    X_t = np.column_stack((spatial_proximity, structures, topo_arr))

    ############# PRUNE AND LEAVE ONLY LINKS TO PREDICT

    print("ymax_beforeRemovingStrongLinks_wholeBrain: " + str(dti.max()))

    to_predict = (dti < max_scaleToPredict * 10) & (dti >= min_scaleToPredict)
    idx_exceptToPredict = np.flatnonzero(~to_predict).tolist()
    print(len(idx_exceptToPredict))

    idx_removed_byCoverage = removeByCoverage(scaling_params["coverage"])
    print(len(idx_removed_byCoverage))

    idx_removed = idx_removed_byCoverage + idx_exceptToPredict

    Y = np.delete(dti, idx_removed, 0)
    X = np.delete(X_t, idx_removed, 0)

    ############# CATEGORIES

    # Category = negated order of magnitude of the weight.  This conversion was
    # missing here, which left float weights where integer categories are needed.
    Y_categorical = -np.floor(np.log10(Y + 1e-11)).astype(int)
    Y_categorical = np.where(Y_categorical > 7, 8, Y_categorical)  # if 0 in original dti, convert to category 8

    ############# COLLECT INDICES

    num_scalesToPredict = min_scaleToPredict_c - max_scaleToPredict_c + 1
    print("num_scalesToPredict: " + str(num_scalesToPredict))

    # Bucket position of each sample; clipped so a weight sitting exactly on a
    # range boundary cannot index outside the bucket list.
    bucket_of = np.clip(Y_categorical - max_scaleToPredict_c, 0, num_scalesToPredict - 1)
    categorical_indices = [
        np.flatnonzero(bucket_of == bucket).tolist() for bucket in range(num_scalesToPredict)
    ]

    for i in range(num_scalesToPredict):
        print(np.shape(categorical_indices[i]))

    ############# K-FOLD

    K = 5
    kfold_indices = []
    for i in range(num_scalesToPredict):
        count_curScale = len(categorical_indices[i])
        fold_size = int(count_curScale / K) + 1
        folds = chunks(categorical_indices[i], fold_size)
        # Small categories yield fewer than K chunks; pad with empty folds so
        # every category contributes exactly K entries.
        folds += [[] for _ in range(K - len(folds))]
        kfold_indices.append(folds)

    # Regroup from [scale][fold] to [fold][scale].
    kfold_indices_rearranged = [
        [kfold_indices[i][j] for i in range(num_scalesToPredict)] for j in range(K)
    ]

    for i in range(K):
        for j in range(num_scalesToPredict):
            print(np.shape(kfold_indices_rearranged[i][j]))

    kfold_flattened = [
        [x for sublist in fold for x in sublist] for fold in kfold_indices_rearranged
    ]

    print("shape: kfold_flattened")
    for i in range(K):
        print(np.shape(kfold_flattened[i]))

    ############# SPLIT INTO FOLDS

    axis = 0
    print("fold shapes")

    X_folds = []
    Y_folds = []
    Y_categorical_folds = []

    for i in range(K):
        # Explicit integer dtype: an empty Python list would otherwise become a
        # float array, which np.take rejects as an index.
        fold_indices = np.asarray(kfold_flattened[i], dtype=int)
        X_folds.append(np.take(X, fold_indices, axis))
        Y_folds.append(np.take(Y, fold_indices, axis))
        Y_categorical_folds.append(np.take(Y_categorical, fold_indices, axis))
        print(np.shape(X_folds[i]))

    ############# SCALING
    for i in range(K):
        if len(Y_folds[i]) > 0:  # min/max are undefined for an empty fold
            Y_folds[i] = scaling(Y_folds[i], scaling_params["apply_log"], scaling_params["beta"])

    print("shapes: X: " + str(np.shape(X)) + "   Y : " + str(np.shape(Y)))

    ############# SAVE

    output_folder = PATH + "/AvgData/" + process_folderName(scaling_params)
    print(output_folder)

    if path_dict["isTrain"] == True:
        output_folder += "/Train"
    else:
        output_folder += "/Test"
    print(output_folder)

    x_path = output_folder + "/X/X_" + "avg"
    y_prob_path = output_folder + "/Y/Y_prob_" + "avg"
    y_categorical_path = output_folder + "/Y/Y_categorical_" + "avg"

    print(x_path)
    print(y_prob_path)
    print(y_categorical_path)

    for i in range(K):
        fold_tag = "-f" + str(i + 1) + ".csv"
        np.savetxt(x_path + fold_tag, X_folds[i], delimiter=",", fmt="%s")
        np.savetxt(y_prob_path + fold_tag, Y_folds[i], delimiter=",", fmt="%s")
        np.savetxt(y_categorical_path + fold_tag, Y_categorical_folds[i],
                   delimiter=",", fmt="%s")

In [1]:
def to_csv_dataset(sub_idx, subid_list, path_dict, scaling_params):
    """Build the feature matrix and targets of one subject and save them as CSV.

    When ``scaling_params["isGraph"]`` is truthy the call is forwarded to
    ``to_csv_datataset_graph`` and nothing else happens.  Otherwise the
    function loads spatial proximity, the subject's DTI and structure features,
    derives the topological similarity from the "known" links, keeps only the
    links whose weight lies in the "to predict" range and inside the requested
    coverage, and writes features, rescaled targets and scale categories.

    Parameters
    ----------
    sub_idx : int
        Index into ``subid_list`` of the subject to process.
    subid_list : sequence
        Subject identifiers.
    path_dict : dict
        Reads ``"PATH"``, ``"distance_path"`` and ``"isTrain"``.
    scaling_params : dict
        Reads ``"isGraph"``, ``"isAverage"``, ``"weigh_topo"``,
        ``"scales_known"``, ``"scales_toPredict"``, ``"coverage"``,
        ``"apply_log"`` and ``"beta"``, plus whatever ``process_folderName`` reads.

    Output
    ------
    ``X_<id>.csv`` under ``<PATH>/Data/<name>/(Train|Test)/X`` and
    ``Y_prob_<id>.csv`` / ``Y_categorical_<id>.csv`` under ``.../Y``
    (``AvgData`` and ``avg`` replace ``Data`` and the id when
    ``"isAverage"`` is truthy).
    """
    if scaling_params["isGraph"]:
        to_csv_datataset_graph(sub_idx, subid_list, path_dict, scaling_params)
        return

    subject_ID = subid_list[sub_idx]
    PATH = path_dict["PATH"]

    ############# LOAD FEATURES

    spatial_proximity = pd.read_csv(path_dict["distance_path"], header=None).to_numpy().flatten()
    spatial_proximity = ((spatial_proximity - spatial_proximity.min())
                         / (spatial_proximity.max() - spatial_proximity.min()))

    dti_np = pd.read_csv(PATH + "/dti_csv/dti_mat/" + str(subject_ID) + "_MMP_matrix.csv",
                         header=None).to_numpy()
    dti = transpose_add_flatten(dti_np)
    print(np.shape(dti))

    structure_csv = "localStructures_" + str(subject_ID) + ".csv"
    structures = read_structure(PATH + "/structures_csv/" + structure_csv, 0)

    ############# SCALE BOUNDS
    # A scale k stands for weights in [10**-k, 10**-(k-1)).

    max_scaleKnown = 1 / 10**(min(scaling_params["scales_known"]))
    min_scaleKnown = 1 / 10**(max(scaling_params["scales_known"]))
    max_scaleToPredict = 1 / 10**(min(scaling_params["scales_toPredict"]))
    min_scaleToPredict = 1 / 10**(max(scaling_params["scales_toPredict"]))
    if max(scaling_params["scales_toPredict"]) == 8:
        # Scale 8 is the catch-all for the weakest links, zero weight included.
        min_scaleToPredict = 0

    ############# COMPUTE TOPO

    # Every step below builds a new array, so ``dti`` itself is never modified.
    dti_copy = dti
    if scaling_params["weigh_topo"] == True:
        # apply log scale on weight to compute topological similarity
        dti_copy = np.log(dti_copy + 1e-09)
        dti_copy = (dti_copy - dti_copy.min()) / (dti_copy.max() - dti_copy.min())

    # Keep weights inside the known range, zero everything else.
    dti_copy = np.where(dti_copy <= 10 * max_scaleKnown, dti_copy, 0)
    dti_copy = np.where(dti_copy >= min_scaleKnown, dti_copy, 0)
    if scaling_params["weigh_topo"] == False:
        dti_copy = np.where(dti_copy > 0, 1, 0)

    topo_arr = backboneDTI_to_topo(dti_copy).flatten()
    X_t = np.column_stack((spatial_proximity, structures, topo_arr))

    ############# PRUNE AND LEAVE ONLY LINKS TO PREDICT

    print("ymax_beforeRemovingStrongLinks_wholeBrain: " + str(dti.max()))

    to_predict = (dti < max_scaleToPredict * 10) & (dti >= min_scaleToPredict)
    idx_exceptToPredict = np.flatnonzero(~to_predict).tolist()
    print(len(idx_exceptToPredict))

    idx_removed_byCoverage = removeByCoverage(scaling_params["coverage"])
    print(len(idx_removed_byCoverage))

    idx_removed = idx_removed_byCoverage + idx_exceptToPredict

    Y = np.delete(dti, idx_removed, 0)
    X = np.delete(X_t, idx_removed, 0)

    ############# CATEGORIES

    # Category = negated order of magnitude of the weight.
    Y_categorical = -np.floor(np.log10(Y + 1e-11)).astype(int)
    Y_categorical = np.where(Y_categorical > 7, 8, Y_categorical)  # if 0 in original dti

    ############# SCALING

    if scaling_params["apply_log"] == False:
        Y = Y ** scaling_params["beta"]
        print("beta:  " + str(scaling_params["beta"]) + "Y_max_beta: " + str(Y.max()))
    else:
        Y = np.log(Y + 1e-09)
        print("log:  " + "Y_max_log: " + str(Y.max()))

    Y = (Y - Y.min()) / (Y.max() - Y.min())

    print("shapes: X: " + str(np.shape(X)) + "   Y : " + str(np.shape(Y)))

    ############# SAVE

    output_folder = PATH
    if scaling_params["isAverage"]:
        output_folder += "/AvgData/"
    else:
        output_folder += "/Data/"

    output_folder += process_folderName(scaling_params)
    print(output_folder)

    if path_dict["isTrain"] == True:
        output_folder += "/Train"
    else:
        output_folder += "/Test"
    print(output_folder)

    if scaling_params["isAverage"]:
        file_tail = "avg.csv"
    else:
        file_tail = str(subject_ID) + ".csv"

    x_path = output_folder + "/X/X_" + file_tail
    y_prob_path = output_folder + "/Y/Y_prob_" + file_tail
    y_categorical_path = output_folder + "/Y/Y_categorical_" + file_tail

    print("+1")
    print(x_path)
    print(y_prob_path)
    print(y_categorical_path)
    np.savetxt(x_path, X, delimiter=",", fmt="%s")
    np.savetxt(y_prob_path, Y, delimiter=",", fmt="%s")
    np.savetxt(y_categorical_path, Y_categorical, delimiter=",", fmt="%s")


In [16]:
def merge_csv(folder, cur_dir):
    """Concatenate the per-subject CSV files of a dataset folder.

    For each of the three file families (``X/X_*.csv``, ``Y/Y_prob_*.csv`` and
    ``Y/Y_categorical_*.csv`` below ``<cur_dir>/<folder>``) the files are read
    in sorted path order, stacked row-wise and written to
    ``<cur_dir>/<folder>/<prefix>.csv`` (``X.csv``, ``Y_prob.csv``,
    ``Y_categorical.csv``).

    Parameters
    ----------
    folder : str
        Dataset folder, relative to ``cur_dir``.
    cur_dir : str
        Base directory.

    Raises
    ------
    ValueError
        From ``pandas.concat`` when a family has no matching file.
    """
    subfolders = ['X', 'Y', 'Y']
    prefix = ['X', 'Y_prob', 'Y_categorical']

    for i in range(3):
        x_path = cur_dir + "/" + folder + "/" + subfolders[i] + "/"
        # Sorting by path gives the three families the same subject order as
        # long as each holds the same set of subject ids.
        sorted_x_files = sorted(glob.glob(os.path.join(x_path, prefix[i] + "_*.csv")))

        df_from_each_file = (pd.read_csv(f, sep=',', header=None) for f in sorted_x_files)
        df_merged = pd.concat(df_from_each_file, ignore_index=True)
        np.savetxt(cur_dir + "/" + folder + "/" + prefix[i] + ".csv",
                   df_merged.to_numpy(), delimiter=",", fmt="%s")

In [17]:
def create_dataFolder(folder_name):
    """Create the Train/Test folder tree of a dataset below ``cur_dir``.

    ``folder_name`` is appended verbatim to the module-level ``cur_dir`` (so it
    is expected to start with a path separator).  The sub-folders ``Train/X``,
    ``Train/Y``, ``Test/X`` and ``Test/Y`` are created together with any missing
    parent; folders that already exist are left untouched, so the call can be
    repeated safely.  A success or failure message is printed; an ``OSError``
    is reported, not raised.
    """
    folder = cur_dir + folder_name

    try:
        for split in ("/Train", "/Test"):
            for part in ("/X", "/Y"):
                # makedirs also creates missing parents and tolerates existing
                # folders, so a re-run or a partial tree no longer aborts setup.
                os.makedirs(folder + split + part, exist_ok=True)
    except OSError:
        print ("Creation of the directory %s failed" % folder_name)
    else:
        print ("Successfully created the directory %s " % folder_name)

In [18]:
def process_folderName(scaling_params):
    """Derive the dataset folder name from the scaling options.

    The name has the form
    ``<coverage>_<known scales>to<scales to predict>_<Log | beta<value>>``
    with ``_wcm`` appended when ``"weigh_topo"`` equals ``True``.  The scale
    lists are written as their items concatenated without separator.
    """
    name = scaling_params["coverage"] + "_"
    name += "".join(str(scale) for scale in scaling_params["scales_known"])
    name += "to"
    name += "".join(str(scale) for scale in scaling_params["scales_toPredict"])
    name += "_"

    name += "Log" if scaling_params["apply_log"] == True else "beta" + str(scaling_params["beta"])

    if scaling_params["weigh_topo"] == True:
        name += "_wcm"

    return name

In [21]:
def to_dataset(subid_list, path_dict, scaling_params, num_individuals):
    """Create the output folders and generate the dataset for the chosen mode.

    * Average mode (``"isAverage"`` truthy): one folder tree is created and a
      single dataset is produced - through ``to_csv_dataset`` with subject
      index 0 when ``"isGraph"`` is truthy, through ``to_csv_dataset_avg``
      otherwise - then the function returns.
    * Per-subject mode: every index below ``num_individuals`` is processed with
      ``to_csv_dataset``.  Indices divisible by 10 are written as test data, all
      others as training data; ``path_dict["isTrain"]`` is overwritten for each
      subject.  The elapsed time is printed after every subject.  Unless
      ``"isGraph"`` is truthy, the training files are merged afterwards (the
      test files are not merged).
    """
    new_foldername = process_folderName(scaling_params)
    start = timer()

    if scaling_params["isAverage"]:
        if scaling_params["isGraph"]:
            create_dataFolder("/GraphAvgData/" + new_foldername)
            to_csv_dataset(0, subid_list, path_dict, scaling_params)
        else:
            create_dataFolder("/AvgData/" + new_foldername)
            to_csv_dataset_avg(path_dict, scaling_params)
        return

    # Only the per-subject modes reach this point.
    if scaling_params["isGraph"]:
        create_dataFolder("/GraphData/" + new_foldername)
    else:
        create_dataFolder("/Data/" + new_foldername)

    for sub_idx in range(num_individuals):
        # Every tenth subject (0, 10, 20, ...) is held out as test data.
        path_dict["isTrain"] = sub_idx % 10 != 0
        to_csv_dataset(sub_idx, subid_list, path_dict, scaling_params)

        print("Individual No." + str(int(sub_idx + 1)))
        end = timer()
        print(timedelta(seconds=end-start))

    if not scaling_params["isGraph"]:
        merge_csv('Data/' + new_foldername + "/Train", cur_dir)
        print("Done merging")