In [1]:
import pandas as pd

In [13]:
# Location of the CSV superset that the subsets are drawn from.
# NOTE(review): this is an absolute, machine-specific path; adjust it (or make
# it relative to the repository) before running on another machine.
csv_file = "/home/vincent/Documenten/BachelorsProject/GitHub_Repo/data_prep/subset_data.csv"

# Target name for the dataset; used as the prefix of the output CSV files.
datasetName = "fullsize"

# Fraction (0 < frac <= 1) of the superset to use for the subsets:
frac = 0.1
assert 0 < frac <= 1, "frac must be between 0 and 1!"

# Portions used for the training, validation and testing set:
split = {
    "train": 0.8,   # You can specify as many subsets as you like!
    "val"  : 0.1,   # Just make sure they add up to 1
    "test" : 0.1
}
# Compare the sum against 1 with a tolerance instead of `==`: float addition is
# inexact (e.g. 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999), so an exact
# equality test rejects perfectly valid splits.
# Note that a split of exactly 0 passes the sign check below.
assert abs(sum(split.values()) - 1) < 1e-9 and \
    not any(x < 0 for x in split.values()), \
    "splits must sum to 1 and all be positive!"

# Load the superset CSV into a dataframe.
# index_col=False tells pandas not to use the first column as the index.
superset = pd.read_csv(filepath_or_buffer=csv_file, index_col=False)

In [9]:
def getSubsets(fracs, superset: pd.DataFrame, random_state=None):
    """
    Draw random subsets of 'superset', stratified on its "material" column.

    Every distinct material is shuffled and sliced separately, so each subset
    receives (up to rounding down) the requested fraction of every material.

    Parameters
    ----------
    fracs : float or dict
        If a float, a single subset named "whole" is returned that holds
        'fracs' times the rows of every material.
        If a dict mapping subset name -> float, mutually exclusive subsets are
        returned, the floats being the fraction of the superset per subset.
    superset : pd.DataFrame
        Source rows; must contain a "material" column.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) for the shuffling. None (the default) keeps the
        original unseeded behaviour; pass an int for reproducible subsets.

    Returns
    -------
    dict
        Subset name -> shuffled DataFrame with a fresh 0..n-1 index.

    Notes
    -----
    Prints a warning line for every (material, subset) pair that ends up
    with zero rows.
    """
    # Local import so the function does not depend on a notebook-level import.
    import numpy as np

    # Convert all inputs to dicts
    fracs = fracs if isinstance(fracs, dict) else {"whole": fracs}

    # One generator shared by every shuffle: a single seed then yields
    # different permutations per material but identical results across runs.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Collect the per-material slices and concatenate once per key at the end,
    # instead of growing a dataframe with pd.concat inside the loop.
    pieces = {key: [] for key in fracs}

    # Iterate over all materials to make random n-way split for each
    for material in superset["material"].unique():
        # Random permutation of df containing all instances of material:
        tmpDf = superset[superset["material"] == material].sample(
            frac=1.0, random_state=rng
        )
        nRows = len(tmpDf)

        # Splitting it up into ranges df[0:x], df[x:y], df[y:z], etc
        fracSum = 0.0
        end = 0
        for key in fracs:
            start = end
            fracSum += fracs[key]
            # int() truncates, so slice sizes round down. The small nudge
            # compensates for float accumulation error (0.7 + 0.2 + 0.1 gives
            # 0.9999999999999999), which would otherwise drop the last row of
            # the material from the final subset.
            end = min(nRows, int(fracSum * nRows + 1e-9))
            if start == end:
                print(f"WARNING: No instances of {material} in {key} set")
            else:
                pieces[key].append(tmpDf.iloc[start:end, :])

    # Each subset is randomly permuted, and its index is reset:
    subsets = {}
    for key, parts in pieces.items():
        # An empty slice of the superset keeps the columns (and dtypes) when
        # a key received no rows at all.
        merged = pd.concat(parts) if parts else superset.iloc[0:0]
        subsets[key] = merged.sample(frac=1.0, random_state=rng).reset_index(drop=True)

    return subsets

In [14]:
# Take the configured fraction of the superset; a float argument makes
# getSubsets return a single entry under the key "whole".
subset = getSubsets(frac, superset)["whole"]

# Divide that sample into the splits configured in `split`:
subsets = getSubsets(split, subset)

# Write every split to "<datasetName>-<split name>.csv", without the index column:
for ss, ss_frame in subsets.items():
    ss_frame.to_csv(f"{datasetName}-{ss}.csv", index=False)