Import relevant libraries

In [None]:
import numpy as np
import os
import numpy as np

from dataTransform import *
from Maglib import *

**Specify relevant directories**

*raw_data_path* = folder containing the unzipped MagNet material CSV datasets, with one subfolder per material. Each material subfolder must contain at least the following CSV files, each in the MagNet format (see https://www.princeton.edu/~minjie/magnet.html):
-	B_waveform[T].csv
-	Frequency[Hz].csv
-	Temperature[C].csv
-	Volumetric_losses[Wm-3].csv

*data_dir* = an existing directory in which to store the downsampled and split (train, test, validate) material data. This directory is also used to store other relevant outputs of the training and verification pipelines.


In [None]:
# Folder holding the raw MagNet data; every subfolder is treated as one material.
raw_data_path = r"C:\Users\ossia\Downloads\pre-training\pre-training"
# Existing folder that receives the processed data and other pipeline outputs.
data_dir = "preprocessed_training_dataset_2"

# Use isdir rather than exists: a regular file at this path would otherwise
# pass the check and only fail later, less helpfully, inside os.makedirs.
if not os.path.isdir(data_dir):
    raise RuntimeError('Directory specified by data_dir does not exist, ensure data_dir points to a folder where the processed data can be stored.')

# Directory where processed training data will be saved
processed_data_dir = os.path.join(data_dir, 'Processed Training Data')
os.makedirs(processed_data_dir, exist_ok=True)

# Collect every subfolder of the raw directory as a training material.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the saved list and the processing order reproducible between runs.
training_materials = sorted(
    item
    for item in os.listdir(raw_data_path)
    if os.path.isdir(os.path.join(raw_data_path, item))
)

print("Training Materials:", training_materials)

# Save the list of training materials (one name per line) to a text file in
# the current working directory.
with open("training_materials.txt", 'w') as file:
    file.writelines(mat + '\n' for mat in training_materials)
print("List of training materials have been saved to cwd")

**Resample, rescale and split data**

*newStep* = number of samples in each resampled B time series. Only change this value if you also adjust the layer dimensions in *NW_LSTM.py* and, when using MagLoss to infer losses from models with a modified sample count, the value of newStep within MagLoss in *MagNet.py*.

In [None]:
import shutil  # only needed to remove the output folder of a failed material

newStep = 128  # Number of samples passed to dataTransform for the resampled B timeseries


def ensure_2d(arr):
    """Return arr with a trailing axis added if it is 1-D, otherwise arr unchanged."""
    if arr.ndim == 1:
        return arr[:, np.newaxis]
    return arr


# Iterate through all materials found in the raw dataset folder, processing
# only those that do not yet have a subfolder in the output directory.
for mat in training_materials:
    processed_mat_path = os.path.join(processed_data_dir, mat)
    try:
        # No exist_ok: an existing folder is the marker for "already processed".
        os.makedirs(processed_mat_path)
        print(f"Subfolder '{mat}' created in '{processed_data_dir}'")
    except FileExistsError:
        print(f"Preprocessed data for '{mat}' already exists in '{processed_data_dir}', skipping this material")
        continue

    # Because the folder's existence is the skip marker, a failure part-way
    # through must not leave it behind; otherwise later runs would silently
    # skip a material whose data is missing or incomplete.
    finished = False
    try:
        unprocessed_mat_path = os.path.join(raw_data_path, mat)
        # NOTE(review): relies on the name `Maglib` being bound by one of the
        # star imports at the top of the notebook; confirm.
        raw_data = Maglib.MagLoader(
            unprocessed_mat_path,
            data_source='csv'
        )

        # Ensure the temperature, loss and frequency arrays are 2 dimensional
        raw_data.temp = ensure_2d(raw_data.temp)
        raw_data.loss = ensure_2d(raw_data.loss)
        raw_data.freq = ensure_2d(raw_data.freq)

        # Project helpers: transform the loaded data, then split it; both
        # receive the material's output folder.
        raw_data = dataTransform(raw_data, newStep, processed_mat_path, plot=False)
        dataSplit(raw_data, processed_mat_path)
        finished = True
    finally:
        if not finished:
            # Runs on any exception, including a kernel interrupt; the
            # original exception keeps propagating after the cleanup.
            shutil.rmtree(processed_mat_path, ignore_errors=True)

    print("Rescaled, downsampled and split material:", mat)