In [1]:
import numpy as np
import os
import shutil
import numpy as np

from dataTransform import *
from Maglib import *

# Path Setting

In [2]:
raw_data_path = r"C:\Users\ossia\Downloads\pre-training\pre-training" # Set path to folder containing unzipped MagNet material datasets
data_dir = "preprocessed_training_dataset_2" # Specify directory to store downsampled, split (train, test, validate) material data

# data_dir is deliberately NOT created here: it must already exist as a directory.
# isdir (rather than exists) also rejects a regular file that happens to carry this
# name, which would otherwise only fail later inside os.makedirs with an unrelated error.
if not os.path.isdir(data_dir):
    raise RuntimeError('Directory specified by data_dir does not exist, ensure data_dir points to a folder where the processed data can be stored.')

processed_data_dir = os.path.join(data_dir, 'Processed Training Data') # Directory where processed training data will be saved
os.makedirs(processed_data_dir, exist_ok=True)

# Every sub-directory of the raw data folder is treated as one training material.
# os.listdir returns entries in arbitrary order, so sort to keep the material list
# (and the text file written below) reproducible across runs and machines.
training_materials = sorted(
    item
    for item in os.listdir(raw_data_path)
    if os.path.isdir(os.path.join(raw_data_path, item))
)

print("Training Materials:", training_materials)

# Save the list of training materials to a text file in the current working directory,
# one material name per line
with open("training_materials.txt", 'w') as file:
    for mat in training_materials:
        file.write(mat + '\n')
print("List of training materials have been saved to cwd")

New directory at 'preprocessed_training_dataset_2' created
Training Materials: ['3C90', '3C94', '3E6', '3F4', '77', '78', 'N27', 'N30', 'N49', 'N87']
List of training materials have been saved to cwd


# Data Pre-Processing

In [3]:
def ensure_2d(arr):
    """Promote a 1-D array to a column vector; return any other array unchanged.

    Args:
        arr: Array-like object exposing ``ndim`` and NumPy-style indexing.

    Returns:
        ``arr[:, np.newaxis]`` (shape ``(n, 1)``) when ``arr.ndim == 1``,
        otherwise ``arr`` itself.
    """
    if arr.ndim == 1:
        return arr[:, np.newaxis]
    return arr


newStep = 128 # Resampled length (passed to dataTransform for every material)

for mat in training_materials: # Will iterate through all materials present in provided training dataset folder, only processing  materials not present in output folder
    processed_mat_path = os.path.join(processed_data_dir, mat) # Create folder for processed material data
    try:
        os.makedirs(processed_mat_path) # Create the new subfolder for material
        print(f"Subfolder '{mat}' created in '{processed_data_dir}'")
    except FileExistsError:
        # The existence of the output folder is the only "already processed" marker.
        print(f"Preprocessed data for '{mat}' already exists in '{processed_data_dir}', skipping this material")
        continue

    try:
        unprocessed_mat_path = os.path.join(raw_data_path, mat)
        raw_data = Maglib.MagLoader(
            unprocessed_mat_path,
            data_source='csv'
        )

        # Promote 1-D temp/loss/freq arrays to column vectors before transforming
        raw_data.temp = ensure_2d(raw_data.temp)
        raw_data.loss = ensure_2d(raw_data.loss)
        raw_data.freq = ensure_2d(raw_data.freq)

        # NOTE(review): dataTransform / dataSplit come from the project-local
        # dataTransform module; presumably they write their results into
        # processed_mat_path - confirm against that module.
        raw_data = dataTransform(raw_data, newStep, processed_mat_path, plot=False)
        dataSplit(raw_data, processed_mat_path)
    except BaseException:
        # Because the folder doubles as the "done" marker, a failure (or a kernel
        # interrupt, hence BaseException) must not leave it behind: otherwise every
        # later run would skip this material even though its data is incomplete.
        shutil.rmtree(processed_mat_path, ignore_errors=True)
        raise

    print("Rescaled, downsampled and split material:", mat)

Subfolder '3C90' created in preprocessed_training_dataset_2
Data transform done
DataSplit done
Rescaled, downsampled and split material: 3C90
Subfolder '3C94' created in preprocessed_training_dataset_2
Data transform done
DataSplit done
Rescaled, downsampled and split material: 3C94
Subfolder '3E6' created in preprocessed_training_dataset_2
Data transform done
DataSplit done
Rescaled, downsampled and split material: 3E6
Subfolder '3F4' created in preprocessed_training_dataset_2
Data transform done
DataSplit done
Rescaled, downsampled and split material: 3F4
Subfolder '77' created in preprocessed_training_dataset_2
Data transform done
DataSplit done
Rescaled, downsampled and split material: 77
Subfolder '78' created in preprocessed_training_dataset_2
Data transform done
DataSplit done
Rescaled, downsampled and split material: 78
Subfolder 'N27' created in preprocessed_training_dataset_2
Data transform done
DataSplit done
Rescaled, downsampled and split material: N27
Subfolder 'N30' crea