 this code reads data from multiple .int files, samples a specified number of records from each file, saves the samples as separate .npy files

In [1]:
import os
import numpy as np
import pandas as pd
import random

def read_int_files(folder_path, sensors, mmap_dir, sample_size):
    for folder_name in os.listdir(folder_path):
        if folder_name.startswith("Yaw"):
            folder_dir = os.path.join(folder_path, folder_name)
            files = []
            file_handles = [] 

            for file_name in os.listdir(folder_dir):
                if file_name.endswith(".int"):
                    file_path = os.path.join(folder_dir, file_name)
                    files.append(file_path)

                    # Open the file and store the file handle
                    file_handle = open(file_path, "rb")
                    file_handles.append(file_handle)

            dfs = [] 
            sensor_labels = []  

            for f in file_handles:
                leer = np.frombuffer(f.read(4), dtype=np.single)
                datei = np.frombuffer(f.read(24), dtype=np.uint32)
                f.seek(76)
                lvek = np.frombuffer(f.read(4), dtype=np.uint32)
                f.seek(84 + lvek[0] * 4)
                filetype = np.frombuffer(f.read(4), dtype=np.uint32)  # Fixed the variable declaration

                if filetype != 12:
                    t0 = np.frombuffer(f.read(4), dtype=np.single)
                    dt = np.frombuffer(f.read(4), dtype=np.single)
                    pos_fak = f.tell()
                    fak = []

                    for n in sensors:
                        f.seek(pos_fak + 4 * (n - 1))
                        fak.append(np.frombuffer(f.read(4), dtype=np.single))

                    f.seek(pos_fak + lvek[0] * 4)
                    position = f.tell()

                    # Calculate the file size
                    f.seek(0, os.SEEK_END)
                    file_size = f.tell()

                    record_count = round((file_size - position) / lvek[0] / 2)
                    sample_size = min(sample_size, record_count)

                    sampled_indices = random.sample(range(record_count), sample_size)

                    raw_ts_int = np.empty((sample_size, len(sensors)))

                    for i, n in enumerate(sampled_indices):
                        for m, sensor_ in enumerate(sensors):
                            f.seek(position + 2 * sensor_ + (lvek[0] * 2) * n - 2)
                            raw_ts_int[i, m] = np.frombuffer(f.read(2), dtype=np.int16) * fak[m]

                    for i, sample_index in enumerate(sampled_indices):
                        sample = raw_ts_int[i]
                        combined_sample_dir = os.path.join(mmap_dir, folder_name)
                        os.makedirs(combined_sample_dir, exist_ok=True)
                        combined_sample_path = os.path.join(combined_sample_dir, f"{folder_name}_combined_sample_{i}.npy")
                        np.save(combined_sample_path, sample)

                    dfs.append(raw_ts_int)

                    sensor_labels.extend([f"Sensor {sensor}" for sensor in sensors])

            for f in file_handles:
                f.close()

            combined_array = np.concatenate(dfs, axis=0) 

    return combined_array,sensor_labels

folder_path = r"C:\Users\musab\raw"
sensors = (1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 45, 53, 54, 56, 57, 59, 60, 61, 62, 63, 64)
mmap_dir = r"C:\Users\musab\mmap_files_combined151" 
sample_size = 100

os.makedirs(mmap_dir, exist_ok=True)

result_array, sensor_labels = read_int_files(folder_path, sensors, mmap_dir, sample_size)

result_array = result_array.reshape((-1, 21))

df = pd.DataFrame(result_array, columns=sensor_labels[:21])

print(df)


     Sensor 1  Sensor 4      Sensor 5      Sensor 6      Sensor 7    Sensor 8  \
0    3.992367 -4.779067 -7.637931e-14  6.130064e-13 -6.996445e-13   36.298584   
1    4.225396 -1.770702 -9.411864e-14  3.960782e-12 -3.764165e-12   60.426403   
2    4.446765 -2.959167 -3.547129e-14  2.628997e-12 -2.445627e-12  161.617004   
3    3.857340 -5.752445 -4.938724e-14  1.820233e-12 -1.750005e-12  194.192368   
4    3.718689 -6.661582 -6.282434e-14  4.987248e-12 -4.633953e-12  168.591019   
..        ...       ...           ...           ...           ...         ...   
495  5.002016 -7.027889 -1.040392e-13  4.398606e-12 -4.094301e-12  209.961868   
496  4.550631 -6.207134 -2.670303e-14  4.874060e-13 -4.932421e-13  119.819115   
497  5.008232 -3.571974 -6.758943e-14  1.787374e-12 -1.717018e-12   74.899605   
498  5.120819 -4.457515 -1.415080e-13  4.764528e-12 -4.483267e-12   45.650734   
499  4.271235 -3.508380 -6.899668e-14  1.105955e-12 -1.122810e-12   44.593273   

       Sensor 9  Sensor 10 

additional steps for data sampling, saving, and shuffling 

In [None]:
import os
import numpy as np
import pandas as pd
import random

def read_int_files(folder_path, sensors, mmap_dir, sample_size):
    for folder_name in os.listdir(folder_path):
        if folder_name.startswith("Yaw"):
            folder_dir = os.path.join(folder_path, folder_name)
            files = []
            file_handles = []  

            for file_name in os.listdir(folder_dir):
                if file_name.endswith(".int"):
                    file_path = os.path.join(folder_dir, file_name)
                    files.append(file_path)

                    file_handle = open(file_path, "rb")
                    file_handles.append(file_handle)

            dfs = [] 
            sensor_labels = []  

            for f in file_handles:
                leer = np.frombuffer(f.read(4), dtype=np.single)
                datei = np.frombuffer(f.read(24), dtype=np.uint32)
                f.seek(76)
                lvek = np.frombuffer(f.read(4), dtype=np.uint32)
                f.seek(84 + lvek[0] * 4)
                filetype = np.frombuffer(f.read(4), dtype=np.uint32) 

                if filetype != 12:
                    t0 = np.frombuffer(f.read(4), dtype=np.single)
                    dt = np.frombuffer(f.read(4), dtype=np.single)
                    pos_fak = f.tell()
                    fak = []

                    for n in sensors:
                        f.seek(pos_fak + 4 * (n - 1))
                        fak.append(np.frombuffer(f.read(4), dtype=np.single))

                    f.seek(pos_fak + lvek[0] * 4)
                    position = f.tell()

                    f.seek(0, os.SEEK_END)
                    file_size = f.tell()

                    record_count = round((file_size - position) / lvek[0] / 2)

                    sample_size = min(sample_size, record_count)
                    sampled_indices = random.sample(range(record_count), sample_size)

                    raw_ts_int = np.empty((sample_size, len(sensors)))

                    for i, n in enumerate(sampled_indices):
                        for m, sensor_ in enumerate(sensors):
                            f.seek(position + 2 * sensor_ + (lvek[0] * 2) * n - 2)
                            raw_ts_int[i, m] = np.frombuffer(f.read(2), dtype=np.int16) * fak[m]

                    dfs.extend(raw_ts_int)

                    sensor_labels.extend([f"Sensor {sensor}" for sensor in sensors] * sample_size)

            for f in file_handles:
                f.close()

    combined_array = np.concatenate(dfs, axis=0) 

    return combined_array, sensor_labels

folder_path = r"C:\Users\musab\raw"
sensors = (1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 45, 53, 54, 56, 57, 59, 60, 61, 62, 63, 64)
mmap_dir = r"C:\Users\musab\mmap_files_combined1" 
sample_size = 1000 

os.makedirs(mmap_dir, exist_ok=True)

result_array, sensor_labels = read_int_files(folder_path, sensors, mmap_dir, sample_size)

result_array = result_array.reshape((-1, len(sensors)))

folder_indices = np.arange(len(result_array)) // sample_size
np.random.shuffle(folder_indices)
selected_indices = [np.where(folder_indices == i)[0][:3] for i in np.unique(folder_indices)]
selected_indices = np.concatenate(selected_indices)

selected_samples = result_array[selected_indices].reshape((-1, len(sensors)))

for i, folder_index in enumerate(np.unique(folder_indices)):
    folder_name = f"Yaw_{folder_index}"
    folder_dir = os.path.join(mmap_dir, folder_name)
    os.makedirs(folder_dir, exist_ok=True)
    for j, sample_index in enumerate(selected_indices[i * 3 : (i + 1) * 3]):
        sample = selected_samples[j]
        sample_path = os.path.join(folder_dir, f"{folder_name}_sample_{j}.npy")
        np.save(sample_path, sample)

selected_df = pd.DataFrame(selected_samples, columns=sensor_labels[:len(sensors)])

print(selected_df)


## 3 sample per file 

In [None]:
import os
import numpy as np
import pandas as pd
import random

def read_int_files(folder_path, sensors, mmap_dir, sample_size, num_samples_per_file):
    for folder_name in os.listdir(folder_path):
        if folder_name.startswith("Yaw"):
            folder_dir = os.path.join(folder_path, folder_name)
            files = []
            file_handles = [] 
            file_count = 1

            for file_name in os.listdir(folder_dir):
                if file_name.endswith(".int"):
                    file_path = os.path.join(folder_dir, file_name)
                    files.append(file_path)

                    file_handle = open(file_path, "rb")
                    file_handles.append(file_handle)

            dfs = [] 
            sensor_labels = []  

            for f in file_handles:
                leer = np.frombuffer(f.read(4), dtype=np.single)
                datei = np.frombuffer(f.read(24), dtype=np.uint32)
                f.seek(76)
                lvek = np.frombuffer(f.read(4), dtype=np.uint32)
                f.seek(84 + lvek[0] * 4)
                filetype = np.frombuffer(f.read(4), dtype=np.uint32) 

                if filetype != 12:
                    t0 = np.frombuffer(f.read(4), dtype=np.single)
                    dt = np.frombuffer(f.read(4), dtype=np.single)
                    pos_fak = f.tell()
                    fak = []

                    for n in sensors:
                        f.seek(pos_fak + 4 * (n - 1))
                        fak.append(np.frombuffer(f.read(4), dtype=np.single))

                    f.seek(pos_fak + lvek[0] * 4)
                    position = f.tell()

                    # Calculate the file size
                    f.seek(0, os.SEEK_END)
                    file_size = f.tell()

                    record_count = round((file_size - position) / lvek[0] / 2)

                    sampled_indices = random.sample(range(record_count), num_samples_per_file)

                    for sample_idx, n in enumerate(sampled_indices, start=1):
                        raw_ts_int = np.empty((sample_size, len(sensors)))

                        for i, m in enumerate(sensors):
                            f.seek(position + 2 * m + (lvek[0] * 2) * n - 2)
                            raw_ts_int[:, i] = np.frombuffer(f.read(2 * sample_size), dtype=np.int16) * fak[i]

                        dfs.append(raw_ts_int)

                        sensor_labels.extend([f"Sensor {sensor}_{sample_idx}" for sensor in sensors])

                file_count += 1

            for f in file_handles:
                f.close()

            combined_array = np.concatenate(dfs, axis=0) 

            combined_mmap_path = os.path.join(mmap_dir, f"{folder_name}_combine")
            os.makedirs(combined_mmap_path, exist_ok=True)

            for i, data in enumerate(combined_array):
                sample_file_path = os.path.join(combined_mmap_path, f"{folder_name}_combine_{i+1}.npy")
                np.save(sample_file_path, data)

    return combined_array, sensor_labels

folder_path = r"C:\Users\musab\raw"
sensors = (1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 45, 53, 54, 56, 57, 59, 60, 61, 62, 63, 64)
mmap_dir = r"C:\Users\musab\mmap_files_combined11" 
sample_size = 1000 
num_samples_per_file = 3

os.makedirs(mmap_dir, exist_ok=True)

result_array, sensor_labels = read_int_files(folder_path, sensors, mmap_dir, sample_size, num_samples_per_file)

result_array = result_array.reshape((-1, len(sensors)))

df = pd.DataFrame(result_array, columns=sensor_labels[:len(sensors)])

print(df)


## 3 sample per folder 

In [2]:
import os
import numpy as np
import pandas as pd
import random

def read_int_files(folder_path, sensors, mmap_dir, sample_size, num_samples_per_folder):
    for folder_name in os.listdir(folder_path):
        if folder_name.startswith("Yaw"):
            folder_dir = os.path.join(folder_path, folder_name)
            files = []
            file_handles = []
            file_count = 1

            for file_name in os.listdir(folder_dir):
                if file_name.endswith(".int"):
                    file_path = os.path.join(folder_dir, file_name)
                    files.append(file_path)

                    file_handle = open(file_path, "rb")
                    file_handles.append(file_handle)

            dfs = [] 
            sensor_labels = []  
            sampled_files = random.sample(file_handles, num_samples_per_folder)

            for f in sampled_files:
                leer = np.frombuffer(f.read(4), dtype=np.single)
                datei = np.frombuffer(f.read(24), dtype=np.uint32)
                f.seek(76)
                lvek = np.frombuffer(f.read(4), dtype=np.uint32)
                f.seek(84 + lvek[0] * 4)
                filetype = np.frombuffer(f.read(4), dtype=np.uint32)  # Fixed the variable declaration

                if filetype != 12:
                    t0 = np.frombuffer(f.read(4), dtype=np.single)
                    dt = np.frombuffer(f.read(4), dtype=np.single)
                    pos_fak = f.tell()
                    fak = []

                    for n in sensors:
                        f.seek(pos_fak + 4 * (n - 1))
                        fak.append(np.frombuffer(f.read(4), dtype=np.single))

                    f.seek(pos_fak + lvek[0] * 4)
                    position = f.tell()

                    f.seek(0, os.SEEK_END)
                    file_size = f.tell()

                    record_count = round((file_size - position) / lvek[0] / 2)

                    sampled_indices = random.sample(range(record_count), 1)

                    for sample_idx, n in enumerate(sampled_indices, start=1):
                        raw_ts_int = np.empty((sample_size, len(sensors)))

                        for i, m in enumerate(sensors):
                            f.seek(position + 2 * m + (lvek[0] * 2) * n - 2)
                            raw_ts_int[:, i] = np.frombuffer(f.read(2 * sample_size), dtype=np.int16) * fak[i]

                        dfs.append(raw_ts_int)

                        sensor_labels.extend([f"Sensor {sensor}_{sample_idx}" for sensor in sensors])

                file_count += 1

            for f in file_handles:
                f.close()

            combined_array = np.concatenate(dfs, axis=0) 

            combined_mmap_path = os.path.join(mmap_dir, f"{folder_name}_combine")
            os.makedirs(combined_mmap_path, exist_ok=True)

            for i, data in enumerate(combined_array):
                for sample_idx in range(num_samples_per_folder):
                    sample_file_path = os.path.join(combined_mmap_path, f"{folder_name}_combine_{sample_idx+1}.npy")
                    np.save(sample_file_path, data)

    return combined_array, sensor_labels

folder_path = r"C:\Users\musab\raw"
sensors= (1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 45, 53, 54, 56, 57, 59, 60, 61, 62, 63, 64)
mmap_dir = r"C:\Users\musab\mmap_files_combined151" 
sample_size = 1000 
num_samples_per_folder = 3

os.makedirs(mmap_dir, exist_ok=True)

result_array, sensor_labels = read_int_files(folder_path, sensors, mmap_dir, sample_size, num_samples_per_folder)

result_array = result_array.reshape((-1, len(sensors)))

df = pd.DataFrame(result_array, columns=sensor_labels[:len(sensors)])

print(df)


     Sensor 1_1  Sensor 4_1    Sensor 5_1    Sensor 6_1    Sensor 7_1  \
0      4.017891   -5.603633 -9.212223e-14  3.344145e-12 -3.201738e-12   
1     -0.379873   -5.084417  1.560516e-13 -3.392611e-12  2.663893e-13   
2      5.041862    8.612811 -1.583132e-13  2.822702e-13  2.556325e-12   
3     -2.171467   -8.737634  1.317190e-14  2.708721e-12  2.556622e-12   
4     -1.970265    0.726984  1.264001e-13  2.709037e-12  2.887374e-13   
..          ...         ...           ...           ...           ...   
995    2.211487    4.440784  8.720119e-14 -1.411351e-12  1.400183e-12   
996    1.033897    4.812814 -6.585947e-14  1.483655e-12  1.751495e-12   
997    0.099419   -3.634921  6.923348e-14  1.855911e-12  2.073755e-12   
998    1.720851    3.821139  8.660447e-14  2.197382e-12 -3.043066e-12   
999    1.865016    4.779880  1.025389e-13 -3.224480e-12  1.474826e-12   

     Sensor 8_1  Sensor 9_1  Sensor 10_1  Sensor 11_1  Sensor 12_1  ...  \
0     20.112139  608.572815     7.655896     0.9

, a fixed number of files (num_samples_per_folder) are randomly selected from the folder.

In [2]:
import os
import numpy as np
import pandas as pd
import random

def read_int_files(folder_path, sensors, mmap_dir, sample_size, num_samples_per_folder):
    for folder_name in os.listdir(folder_path):
        if folder_name.startswith("Yaw"):
            folder_dir = os.path.join(folder_path, folder_name)
            files = []
            file_handles = []
            file_count = 1

            for file_name in os.listdir(folder_dir):
                if file_name.endswith(".int"):
                    file_path = os.path.join(folder_dir, file_name)
                    files.append(file_path)

                    file_handle = open(file_path, "rb")
                    file_handles.append(file_handle)

            dfs = [] 
            sensor_labels = []  
            sampled_files = random.sample(file_handles, num_samples_per_folder)

            for f in sampled_files:
                leer = np.frombuffer(f.read(4), dtype=np.single)
                datei = np.frombuffer(f.read(24), dtype=np.uint32)
                f.seek(76)
                lvek = np.frombuffer(f.read(4), dtype=np.uint32)
                f.seek(84 + lvek[0] * 4)
                filetype = np.frombuffer(f.read(4), dtype=np.uint32) 

                if filetype != 12:
                    t0 = np.frombuffer(f.read(4), dtype=np.single)
                    dt = np.frombuffer(f.read(4), dtype=np.single)
                    pos_fak = f.tell()
                    fak = []

                    for n in sensors:
                        f.seek(pos_fak + 4 * (n - 1))
                        fak.append(np.frombuffer(f.read(4), dtype=np.single))

                    f.seek(pos_fak + lvek[0] * 4)
                    position = f.tell()

                    f.seek(0, os.SEEK_END)
                    file_size = f.tell()

                    record_count = round((file_size - position) / lvek[0] / 2)

                    max_index = record_count - sample_size
                    sampled_indices = random.sample(range(max_index), num_samples_per_folder)

                    for sample_idx, start_idx in enumerate(sampled_indices, start=1):
                        indices = list(range(start_idx, start_idx + sample_size))
                        raw_ts_int = np.empty((sample_size, len(sensors)))

                        for i, m in enumerate(sensors):
                            f.seek(position + 2 * m + (lvek[0] * 2) * start_idx - 2)
                            raw_ts_int[:, i] = np.frombuffer(f.read(2 * sample_size), dtype=np.int16) * fak[i]

                        dfs.append(raw_ts_int)

                        sensor_labels.extend([f"Sensor {sensor}_{sample_idx}" for sensor in sensors])

                file_count += 1

            for f in file_handles:
                f.close()

            combined_array = np.concatenate(dfs, axis=0) 

            combined_mmap_path = os.path.join(mmap_dir, f"{folder_name}_combine")
            os.makedirs(combined_mmap_path, exist_ok=True)

            for i, data in enumerate(combined_array):
                for sample_idx in range(num_samples_per_folder):
                    sample_file_path = os.path.join(combined_mmap_path, f"{folder_name}_combine_{sample_idx+1}.npy")
                    np.save(sample_file_path, data)

    return combined_array, sensor_labels

folder_path = r"C:\Users\musab\raw"
sensors = (1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 45, 53, 54, 56, 57, 59, 60, 61, 62, 63, 64)
mmap_dir = r"C:\Users\musab\mmap_files_combined12" 
sample_size = 1000 
num_samples_per_folder = 3

os.makedirs(mmap_dir, exist_ok=True)

result_array, sensor_labels = read_int_files(folder_path, sensors, mmap_dir, sample_size, num_samples_per_folder)

result_array = result_array.reshape((-1, len(sensors)))

df = pd.DataFrame(result_array, columns=sensor_labels[:len(sensors)])

print(df)


      Sensor 1_1  Sensor 4_1    Sensor 5_1    Sensor 6_1    Sensor 7_1  \
0       3.915316   -5.219892  1.486593e-14  6.499208e-13 -5.723721e-13   
1      -0.142341    0.822325  2.980395e-14 -6.037000e-13  1.489509e-12   
2       5.038611    1.648639 -2.768436e-14  1.571035e-12  2.566433e-12   
3      -2.060949   -1.531392  7.204423e-14  2.706903e-12  2.567477e-12   
4       0.324676    3.985207  1.241327e-13  2.708003e-12  3.455101e-13   
...          ...         ...           ...           ...           ...   
8995    2.629901   -7.493807  1.398640e-13  3.227000e-12  3.205720e-12   
8996   -3.281252    8.264119  1.471857e-13  3.362684e-12  2.841801e-12   
8997    1.228610    8.696732  1.533743e-13  2.980947e-12  2.658493e-12   
8998   -2.658005    9.062397  1.359630e-13  2.788663e-12  7.065534e-13   
8999    2.931229    8.033619  1.271928e-13  7.411488e-13  2.497817e-12   

      Sensor 8_1  Sensor 9_1  Sensor 10_1  Sensor 11_1  Sensor 12_1  ...  \
0     112.421150  610.700928     7.

The main goal of this code is to read and process data from selected files in a folder, sample consecutive segments of data from each file,

In [None]:
import os
import numpy as np
import pandas as pd
import random

def read_int_files(folder_path, sensors, mmap_dir, sample_size, num_samples_per_folder):
    for folder_name in os.listdir(folder_path):
        if folder_name.startswith("Yaw"):
            folder_dir = os.path.join(folder_path, folder_name)
            files = []
            file_handles = []
            file_count = 1

            for file_name in os.listdir(folder_dir):
                if file_name.endswith(".int"):
                    file_path = os.path.join(folder_dir, file_name)
                    files.append(file_path)

                    file_handle = open(file_path, "rb")
                    file_handles.append(file_handle)

            dfs = [] 
            sensor_labels = []  
            sampled_files = random.sample(file_handles, num_samples_per_folder)

            for f in sampled_files:
                leer = np.frombuffer(f.read(4), dtype=np.single)
                datei = np.frombuffer(f.read(24), dtype=np.uint32)
                f.seek(76)
                lvek = np.frombuffer(f.read(4), dtype=np.uint32)
                f.seek(84 + lvek[0] * 4)
                filetype = np.frombuffer(f.read(4), dtype=np.uint32)

                if filetype != 12:
                    t0 = np.frombuffer(f.read(4), dtype=np.single)
                    dt = np.frombuffer(f.read(4), dtype=np.single)
                    pos_fak = f.tell()
                    fak = []

                    for n in sensors:
                        f.seek(pos_fak + 4 * (n - 1))
                        fak.append(np.frombuffer(f.read(4), dtype=np.single))

                    f.seek(pos_fak + lvek[0] * 4)
                    position = f.tell()

                    # Calculate the file size
                    f.seek(0, os.SEEK_END)
                    file_size = f.tell()

                    record_count = round((file_size - position) / lvek[0] / 2)

                    max_index = record_count - sample_size
                    sampled_indices = []
                    while len(sampled_indices) < 1:
                        start_idx = random.randint(0, max_index)
                        indices = list(range(start_idx, start_idx + sample_size))
                        if not any(idx in sampled_indices for idx in indices):
                            sampled_indices = indices

                    for sample_idx, n in enumerate(sampled_indices, start=1):
                        raw_ts_int = np.empty((sample_size, len(sensors)))

                        for i, m in enumerate(sensors):
                            f.seek(position + 2 * m + (lvek[0] * 2) * n - 2)
                            raw_ts_int[:, i] = np.frombuffer(f.read(2 * sample_size), dtype=np.int16) * fak[i]

                        dfs.append(raw_ts_int)

                        sensor_labels.extend([f"Sensor {sensor}_{sample_idx}" for sensor in sensors])

                file_count += 1

            for f in file_handles:
                f.close()

            combined_array = np.concatenate(dfs, axis=0) 

            combined_mmap_path = os.path.join(mmap_dir, f"{folder_name}_combine")
            os.makedirs(combined_mmap_path, exist_ok=True)

            for i, data in enumerate(combined_array):
                for sample_idx in range(num_samples_per_folder):
                    sample_file_path = os.path.join(combined_mmap_path, f"{folder_name}_combine_{sample_idx+1}.npy")
                    np.save(sample_file_path, data)

    return combined_array, sensor_labels

folder_path = r"C:\Users\musab\raw"
sensors = (1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 45, 53, 54, 56, 57, 59, 60, 61, 62, 63, 64)
mmap_dir = r"C:\Users\musab\mmap_files_combined12" 
sample_size = 1000 
num_samples_per_folder = 3

os.makedirs(mmap_dir, exist_ok=True)

result_array, sensor_labels = read_int_files(folder_path, sensors, mmap_dir, sample_size, num_samples_per_folder)

result_array = result_array.reshape((-1, len(sensors)))

df = pd.DataFrame(result_array, columns=sensor_labels[:len(sensors)])

print(df)


 a more optimized approach for storing and accessing the data by utilizing memory-mapped files

In [3]:
import os
import numpy as np
import pandas as pd
import random

def read_int_files(folder_path, sensors, mmap_dir, sample_size):
    for folder_name in os.listdir(folder_path):
        if folder_name.startswith("Yaw"):
            folder_dir = os.path.join(folder_path, folder_name)
            files = []
            file_handles = [] 

            for file_name in os.listdir(folder_dir):
                if file_name.endswith(".int"):
                    file_path = os.path.join(folder_dir, file_name)
                    files.append(file_path)

                    file_handle = open(file_path, "rb")
                    file_handles.append(file_handle)

            dfs = [] 
            sensor_labels = []  

            for f in file_handles:
                leer = np.frombuffer(f.read(4), dtype=np.single)
                datei = np.frombuffer(f.read(24), dtype=np.uint32)
                f.seek(76)
                lvek = np.frombuffer(f.read(4), dtype=np.uint32)
                f.seek(84 + lvek[0] * 4)
                filetype = np.frombuffer(f.read(4), dtype=np.uint32)  

                if filetype != 12:
                    t0 = np.frombuffer(f.read(4), dtype=np.single)
                    dt = np.frombuffer(f.read(4), dtype=np.single)
                    pos_fak = f.tell()
                    fak = []

                    for n in sensors:
                        f.seek(pos_fak + 4 * (n - 1))
                        fak.append(np.frombuffer(f.read(4), dtype=np.single))

                    f.seek(pos_fak + lvek[0] * 4)
                    position = f.tell()

                    f.seek(0, os.SEEK_END)
                    file_size = f.tell()

                    record_count = round((file_size - position) / lvek[0] / 2)

                    sampled_indices = random.sample(range(record_count), sample_size)

                    raw_ts_int = np.empty((sample_size, len(sensors)))

                    for i, n in enumerate(sampled_indices):
                        for m, sensor_ in enumerate(sensors):
                            f.seek(position + 2 * sensor_ + (lvek[0] * 2) * n - 2)
                            raw_ts_int[i, m] = np.frombuffer(f.read(2), dtype=np.int16) * fak[m]

                    dfs.append(raw_ts_int)

                    sensor_labels.extend([f"Sensor {sensor}" for sensor in sensors])

            for f in file_handles:
                f.close()

            combined_array = np.concatenate(dfs, axis=0)

            combined_mmap_path = os.path.join(mmap_dir, f"{folder_name}_combine.mmap")
            combined_mmap = np.memmap(combined_mmap_path, dtype=np.float32, mode="w+", shape=combined_array.shape)
            combined_mmap[:] = combined_array

    return combined_mmap, sensor_labels

folder_path = r"C:\Users\musab\raw"
sensors = (1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 45, 53, 54, 56, 57, 59, 60, 61, 62, 63, 64)
mmap_dir = r"C:\Users\musab\mmap_files_combined20" 
sample_size = 1000

os.makedirs(mmap_dir, exist_ok=True)

result_mmap, sensor_labels = read_int_files(folder_path, sensors, mmap_dir, sample_size)

result_mmap = result_mmap.reshape((-1, len(sensors)))

df = pd.DataFrame(result_mmap, columns=sensor_labels[:len(sensors)])

print(df)


      Sensor 1  Sensor 4      Sensor 5      Sensor 6      Sensor 7  \
0     3.626359 -6.382661 -2.530507e-14  1.896010e-12 -1.774737e-12   
1     3.987640 -5.055549 -1.253245e-13  4.656512e-12 -4.449358e-12   
2     3.760442 -2.383435 -1.703948e-14  8.775953e-13 -8.326901e-13   
3     3.702145 -5.156790 -1.449792e-14  1.169811e-12 -1.096696e-12   
4     4.555795 -3.450735 -4.864319e-14  3.776233e-12 -3.514313e-12   
...        ...       ...           ...           ...           ...   
4995  4.262255 -4.940429 -1.045245e-13  7.611523e-13 -8.791988e-13   
4996  4.134300 -7.947215 -6.002635e-14  9.622051e-13 -9.762522e-13   
4997  4.432690 -5.423343 -1.487730e-13  5.036474e-12 -4.728082e-12   
4998  4.447196 -3.977781 -7.844533e-14  4.282034e-12 -3.941724e-12   
4999  4.151050 -4.246066 -1.532859e-13  3.281190e-12 -3.179892e-12   

        Sensor 8    Sensor 9  Sensor 10  Sensor 11   Sensor 12  ...  \
0     144.406952  609.104858   7.662589   1.010031   64.423538  ...   
1      12.699443 