<a href="https://colab.research.google.com/github/GuoyaoShen/DiatomDL/blob/main/data_generate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Drive

In [1]:
# Attach Google Drive to the Colab VM at the mount point used by this notebook.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


# Data Loading

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms

In [3]:
# NOTE: despite the "_dir" suffix, both values are used as filename prefixes -- the
# loading cells append "<n>.txt" directly (e.g. ".../diatom_219/R" + "1" + ".txt").
path_diatom_root = '/content/drive/My Drive/BU Class/Research/BioDL_Project/data/diatom_219'
path_reflection_dir = path_diatom_root + '/R'
path_transmission_dir = path_diatom_root + '/T'

## Reflection data

In [4]:
# Parse the reflection exports (<prefix>1.txt ... <prefix>9.txt) into two aligned arrays:
#   data_param_R   -> [n_records, 11]       numeric values found on each record's header line
#   data_spectra_R -> [n_records, 1001, 2]  the two-column rows that follow each header
#
# Each file is read as back-to-back records of 1004 lines: line 0 of a record is the
# parameter header, lines 1-2 are skipped, lines 3-1003 are the 1001 spectrum rows.
num_files = 9
num_params = 11                               # values expected on every header line
num_skipped = 3                               # header line + the two ignored lines
num_points = 1001                             # spectrum rows per record
lines_per_record = num_skipped + num_points   # 1004

# Seed with empty, correctly shaped arrays so the final concatenate also works
# when no record was read at all.
param_chunks = [np.array([]).reshape(0, num_params)]
spectra_chunks = [np.array([]).reshape(0, num_points, 2)]

for idx_file in range(num_files):
    path_file = path_reflection_dir + str(idx_file+1) + '.txt'

    # read file
    print('========================================= FILE '+str(idx_file)+' =========================================')
    with open(path_file) as f:
        lines = f.readlines()

    # Accumulate in Python lists and convert once per record / per file; stacking
    # onto a growing array for every line recopies it each time (quadratic).
    num_combination = 0
    param_all = []    # one 11-value list per record of this file
    spectra_all = []  # one [1001, 2] array per record of this file
    spectra = []      # rows of the record currently being read

    for i, line in enumerate(lines):
        pos = i % lines_per_record  # position of this line inside its record

        if pos == 0:  # every (3+1001) lines, read param title
            param = [float(s) for s in re.findall(r"[-+]?\d*\.\d+|\d+", line)]  # extract the float param
            print(num_combination, param)
            print('------------')
            if len(param) != num_params:
                raise ValueError(
                    '%s, line %d: expected %d parameters in header, found %d'
                    % (path_file, i + 1, num_params, len(param)))
            param_all.append(param)
            num_combination += 1

        elif pos >= num_skipped:  # read spectra data
            row = np.fromstring(line, dtype=float, sep=' ')
            if row.shape != (2,):
                raise ValueError(
                    '%s, line %d: expected 2 values in spectrum row, found %d'
                    % (path_file, i + 1, row.size))
            spectra.append(row)

            if pos == lines_per_record - 1:  # every end of the combination, concat
                spectra_all.append(np.array(spectra))
                spectra = []

    # A file that stops mid-record would otherwise leave a parameter row without
    # its spectrum and silently misalign the two output arrays.
    if spectra or len(spectra_all) != len(param_all):
        raise ValueError(
            '%s: truncated final record (%d headers, %d complete spectra)'
            % (path_file, len(param_all), len(spectra_all)))

    if param_all:
        param_chunks.append(np.array(param_all, dtype=float))
        spectra_chunks.append(np.array(spectra_all))

# concat data
data_param_R = np.concatenate(param_chunks, axis=0)
data_spectra_R = np.concatenate(spectra_chunks, axis=0)

0 [0.5, 0.04, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
1 [0.5, 0.05, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
2 [0.5, 0.06, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
3 [0.5, 0.07, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
4 [0.5, 0.08, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
5 [0.5, 0.09, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
6 [0.5, 0.1, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
7 [0.5, 0.11, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
8 [0.5, 0.12, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
9 [0.5, 0.13, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
10 [0.5, 0.04, 0.0, 0.16, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
11 [0.5, 0.05, 0.0, 0.16, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
12 [0.5, 0.06, 0.0, 0.16, 0.0, 0.7794, 0.45, 0.1, 0

In [5]:
# Report the sizes of the stacked reflection arrays, one shape per line.
print(data_param_R.shape, data_spectra_R.shape, sep='\n')

(219, 11)
(219, 1001, 2)


## Transmission data

In [6]:
# Parse the transmission exports (<prefix>1.txt ... <prefix>9.txt) into two aligned arrays:
#   data_param_T   -> [n_records, 11]       numeric values found on each record's header line
#   data_spectra_T -> [n_records, 1001, 2]  the two-column rows that follow each header
#
# Each file is read as back-to-back records of 1004 lines: line 0 of a record is the
# parameter header, lines 1-2 are skipped, lines 3-1003 are the 1001 spectrum rows.
num_files = 9
num_params = 11                               # values expected on every header line
num_skipped = 3                               # header line + the two ignored lines
num_points = 1001                             # spectrum rows per record
lines_per_record = num_skipped + num_points   # 1004

# Seed with empty, correctly shaped arrays so the final concatenate also works
# when no record was read at all.
param_chunks = [np.array([]).reshape(0, num_params)]
spectra_chunks = [np.array([]).reshape(0, num_points, 2)]

for idx_file in range(num_files):
    path_file = path_transmission_dir + str(idx_file+1) + '.txt'

    # read file
    print('========================================= FILE '+str(idx_file)+' =========================================')
    with open(path_file) as f:
        lines = f.readlines()

    # Accumulate in Python lists and convert once per record / per file; stacking
    # onto a growing array for every line recopies it each time (quadratic).
    num_combination = 0
    param_all = []    # one 11-value list per record of this file
    spectra_all = []  # one [1001, 2] array per record of this file
    spectra = []      # rows of the record currently being read

    for i, line in enumerate(lines):
        pos = i % lines_per_record  # position of this line inside its record

        if pos == 0:  # every (3+1001) lines, read param title
            param = [float(s) for s in re.findall(r"[-+]?\d*\.\d+|\d+", line)]  # extract the float param
            print(num_combination, param)
            print('------------')
            if len(param) != num_params:
                raise ValueError(
                    '%s, line %d: expected %d parameters in header, found %d'
                    % (path_file, i + 1, num_params, len(param)))
            param_all.append(param)
            num_combination += 1

        elif pos >= num_skipped:  # read spectra data
            row = np.fromstring(line, dtype=float, sep=' ')
            if row.shape != (2,):
                raise ValueError(
                    '%s, line %d: expected 2 values in spectrum row, found %d'
                    % (path_file, i + 1, row.size))
            spectra.append(row)

            if pos == lines_per_record - 1:  # every end of the combination, concat
                spectra_all.append(np.array(spectra))
                spectra = []

    # A file that stops mid-record would otherwise leave a parameter row without
    # its spectrum and silently misalign the two output arrays.
    if spectra or len(spectra_all) != len(param_all):
        raise ValueError(
            '%s: truncated final record (%d headers, %d complete spectra)'
            % (path_file, len(param_all), len(spectra_all)))

    if param_all:
        param_chunks.append(np.array(param_all, dtype=float))
        spectra_chunks.append(np.array(spectra_all))

# concat data
data_param_T = np.concatenate(param_chunks, axis=0)
data_spectra_T = np.concatenate(spectra_chunks, axis=0)

0 [0.5, 0.04, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
1 [0.5, 0.05, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
2 [0.5, 0.06, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
3 [0.5, 0.07, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
4 [0.5, 0.08, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
5 [0.5, 0.09, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
6 [0.5, 0.1, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
7 [0.5, 0.11, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
8 [0.5, 0.12, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
9 [0.5, 0.13, 0.0, 0.17, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
10 [0.5, 0.04, 0.0, 0.16, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
11 [0.5, 0.05, 0.0, 0.16, 0.0, 0.7794, 0.45, 0.1, 0.03, 0.02, 0.0]
------------
12 [0.5, 0.06, 0.0, 0.16, 0.0, 0.7794, 0.45, 0.1, 0

In [7]:
# Report the sizes of the stacked transmission arrays, one shape per line.
print(data_param_T.shape, data_spectra_T.shape, sep='\n')

(219, 11)
(219, 1001, 2)


In [8]:
# double check param array

# Reminder of how np.any behaves: True as soon as one element is truthy, False otherwise.
for demo_mask in ([[True, False], [False, False]], [[False, False], [False, False]]):
    print(np.any(demo_mask))

# Element-wise difference between the parameters parsed from the T and R files;
# any non-zero entry makes np.any return True.
param_difference = data_param_T - data_param_R
print(np.any(param_difference))  # if this is false, then correct

True
False
False


# Check Params Range

In [9]:
# Pull out three parameter columns (indices 1, 3 and 8) and print the extremes of each.
d = data_param_T[..., 1]
p = data_param_T[..., 3]
t_top = data_param_T[..., 8]

columns_to_report = [('d', d), ('p', p), ('t_top', t_top)]
for position, (label, values) in enumerate(columns_to_report):
    if position > 0:  # divider between groups, none before the first
        print('--------------------')
    print(label + ' min:', min(values))
    print(label + ' max:', max(values))

d min: 0.04
d max: 0.13
--------------------
p min: 0.08
p max: 0.17
--------------------
t_top min: 0.03
t_top max: 0.09


# Save Data

In [10]:
path_dataset = '/content/drive/My Drive/BU Class/Research/BioDL_Project/data/diatom_219.npz'

# The dict keys become the array names inside the archive. The parameter table is
# taken from the transmission arrays.
arrays_to_save = {'param': data_param_T, 'R': data_spectra_R, 'T': data_spectra_T}
np.savez(path_dataset, **arrays_to_save)

print('DATASET SAVED')

DATASET SAVED


In [11]:
# Read the archive back as a sanity check.
# np.load on an .npz returns a lazy NpzFile that keeps the underlying file open, so
# load inside a with-block and copy the arrays into a plain dict; `data[...]` lookups
# keep working after the file handle has been released.
with np.load(path_dataset) as npz_file:
    data = {key: npz_file[key] for key in npz_file.files}

print(data['param'].shape)
print(data['R'].shape)
print(data['T'].shape)
print(data['R'][0])
print(data['T'][0])

(219, 11)
(219, 1001, 2)
(219, 1001, 2)
[[3.75000000e+02 9.40925333e-01]
 [3.75375000e+02 9.40649210e-01]
 [3.75750000e+02 9.40379784e-01]
 ...
 [7.49250000e+02 6.23663526e-01]
 [7.49625000e+02 6.23423301e-01]
 [7.50000000e+02 6.23182642e-01]]
[[3.75000000e+02 1.64127025e-02]
 [3.75375000e+02 1.64484295e-02]
 [3.75750000e+02 1.64848048e-02]
 ...
 [7.49250000e+02 2.78185928e-03]
 [7.49625000e+02 2.78830111e-03]
 [7.50000000e+02 2.79488470e-03]]
