<a href="https://colab.research.google.com/github/GuoyaoShen/DiatomDL/blob/main/data_generate_allsilicone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

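This notebook reads the raw "all silicon terahertz" spectrum text files (R1, R2, T1, T2) and packs them, together with their parameters, into a single `.npz` dataset.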

# Mount Drive

In [42]:
# Mount Google Drive at /content/drive so the project folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Change Working Dir

In [43]:
import os

# Project root on the mounted Drive. It becomes the working directory, so the
# relative './data/...' paths used by later cells resolve against it.
path = '/content/drive/My Drive/BU Class/Research/BioDL_Project'
os.chdir(path)
# The listing is not displayed because it is not the cell's last expression.
# NOTE(review): presumably kept to touch the Drive folder — confirm before removing.
os.listdir(path)

print('DONE')

DONE


# Data Loading

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms

In [45]:
# Shared location of the raw text files. Each entry below is a path *prefix*:
# the reader appends the 1-based file index and '.txt' to it.
_RAW_SET_DIR = './data/3212 sets of data/'

path_reflection_dir1 = _RAW_SET_DIR + 'R1_'
path_transmission_dir1 = _RAW_SET_DIR + 'T1_'
path_reflection_dir2 = _RAW_SET_DIR + 'R2_'
path_transmission_dir2 = _RAW_SET_DIR + 'T2_'

# Number of numbered files to read per spectrum type.
NUM_SETSIZE = 130
# Number of parameter values extracted from each record's header line.
NUM_PARAM = 18

## Define Data Reading Function

In [46]:
def read_data_txt(PATH_DIR, NUM_SETSIZE, NUM_PARAM, num_points=1001, num_header=3, verbose=True):
    """Read numbered spectrum text files and stack their parameters and spectra.

    Files are opened as ``PATH_DIR + '<k>.txt'`` for ``k = 1 .. NUM_SETSIZE``.
    Every file is treated as a sequence of fixed-length records: ``num_header``
    header lines (only the first one, which carries the parameter values, is
    parsed) followed by ``num_points`` data lines of two numbers each.

    Args:
        PATH_DIR: Path prefix placed in front of the 1-based file index.
        NUM_SETSIZE: Number of files to read.
        NUM_PARAM: Number of values expected on every parameter line.
        num_points: Data lines per record. Defaults to 1001.
        num_header: Header lines per record. Defaults to 3.
        verbose: When True (the default), print a banner per file and every
            parsed parameter vector.

    Returns:
        Tuple ``(data_param, data_spectra)`` of float arrays with shapes
        ``[N, NUM_PARAM]`` and ``[N, num_points, 2]``, where ``N`` is the total
        number of records over all files, in reading order.

    Raises:
        ValueError: If ``num_header`` or ``num_points`` is smaller than 1, if a
            parameter line does not yield exactly ``NUM_PARAM`` values, if a
            data line does not hold exactly two numbers, or if a file ends in
            the middle of a record.
    """
    if num_header < 1 or num_points < 1:
        raise ValueError('num_header and num_points must both be >= 1')

    record_len = num_header + num_points

    # The optional sign applies to BOTH alternatives, so negative integers keep
    # their sign. Bare integers only count when terminated by ';' or '}', which
    # skips digits that are part of a parameter name (e.g. the '1' in 'r1=').
    param_regex = re.compile(r"[-+]?(?:\d*\.\d+|\d+(?=;|}))")

    # Collect into lists and convert once at the end instead of re-allocating
    # a growing array on every line / record / file.
    param_rows = []
    spectra_records = []

    for idx_file in range(NUM_SETSIZE):
        path_file = PATH_DIR + str(idx_file+1) + '.txt'

        if verbose:
            print('========================================= FILE '+str(idx_file+1)+' =========================================')

        num_combination = 0
        num_lines = 0
        current_rows = []  # data lines of the record currently being read

        with open(path_file) as f:
            for i, line in enumerate(f):
                num_lines = i + 1
                pos = i % record_len  # position of this line inside its record

                if pos == 0:
                    # First header line: extract the parameter values.
                    param = np.array([float(s) for s in param_regex.findall(line)])
                    if param.size != NUM_PARAM:
                        raise ValueError(
                            'expected %d parameters but found %d in %s (record %d)'
                            % (NUM_PARAM, param.size, path_file, num_combination + 1))
                    if verbose:
                        print(num_combination+1, param)
                        print('------------')
                    param_rows.append(param)
                    num_combination += 1
                elif pos >= num_header:
                    # Spectrum line: two whitespace-separated numbers.
                    row = np.fromstring(line, dtype=float, sep=' ')
                    if row.size != 2:
                        raise ValueError(
                            'expected 2 values per data line but found %d in %s (line %d)'
                            % (row.size, path_file, i + 1))
                    current_rows.append(row)

                    if pos == record_len - 1:
                        # Last line of the record: close it out.
                        spectra_records.append(np.stack(current_rows))
                        current_rows = []
                # Remaining header lines (0 < pos < num_header) are ignored.

        # A partial trailing record would leave a parameter row without a
        # matching spectrum and silently misalign the two outputs.
        if num_lines % record_len != 0:
            raise ValueError(
                '%s has %d lines, which is not a multiple of the record length %d'
                % (path_file, num_lines, record_len))

    # Explicit target shapes keep the empty case well-defined.
    data_param = np.array(param_rows, dtype=float).reshape(len(param_rows), NUM_PARAM)
    data_spectra = np.array(spectra_records, dtype=float).reshape(len(spectra_records), num_points, 2)
    return data_param, data_spectra

## Read data

In [47]:
# Read all files under the R1_ prefix; row k of the parameter array belongs to
# spectrum k of the spectra array.
data_param_R1, data_spectra_R1 = read_data_txt(path_reflection_dir1, NUM_SETSIZE, NUM_PARAM)
print('data_param_R1.shape:', data_param_R1.shape)
print('data_spectra_R1.shape:', data_spectra_R1.shape)
# Distinct values taken by the first parameter column.
print(np.unique(data_param_R1[...,0]))

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
------------
19 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   10.   46.   10.
  10.   10.   10.   10.   61.    0. ]
------------
20 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   10.   48.   10.
  10.   10.   10.   10.   61.    0. ]
------------
21 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   10.   50.   10.
  10.   10.   10.   10.   61.    0. ]
------------
22 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   10.   12.
  12.   12.   12.   12.   61.    0. ]
------------
23 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   12.   12.
  12.   12.   12.   12.   61.    0. ]
------------
24 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   14.   12.
  12.   12.   12.   12.   61.    0. ]
------------
25 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   16.   12.
  12.   12.   12.   12.   61.    0. ]
------------
1 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   18.   

In [48]:
# Read all files under the R2_ prefix and report the resulting array shapes.
data_param_R2, data_spectra_R2 = read_data_txt(path_reflection_dir2, NUM_SETSIZE, NUM_PARAM)
print('data_param_R2.shape:', data_param_R2.shape)
print('data_spectra_R2.shape:', data_spectra_R2.shape)

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
18 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   10.   44.   10.
  10.   10.   10.   10.   61.    0. ]
------------
19 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   10.   46.   10.
  10.   10.   10.   10.   61.    0. ]
------------
20 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   10.   48.   10.
  10.   10.   10.   10.   61.    0. ]
------------
21 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   10.   50.   10.
  10.   10.   10.   10.   61.    0. ]
------------
22 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   10.   12.
  12.   12.   12.   12.   61.    0. ]
------------
23 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   12.   12.
  12.   12.   12.   12.   61.    0. ]
------------
24 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   14.   12.
  12.   12.   12.   12.   61.    0. ]
------------
25 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   16.   12.
  12.   

In [49]:
# Read all files under the T1_ prefix and report the resulting array shapes.
data_param_T1, data_spectra_T1 = read_data_txt(path_transmission_dir1, NUM_SETSIZE, NUM_PARAM)
print('data_param_T1.shape:', data_param_T1.shape)
print('data_spectra_T1.shape:', data_spectra_T1.shape)

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
18 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   10.   44.   10.
  10.   10.   10.   10.   61.    0. ]
------------
19 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   10.   46.   10.
  10.   10.   10.   10.   61.    0. ]
------------
20 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   10.   48.   10.
  10.   10.   10.   10.   61.    0. ]
------------
21 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   10.   50.   10.
  10.   10.   10.   10.   61.    0. ]
------------
22 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   10.   12.
  12.   12.   12.   12.   61.    0. ]
------------
23 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   12.   12.
  12.   12.   12.   12.   61.    0. ]
------------
24 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   14.   12.
  12.   12.   12.   12.   61.    0. ]
------------
25 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   16.   12.
  12.   

In [50]:
# Read all files under the T2_ prefix and report the resulting array shapes.
data_param_T2, data_spectra_T2 = read_data_txt(path_transmission_dir2, NUM_SETSIZE, NUM_PARAM)
print('data_param_T2.shape:', data_param_T2.shape)
print('data_spectra_T2.shape:', data_spectra_T2.shape)

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
18 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   10.   44.   10.
  10.   10.   10.   10.   61.    0. ]
------------
19 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   10.   46.   10.
  10.   10.   10.   10.   61.    0. ]
------------
20 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   10.   48.   10.
  10.   10.   10.   10.   61.    0. ]
------------
21 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   10.   50.   10.
  10.   10.   10.   10.   61.    0. ]
------------
22 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   10.   12.
  12.   12.   12.   12.   61.    0. ]
------------
23 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   12.   12.
  12.   12.   12.   12.   61.    0. ]
------------
24 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   14.   12.
  12.   12.   12.   12.   61.    0. ]
------------
25 [ 70.   70.   70.   70.   70.   70.    0.  346.4 200.   12.   16.   12.
  12.   

In [51]:
# double check param array

# Reminder of np.any semantics: True as soon as one element is truthy.
print(np.any([[True, False], [False, False]]))
print(np.any([[False, False], [False, False]]))
print('---')

# Any non-zero entry in a difference means the two parameter tables disagree.
# The results are only printed; nothing below is blocked if one of them is True.
print(np.any(data_param_T1 - data_param_R1))  # if this is false, then correct
print(np.any(data_param_T1 - data_param_R2))
print(np.any(data_param_T1 - data_param_T2))

True
False
---
False
False
False


# Check Params Range

# Save Data

In [52]:
# Output archive, written relative to the current working directory.
path_dataset = './data/allsilicone_r1r2r7_3212.npz'

# A single parameter table (the T1 one) is stored next to the four spectra
# arrays under the keys 'param', 'R1', 'R2', 'T1' and 'T2'.
np.savez(path_dataset, param=data_param_T1, R1=data_spectra_R1, R2=data_spectra_R2, T1=data_spectra_T1, T2=data_spectra_T2)

print('DATASET SAVED')

DATASET SAVED


In [53]:
# Reload the archive that was just written and inspect it: array shapes first,
# then the first R1 and T1 spectra.
data = np.load(path_dataset)
print(data['param'].shape)
print(data['R1'].shape)
print(data['T1'].shape)
print(data['R1'][0])
print(data['T1'][0])

(3212, 18)
(3212, 1001, 2)
(3212, 1001, 2)
[[0.5        0.7195911 ]
 [0.501      0.71894922]
 [0.502      0.71829784]
 ...
 [1.498      0.15370129]
 [1.499      0.15226866]
 [1.5        0.15100287]]
[[0.5        0.54685509]
 [0.501      0.54749198]
 [0.502      0.54813256]
 ...
 [1.498      0.8101815 ]
 [1.499      0.81033432]
 [1.5        0.81046313]]
