# Please copy the `nist` data folder into `./data/` first, so that `./data/nist/Individual_Files/` exists.

![nist data](./assets/nist.jpg)

In [1]:
import os
import glob
import json 
import nmrglue as ng
import numpy as np
import matplotlib.pyplot as plt
import pickle as pickle
import pandas as pd
import random
import math
from sklearn.model_selection import train_test_split
from collections import Counter, defaultdict, OrderedDict

from scipy import interpolate

%matplotlib inline

In [2]:
from lib.carrier import SpectraCarrier
import lib.utils as utils

In [3]:
# Print the installed RDKit version.
import rdkit
print(rdkit.__version__)

from rdkit import Chem
from rdkit import RDLogger
# Disable RDKit log output for the 'rdApp.*' log spec.
RDLogger.DisableLog('rdApp.*')

2020.09.1


In [4]:
# Lift the column limit so displayed DataFrames are never column-truncated.
pd.options.display.max_columns = None

# 1. Load spectra from the NIST dataset

In [5]:
# Preprocessing settings read by the spectrum helpers in this notebook:
#   curve_begin_idx         - index on the resampled grid where the kept part starts
#   curve_correction_factor - exponent applied to the resampled intensities
curve_begin_idx, curve_correction_factor = 600, 0.5

In [6]:
# Folder layout: raw NIST files are looked up under DATA_DIR, the input
# dataframes are read from SOURCE_DIR and the results are written to TARGET_DIR.
DATA_DIR, SOURCE_DIR, TARGET_DIR = './data', './data/source', './data/target'

In [7]:
# Paths of the pickled input dataframes (train / validation / test) in SOURCE_DIR.
fn_train_df, fn_valid_df, fn_test_df = [
    template.format(SOURCE_DIR)
    for template in ('{}/train_df.pk', '{}/valid_df.pk', '{}/test_df.pk')
]

In [8]:
def load_pickle(path):
    """Read one pickled object from `path` and return it.

    NOTE: `pickle.load` can execute arbitrary code while deserialising, so
    only point this at files that you produced yourself or otherwise trust.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# One load per split; the paths come from the fn_*_df variables set above.
loaded_train_df = load_pickle(fn_train_df)
loaded_valid_df = load_pickle(fn_valid_df)
loaded_test_df = load_pickle(fn_test_df)

In [9]:
def order(x_i, y_i):
    """Flip both sequences when x runs from high to low.

    Only the first and last x values are compared. If the first one is the
    larger, x and y are both returned reversed; otherwise they are returned
    exactly as passed in.
    """
    if x_i[0] > x_i[-1]:
        return x_i[::-1], y_i[::-1]
    return x_i, y_i


def concat_boundary(x_i, y_i):
    """Pad the curve with one point at x = 0.0 and one at x = 4000.0.

    The added points repeat the first and the last y value respectively.
    Returns the padded (x, y) arrays.
    """
    padded_x = np.concatenate([np.array([0.0]), x_i, np.array([4000.0])])
    padded_y = np.concatenate([np.array([y_i[0]]), y_i, np.array([y_i[-1]])])
    return padded_x, padded_y


def sampling(x_i, y_i):
    """Resample the curve onto the integer grid 0, 1, ..., 3999.

    The input points are interpolated with `interpolate.interp1d` (default
    settings) and evaluated on the grid; the first `curve_begin_idx` grid
    points are then dropped from both returned arrays.
    """
    grid = np.linspace(0, 3999, 4000, endpoint=True)
    resampled = interpolate.interp1d(x_i, y_i)(grid)
    kept = slice(curve_begin_idx, None)
    return grid[kept], resampled[kept]


def normalize(x_i, y_i):
    """Min-max scale y into [0, 1]; x is returned untouched.

    Parameters
    ----------
    x_i : passed through unchanged.
    y_i : array-like of intensities.

    Returns
    -------
    (x_i, y_scaled) where y_scaled = (y_i - min) / (max - min).

    A completely flat input (max == min) has no range to scale by; dividing
    by that zero range would turn every sample into NaN, so an all-zero
    array of the same shape is returned instead.
    """
    min_y = np.min(y_i)
    height = np.max(y_i) - min_y

    if height == 0:
        # Flat curve: avoid the 0 / 0 division.
        return x_i, np.zeros_like(y_i, dtype=float)

    return x_i, (y_i - min_y) / height


def use_transmission(x_i, y_i):
    """Return (x_i, y_i) as given.

    The median of y is still compared against 0.5, but the `1 - y_i`
    inversion that the comparison used to select is commented out, so both
    outcomes hand the input back unchanged.
    """
    below_half = np.median(y_i) < 0.5
    if not below_half:
        return x_i, y_i
    # Inversion disabled: would have been `1 - y_i`.
    return x_i, y_i


def chop(x_i, y_i):
    """Slice both sequences from index 0 to the end (no samples are removed)."""
    whole = slice(0, None)
    return x_i[whole], y_i[whole]


def read_spectrum(path):
    """Load one spectrum file and run it through the preprocessing chain.

    `path` is handed to `SpectraCarrier`, whose `.x` and `.y` attributes are
    the starting curve (presumably a JCAMP-DX reader - confirm in
    lib.carrier). The curve then goes through, in this order:
    order -> concat_boundary -> sampling -> power correction
    (y ** curve_correction_factor) -> normalize -> use_transmission -> chop.

    Returns the (x, y) pair produced by the final `chop` step.
    """
    carrier = SpectraCarrier(path)

    # Orientation, boundary padding and resampling onto the fixed grid.
    x, y = order(carrier.x, carrier.y)
    x, y = concat_boundary(x, y)
    x, y = sampling(x, y)

    # Power correction of the resampled intensities.
    y = y ** curve_correction_factor

    # Scaling, (currently pass-through) transmission step, final slice.
    x, y = normalize(x, y)
    x, y = use_transmission(x, y)
    return chop(x, y)

In [10]:
def find_dx_file(fn):
    """Return the path of `<fn>.DX` under the NIST data folder, or None.

    The EPA collection is checked first and the NIST collection second, so
    a file present in both is taken from EPA.
    """
    for collection in ('EPA', 'NIST'):
        candidate = '{}/nist/Individual_Files/{}/DX/{}.DX'.format(DATA_DIR, collection, fn)
        if os.path.isfile(candidate):
            return candidate
    return None


def attach_spectra(df):
    """Fill the 'spectrum' cell of every row of `df` whose DX file is found.

    Rows whose 'fn' value is 30 characters or longer are skipped, and rows
    without a matching file are left untouched. `df` is modified in place.
    """
    for idx, row in df.iterrows():
        if len(row['fn']) >= 30:
            continue
        path = find_dx_file(row['fn'])
        if path is None:
            continue
        _, y = read_spectrum(path)
        df.at[idx, 'spectrum'] = y


# Same procedure for each of the three splits.
for df in (loaded_train_df, loaded_valid_df, loaded_test_df):
    attach_spectra(df)

In [11]:
def is_missing_spectrum(value):
    """True when `value` is a "nothing loaded" placeholder.

    Both None and a float NaN count as missing; a loaded spectrum is an
    array and therefore matches neither test.
    """
    return value is None or (isinstance(value, float) and math.isnan(value))


def count_missing_spectra(df):
    """Number of rows of `df` whose 'spectrum' cell is still a placeholder."""
    return sum(1 for value in df['spectrum'] if is_missing_spectrum(value))


# One status line per split, in the order train / valid / test.
for df in (loaded_train_df, loaded_valid_df, loaded_test_df):
    counter = count_missing_spectra(df)
    print('Load all spectra.' if counter == 0 else 'Some spectra are missing, please check them.')

Load all spectra.
Load all spectra.
Load all spectra.


# 2. Write dataframes

In [12]:
# Output paths in TARGET_DIR. Note that this rebinds the same fn_*_df names
# that earlier held the input paths.
fn_train_df, fn_valid_df, fn_test_df = [
    template.format(TARGET_DIR)
    for template in ('{}/df_train.pk', '{}/df_valid.pk', '{}/df_test.pk')
]

In [13]:
# Pickle each dataframe to its output path.
for path, frame in (
    (fn_train_df, loaded_train_df),
    (fn_valid_df, loaded_valid_df),
    (fn_test_df, loaded_test_df),
):
    # open(..., 'wb') does not create missing folders, so make sure the
    # output directory exists before writing.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'wb') as handle:
        pickle.dump(frame, handle)

In [15]:
# Final status line printed once the preceding cells have run.
print('Load NIST spectra into dataframes. Ready to train the model.')

Load NIST spectra into dataframes. Ready to train the model.
