## README

V2.4 update: 
* Increased the noise amplitude of the "low" noise setting
* Expanded the train set to cover all 186 different types of MOF
* Added more experimental data to the test set

This is Ver. 2.2 of the data pre-processor. Running this program creates multiple sets of spectra data.

1. Multiple "experimental" spectra will be generated using the original processor with Gaussian mixture distribution.
2. Transformations of the data, including a sigmoid-style transformation, squashed (square-root) spectra, and ReLU in place of max-min normalization.

The output of the program is a set of pickle files, each holding simulated spectra with a different transformation applied. The spectra simulation also offers several noise options: low noise, medium noise and high noise.

In [1]:
import os
import re
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
import os
# Folder holding the training spectra; processor() lists this folder again on its own.
loc = 'Train_data/'
files = os.listdir(loc) # Names of every entry found in the Train_data folder

class data_processes(object):
    """Pre-process one spectrum and synthesize noisy look-alikes of it.

    The methods are meant to be chained in this order:
    ``max_min`` -> ``fit_grid`` -> ``max_min``/``ReLU`` -> ``back_cancel`` ->
    ``der_1st`` -> ``der_2nd`` -> ``get_peaks`` -> ``get_widths`` ->
    ``GMM`` / ``GMM_alike``.

    Attributes:
        grid: x positions of the spectrum (``np.ndarray``).
        values: intensities at ``grid`` (``np.ndarray``).
        label: class name attached to the spectrum.
        grid_step: spacing of the resampled grid, set by ``fit_grid``.
        der1, der2: finite-difference derivatives, set by ``der_1st`` and
            ``der_2nd``.
        zeros: grid positions where the first derivative changes sign.
        widths: distances between paired second-derivative sign changes.
    """

    def __init__(self, x, y, label):
        """Store the spectrum; *x* and *y* are converted to float arrays."""
        # Arrays (not lists) so that every method below can use NumPy
        # indexing regardless of the order in which it is called.
        self.grid = np.asarray(x, dtype=float)
        self.values = np.asarray(y, dtype=float)
        self.label = label

    def max_min(self):
        """Max-min scale both ``grid`` and ``values`` into [0, 1].

        A constant input has zero range and therefore yields NaN values.
        """
        self.grid = (self.grid - np.min(self.grid)) / (np.max(self.grid) - np.min(self.grid))
        self.values = (self.values - np.min(self.values)) / (np.max(self.values) - np.min(self.values))

    def ReLU(self):
        """Clip negative intensities to zero."""
        self.values = np.maximum(0, self.values)

    def plot(self):
        """Plot the spectrum on the current matplotlib axes."""
        plt.plot(self.grid, self.values, label = self.label)

    def fit_grid(self, step_size):
        """Resample the spectrum onto ``0, step_size, ..., 1`` by linear interpolation.

        ``grid`` is expected to be max-min scaled already. Interpolated
        values are clamped at zero; a target that coincides with a sample
        takes that sample's value unchanged. The final point (x = 1) takes
        the last original value.
        """
        self.grid_step = step_size
        grid = np.asarray(self.grid, dtype=float)
        values = np.asarray(self.values, dtype=float)
        targets = np.arange(0, 1, step_size)

        resampled = []
        for g in targets:
            # Nearest original sample; the last one is excluded so that the
            # right-hand neighbour (idx + 1) always exists.
            idx = np.argmin(np.abs(g - grid[:-1]))
            x1, y1 = grid[idx], values[idx]
            offset = g - x1
            if not (offset > 0 or offset < 0):
                resampled.append(y1)
                continue

            # Interpolate towards the neighbour on the same side as g. At
            # idx == 0 there is no left neighbour: idx - 1 would silently
            # wrap around to the last sample, so use the right one instead.
            neighbour = idx + 1 if (offset > 0 or idx == 0) else idx - 1
            x2, y2 = grid[neighbour], values[neighbour]
            resampled.append(np.maximum((y2 - y1) / (x2 - x1) * offset + y1, 0.0))

        resampled.append(values[-1])

        self.grid = np.append(targets, 1)
        # Stored as an array: back_cancel/GMM index ``values`` with masks.
        self.values = np.asarray(resampled, dtype=float)

    def back_cancel(self, scaler):
        """Zero every value at or below ``mean - scaler * std`` (in place)."""
        self.values = np.asarray(self.values, dtype=float)
        threshold = np.mean(self.values) - scaler * np.std(self.values)
        self.values[self.values <= threshold] = 0.0

    def der_1st(self):
        """Compute first-order finite differences into ``der1``.

        ``der1[k]`` is the slope between grid points ``k + 1`` and ``k + 2``;
        the slope of the very first interval is not stored.
        """
        grid = np.asarray(self.grid, dtype=float)
        values = np.asarray(self.values, dtype=float)
        self.der1 = (np.diff(values)[1:] / np.diff(grid)[1:]).tolist()

    def der_2nd(self):
        """Compute second-order central differences into ``der2``.

        ``der2[k]`` is the second derivative at grid point ``k + 2``. Uses
        ``grid_step``, so ``fit_grid`` must have been called first.
        """
        values = np.asarray(self.values, dtype=float)
        second = (values[3:] - 2 * values[2:-1] + values[1:-2]) / (self.grid_step) ** 2
        self.der2 = second.tolist()

    def get_peaks(self):
        """Collect grid positions where ``der1`` changes sign into ``zeros``.

        Both directions of sign change are recorded, i.e. local maxima and
        local minima alike.
        """
        self.zeros = []
        for i in range(2, len(self.grid) - 2):
            before, after = self.der1[i - 1], self.der1[i]
            if (before < 0 and after > 0) or (before > 0 and after < 0):
                self.zeros.append(self.grid[i + 1])

    def get_widths(self):
        """Derive peak widths from sign changes of ``der2`` into ``widths``.

        Sign-change positions are paired up in order (1st with 2nd, 3rd with
        4th, ...) and each pair's distance is one width; an unpaired trailing
        position is ignored.
        """
        inflections = []
        for i in range(2, len(self.grid) - 3):
            before, after = self.der2[i - 1], self.der2[i]
            if (before < 0 and after > 0) or (before > 0 and after < 0):
                inflections.append(self.grid[i + 1])

        self.widths = [
            inflections[k + 1] - inflections[k]
            for k in range(0, len(inflections) - 1, 2)
        ]

    def _synthesis_grid(self):
        """Return a copy of the grid on which synthetic spectra are evaluated.

        The stored grid is reused instead of rebuilding it with a float-step
        ``np.arange(0, 1 + step, step)``, whose length can be off by one, so
        the synthetic spectrum always lines up with ``self.grid``.
        """
        return np.array(self.grid, dtype=float)

    def _peak_height(self, position):
        """Return the value(s) stored at grid points equal to *position*."""
        grid = np.asarray(self.grid)
        return np.asarray(self.values)[np.where(grid == position)]

    def GMM_alike(self):
        """Rebuild the spectrum as a sum of Gaussians without any randomness.

        Each (width, zero) pair contributes a Gaussian centred on ``zero``
        with sigma ``width / 2`` and the original value at ``zero`` as height.

        Returns:
            ``(x, y)`` arrays of the same length as ``grid``.
        """
        x = self._synthesis_grid()
        y = np.zeros(x.shape)
        for width, zero in zip(self.widths, self.zeros):
            sigma = width / 2
            y += self._peak_height(zero) * np.exp(- (x - zero) ** 2 / (2 * sigma ** 2))

        return x, y

    def GMM(self, width_scaler, peak_shifter, peakval_shaper, noise_size, ReLU = False, transformer = 'None'):
        """Synthesize a randomly perturbed spectrum as a sum of Gaussians.

        Args:
            width_scaler: upper bound, as a fraction of the peak width, of
                the std-dev used to jitter each Gaussian's sigma.
            peak_shifter: upper bound, as a fraction of the peak width, of
                the std-dev used to jitter each peak position.
            peakval_shaper: ``(low, high)`` range of the uniform factor
                applied to each peak height.
            noise_size: upper bound of the std-dev of the additive Gaussian
                noise.
            ReLU: if truthy, clip negatives to zero; otherwise max-min scale
                the result.
            transformer: ``'sigmoid'`` applies ``(1 - cos(pi * y)) / 2``,
                ``'squash'`` applies ``sqrt(y)``; anything else is a no-op.

        Returns:
            ``(x, y)`` arrays of the same length as ``grid``.
        """
        x = self._synthesis_grid()
        y = np.zeros(x.shape)
        low, high = peakval_shaper
        for width, zero in zip(self.widths, self.zeros):
            height = self._peak_height(zero)
            sigma = width / 2

            # The order of the random draws is kept fixed so that seeded
            # runs stay reproducible.
            centre = zero + np.random.normal(loc = 0, scale = np.random.uniform(0, peak_shifter * width))
            sigma = sigma + np.random.normal(loc = 0, scale = np.random.uniform(0, width_scaler * width))

            if sigma == 0:
                # A zero sigma would divide by zero below; fall back to the
                # unperturbed value.
                sigma = width / 2

            height = height * np.random.uniform(low, high)
            y += height * np.exp(- (x - centre) ** 2 / (2 * sigma ** 2))

        y = y + np.random.normal(loc = 0.0, scale = np.random.uniform(0, noise_size), size = y.shape)

        if ReLU:
            y = np.maximum(y, 0)
        else:
            y = (y - np.min(y)) / (np.max(y) - np.min(y))

        if transformer == 'sigmoid':
            y = (1 - np.cos(np.pi * y)) / 2
        elif transformer == 'squash':
            y = np.sqrt(y)

        return x, y

In [3]:
# Noise settings per level, in the order they are handed to GMM():
# (width_scaler, peak_shifter, peak-height low, peak-height high, noise_size).
_NOISE_LEVELS = {
    'low': (0.05, 0.05, 0.95, 1.05, 0.05),
    'medium': (0.25, 0.25, 0.75, 1.25, 0.25),
}
# Any level other than 'low' / 'medium' uses the high-noise settings.
_HIGH_NOISE = (0.5, 0.5, 0.5, 1.5, 0.5)

_CLEAN_COPIES = 50           # unperturbed copies of each train spectrum
_SYNTHETIC_PER_CLASS = 750   # GMM-perturbed spectra generated per class


def _list_data_files(folder):
    """Return the non-hidden entries of *folder* in sorted order.

    Hidden entries (such as '.DS_Store') are skipped whether or not they
    exist, and sorting makes the selection reproducible because
    ``os.listdir`` returns names in arbitrary order.
    """
    return sorted(name for name in os.listdir(folder) if not name.startswith('.'))


def _read_xy(path):
    """Read a two-column spectrum file into ``(xs, ys)`` lists of floats.

    Blank lines are skipped. Each line is split on whitespace first; if that
    does not yield two numbers the line is re-split on commas. A line that
    cannot be parsed either way raises ``ValueError`` / ``IndexError``.
    """
    xs, ys = [], []
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            try:
                x, y = float(fields[0]), float(fields[1])
            except (ValueError, IndexError):
                fields = line.split(',')
                x, y = float(fields[0]), float(fields[1])
            xs.append(x)
            ys.append(y)
    return xs, ys


def _train_label(file_name):
    """Extract the class label from a train file name (two naming schemes)."""
    try:
        return re.findall(r'XRD\s*\d*\s(.*)\.txt', file_name)[0]
    except IndexError:
        return re.findall(r'\d+ MOF (.*) TXT\.txt', file_name)[0]


def processor(class_nums, level, transformer = 'None', ReLU = False, test_simulator = False):
    """Build the train and test spectra tables and pickle them.

    Train set: for each of the first ``class_nums`` files in 'Train_data/'
    the spectrum is normalized, resampled on a 1e-3 grid and stored 50 times
    unchanged plus 750 times as a randomly perturbed GMM synthesis. The
    result is written to ``train_<level>_<transformer>_<relu>.pickle``.

    Test set: every file in 'Test_Data/' is normalized and resampled the
    same way (optionally replaced by its noise-free GMM reconstruction) and
    written as one row to ``test__<transformer>_<relu>.pickle``.

    Args:
        class_nums: number of train files (classes) to process.
        level: 'low' or 'medium'; any other value selects high noise.
        transformer: 'sigmoid', 'squash', or anything else for no transform.
        ReLU: if truthy, clip negatives instead of max-min normalizing.
        test_simulator: if truthy, test spectra are replaced by
            ``GMM_alike`` reconstructions.

    Raises:
        IndexError: if ``class_nums`` exceeds the number of train files, or
            a file name does not match the expected naming pattern.
    """
    suffix = 'relu' if ReLU else ''

    ##########################################################################
    # Train Processor
    train_dir = 'Train_data/'
    files = _list_data_files(train_dir)
    if class_nums > len(files):
        raise IndexError(
            'class_nums={} but only {} train files found in {}'.format(
                class_nums, len(files), train_dir))

    width_scaler, peak_shifter, low, high, noise_size = _NOISE_LEVELS.get(level, _HIGH_NOISE)

    # Progress is reported in (roughly) quarter steps of the requested count.
    quarter = max(1, class_nums // 4)

    train_frames = []
    for i in range(class_nums):
        file_name = files[i]
        cat_name = _train_label(file_name)

        if i % quarter == 0 and i // quarter <= 4:
            print('Processing percentage: {}%'.format(i // quarter * 25))

        X, Y = _read_xy(os.path.join(train_dir, file_name))

        prep = data_processes(X, Y, cat_name)
        prep.max_min()
        prep.fit_grid(1e-3)

        if ReLU:
            prep.ReLU()
        else:
            prep.max_min()

        # Copy now: back_cancel() below modifies prep.values in place and the
        # stored "clean" rows must keep the spectrum as it is at this point.
        clean = np.array(prep.values, dtype=float)
        n_points = len(prep.grid)

        prep.back_cancel(0)
        prep.der_1st()
        prep.der_2nd()
        prep.get_peaks()
        prep.get_widths()

        rows = [clean] * _CLEAN_COPIES
        for _ in range(_SYNTHETIC_PER_CLASS):
            _, synthetic = prep.GMM(width_scaler, peak_shifter, (low, high),
                                    noise_size, ReLU, transformer)
            # Keep only as many points as there are grid columns.
            rows.append(np.asarray(synthetic, dtype=float)[:n_points])

        cols = ['grid_' + str(g) for g in prep.grid]
        frame = pd.DataFrame(np.vstack(rows), columns=cols)
        frame['label'] = prep.label
        train_frames.append(frame.dropna())

    # One concat at the end instead of growing the table inside the loop.
    Data = pd.concat(train_frames, axis = 0) if train_frames else pd.DataFrame()

    pickle_name = 'train_' + level + '_' + transformer + '_' + suffix + '.pickle'
    with open(pickle_name, 'wb') as f:
        pickle.dump(Data, f)

    ########################################################################
    # Test Processor
    test_dir = 'Test_Data/'
    test_frames = []

    for file_name in _list_data_files(test_dir):

        cat_name = re.findall(r'(ZIF-\d+)\s.*.txt', file_name)[0]

        X, Y = _read_xy(os.path.join(test_dir, file_name))

        # 1. Normalization: X-axis and Y-axis
        prep = data_processes(X, Y, cat_name)
        prep.max_min()
        prep.fit_grid(1e-3)

        if test_simulator:
            prep.max_min()
            prep.back_cancel(-0.35)
            prep.der_1st()
            prep.der_2nd()
            prep.get_peaks()
            prep.get_widths()
            X, Y = prep.GMM_alike()
        else:
            X, Y = prep.grid, prep.values

        Y = np.asarray(Y, dtype=float)
        if ReLU:
            Y = np.maximum(Y, 0)
        else:
            Y = (Y - np.min(Y)) / (np.max(Y) - np.min(Y))

        if transformer == 'sigmoid':
            Y = (1 - np.cos(np.pi * Y)) / 2
        elif transformer == 'squash':
            Y = np.sqrt(Y)

        cols = ['grid_' + str(g) for g in prep.grid]
        frame = pd.DataFrame(Y[:len(cols)].reshape(1, -1), columns=cols)
        frame['label'] = prep.label
        test_frames.append(frame)

    if test_frames:
        Test = pd.concat(test_frames, axis = 0, ignore_index = True)
    else:
        Test = pd.DataFrame()

    # The double underscore in the file name is kept unchanged so that code
    # loading this pickle by name still finds it.
    pickle_name = 'test_' + '_' + transformer + '_' + suffix + '.pickle'
    with open(pickle_name, 'wb') as f:
        pickle.dump(Test, f)

In [4]:
# Build and pickle the train set (186 classes, 'low' noise level) and the test set.
processor(186, 'low')

Processing percentage: 0%
Processing percentage: 25%
Processing percentage: 50%
Processing percentage: 75%
Processing percentage: 100%
