# Module 3 - Building Neural Network with Input Data and Performing New Prediction

In order to execute this module, files generated from module 1 are needed:

CombinedAnalysis.xlsx --> USED FOR BUILDING NEURAL NET
meanAllsigma.xlsx --> USED FOR CONSTRUCTING IONOGRAMS OF PREDICTED LIPIDS
Intersected gauss.xlsx

This module is used to perform predictions on novel lipid isomer pairs.
The prediction output is an Excel file, "PredictionResult.xlsx" (written to the Module3_Outputs folder), which lists every lipid the user requested predictions for, each SV (taken from meanAllsigma.xlsx; 0 to 4100 V in increments of 100 V), and the predicted optimal CoV for each isomer. The output also contains a column with the predicted CoV at which the ionograms of the two isomers intersect, and a column classifying this intersection point as "Separable" or "Inseparable".

## INSTALLATION OF IMPORTANT LIBRARIES AND PACKAGES

In [None]:
%matplotlib inline

# Import basic system parameters and functions 
import sys
import subprocess
import pkg_resources

# Third-party packages this notebook relies on. Any that are absent from the
# current environment are installed with pip before the imports further down run.
required = {'pandas', 'seaborn', 'numpy', 'csaps', 'tensorflow', 'openpyxl', 'lmfit', 'scikit-learn'}

# Collect the keys of every distribution visible to this interpreter.
installed = set()
for pkg in pkg_resources.working_set:
    installed.add(pkg.key)

missing = required.difference(installed)

if len(missing) > 0:
    # Invoke pip through the interpreter that is running this notebook.
    python = sys.executable
    pip_command = [python, '-m', 'pip', 'install']
    pip_command.extend(missing)
    subprocess.check_call(pip_command, stdout=subprocess.DEVNULL)

# NOTE(review): this is an ordinary Python variable, not an environment variable,
# so it is not passed to the pip subprocess above - confirm whether it is still needed.
SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL = True

# After checking, we now will import from the installed packages
import os
import pandas as pd
import seaborn as sns
import pylab as plb
import matplotlib.pyplot as mpl
from matplotlib import pyplot as plt
from matplotlib import mlab
import numpy as np
import heapq
import csaps
import shutil
import tkinter
from tkinter import filedialog
from tkinter import *
from csaps import csaps
from random import seed
from random import choice
from numpy import asarray as ar
from numpy import exp, linspace, random, pi, sqrt
from scipy import stats
from scipy import interpolate
from scipy.optimize import curve_fit
from scipy.stats import linregress
from scipy.stats import norm
from sklearn.metrics import r2_score
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import math
import pkg_resources
import seaborn as sns
import sklearn
import subprocess
import tensorflow as tn
from tensorflow.keras import layers
from tensorflow.keras import models
from lmfit import Model, conf_interval, report_ci
from lmfit.models import ExpressionModel

# Silence pandas' SettingWithCopy warning for the rest of the notebook.
pd.options.mode.chained_assignment = None

# Set Directory Location
currentpath = os.getcwd()
os.chdir(currentpath)

# Set up folder for outputs and plots.
# The plots folder is emptied on every run so that it only ever holds the plots
# of the latest prediction. It is created whenever it is missing, including the
# case where 'Module3_Outputs' already exists without the 'PredictedSetPlots'
# sub-folder.
plots_dir = os.path.join('Module3_Outputs', 'PredictedSetPlots')
if os.path.isdir(plots_dir):
    # Restore owner read/write/execute before deleting: earlier versions of this
    # cell set the folder to 0o666, which removes the execute (search) bit on
    # POSIX systems and makes the folder's contents inaccessible. The write bit
    # also clears a read-only flag on Windows.
    os.chmod(plots_dir, 0o755)
    shutil.rmtree(plots_dir)
os.makedirs(plots_dir)

## STEP 1. Import the required files

Files generated from Module 1 are imported. All lipids in the files generated by Module 1 are used as the Training Dataset. Users are then prompted to select an Excel file listing the lipids they wish to obtain predictions for. The Excel file must contain only one column, with a column heading (hence, cell A1 of the Excel sheet is ignored). For convenience, users are advised to use the provided template to generate their list of lipids.

In [None]:
# Load the three workbooks produced by Module 1.
meanAllsigma = pd.read_excel('Module1_Outputs/meanAllsigma.xlsx')
CombinedAnalysis = pd.read_excel('Module1_Outputs/CombinedAnalysis.xlsx')
Intersected_gauss = pd.read_excel('Module1_Outputs/Intersected gauss.xlsx')

# Grab the Lipid List and turn to a list
Trainlist_Lipids = list(pd.unique(CombinedAnalysis['Lipid Species']))

# Output the number of unique lipid from this dataset
n_TrainLipids = len(Trainlist_Lipids)

# Talk to users
print("There are ", n_TrainLipids, "unique pairs of lipid isomers in your dataset." )
print("All of these pairs of lipid isomers will be used for training dataset.")

# Select excel file listing lipids to be predicted for.
## Files are read and stored as DataFrame.
root = tkinter.Tk()
InputFile3 = filedialog.askopenfile(parent=root,
                                    mode='rb',
                                    title="Please choose .xlsx file listing Lipids to be predicted for.")

# Hide the helper Tk window before doing anything that may raise.
root.withdraw()
root.update()

# askopenfile returns None when the dialog is cancelled. Stop here with a clear
# message instead of failing later with a NameError on Predlist_Lipids.
if InputFile3 is None:
    raise RuntimeError("No file was selected. Please re-run this cell and choose the "
                       ".xlsx file listing the lipids to be predicted for.")

# Read the workbook and close the file handle as soon as it has been parsed.
with InputFile3:
    Predlist_Lipids = pd.read_excel(InputFile3)  # Save as dataframe

# Turn the only column in dataframe to list. Blank cells are dropped and the
# names are converted to stripped strings, because the following steps call
# .split(':') on every entry.
Predlist_Lipids = Predlist_Lipids.iloc[:, 0].dropna().astype(str).str.strip().to_list()

if not Predlist_Lipids:
    raise ValueError("The selected file does not list any lipid to predict for.")

## STEP 2: Set up the files and seeding

Create the input datasets from Trainlist_Lipids and Predlist_Lipids. This step creates the Excel sheets TrainingSet and PredictionSet. TrainingSet holds the training lipids together with the information associated with them in the files generated by Module 1; PredictionSet lists the lipids the user selected above, with empty columns that are filled in once the predictions have been made.

In [None]:
# Assemble the TrainingSet dataframe (and TrainingSet.xlsx): the rows of
# CombinedAnalysis, grouped lipid by lipid in the order given by Trainlist_Lipids.
per_lipid_frames = [
    CombinedAnalysis.loc[CombinedAnalysis['Lipid Species'] == species]
    for species in Trainlist_Lipids
]
TrainingSet = pd.concat(per_lipid_frames)
TrainingSet.reset_index(drop=True, inplace=True)
# The first column of the workbook is discarded.
TrainingSet = TrainingSet.drop(TrainingSet.columns[0], axis=1)

# Check whether either CoV column holds a missing value, and count the missing
# values present in the whole table.
if_na = TrainingSet["mu_Glc"].isnull().values.any() or TrainingSet["mu_Gal"].isnull().values.any()
total_na = TrainingSet.isnull().sum().sum()

if if_na:
    print("You have NA for CoV values in your dataset. There are ", total_na, " instances of NA values.")
    print("At the SV where there are NAs for CoV values of either Glc or Gal isomer, the entire SV will be removed for that pair of lipid isomers.")
    print("TrainingSet is now generated without any NAs.")
    input("Please press 'Enter' to continue.")
    # Remove every row that contains a missing value, then renumber the rows.
    TrainingSet = TrainingSet.dropna()
    TrainingSet.reset_index(drop=True, inplace=True)

TrainingSet.to_excel('Module3_Outputs/TrainingSet.xlsx')

# Assemble the PredictionSet dataframe (and PredictionSet.xlsx) for the lipids the
# user listed in Predlist_Lipids: one row per lipid and SV. The mu/sigma columns
# start out blank and are filled in after the neural net has made its predictions.
PredictionSet = pd.DataFrame()
for lipid_name in Predlist_Lipids:
    name_parts = lipid_name.split(':')
    temp_PredictionSet = pd.DataFrame({
        'Lipid Species': lipid_name,
        'SV': meanAllsigma['SV'],
        'chain length': name_parts[0],
        'degree of unsaturation': name_parts[1],
    })
    PredictionSet = pd.concat([PredictionSet, temp_PredictionSet])

for blank_column in ('mu_Gal', 'mu_Glc', 'sigma_Gal', 'sigma_Glc'):
    PredictionSet[blank_column] = ''

PredictionSet.reset_index(drop=True, inplace=True)
PredictionSet.to_excel('Module3_Outputs/PredictionSet.xlsx')

# Seeding random number generator in tensorflow. Users can change this if wished to.
tn.random.set_seed(1)

## STEP 3. Standardize training dataset and set up hyperparameters

Training Dataset will be standardized for lipid features by performing z-scoring. Lipid features are lipids' chain length and degree of unsaturation.
Hyperparameters for the neural network are defined here based on previous optimization work. Users may change them if they wish to test other settings applicable to neural networks.

In [None]:
# i. Set up the Training files and standardize training data input
(nInstances, nCols) = TrainingSet.shape
LipidFeatures = {'chain length', 'degree of unsaturation'}
nLipidFeatures = len(LipidFeatures)
LipidIsomers = {'Gal', 'Glc'}
nLipidIsomers = len(LipidIsomers)

## Set up file for X and Y variables of training dataset
# X columns: the lipid features parsed from the 'Lipid Species' name (the parts
# separated by ':'), followed by SV. Y columns: mu_Gal, mu_Glc.
TrainX = np.zeros(shape=(nInstances, nLipidFeatures+1))
TrainY = np.zeros(shape=(nInstances, 2))

for i in range(nInstances):
    Temp2 = TrainingSet.loc[i, 'Lipid Species'].split(':')
    TrainX[i, :nLipidFeatures] = [float(part) for part in Temp2[:nLipidFeatures]]
    TrainX[i, nLipidFeatures] = TrainingSet.loc[i, 'SV']
    TrainY[i, 0] = TrainingSet.loc[i, 'mu_Gal']
    TrainY[i, 1] = TrainingSet.loc[i, 'mu_Glc']

## Standardize lipid features by z-scoring
FeatureMeans = np.mean(TrainX, axis=0)
FeatureSD = np.std(TrainX, axis=0)
# A column that is constant over the training set has SD 0; dividing by it would
# fill the inputs with NaN/inf. Such a column is only centred (divided by 1).
FeatureScale = np.where(FeatureSD == 0, 1.0, FeatureSD)
TrainX = (TrainX - FeatureMeans) / FeatureScale

# ii. Set up Prediction file and standardize input data required for prediction
(nPredInstances, nCols) = PredictionSet.shape
PredX = np.zeros(shape=(nPredInstances, nLipidFeatures+1))

for i in range(nPredInstances):
    Temp2 = PredictionSet.loc[i, 'Lipid Species'].split(':')
    PredX[i, :nLipidFeatures] = [float(part) for part in Temp2[:nLipidFeatures]]
    PredX[i, nLipidFeatures] = PredictionSet.loc[i, 'SV']

## Standardize with the statistics of the TRAINING data, so that prediction
## inputs are on the same scale the network is trained on.
PredX = (PredX - FeatureMeans) / FeatureScale

# iii. Hyperparameter Setup. Users can change if wished but iDMS work has shown that the followings are the most optimal parameters.
Initializer = 'glorot_uniform'           #'he_uniform'
HiddenLayers = 5
InnerActivation = 'softplus'
OuterActivation = 'linear'
LossFunction = 'MAE'
StopLoss = 1E-10

# The stop loss shown to the user is taken from StopLoss so the message stays
# correct if the value above is edited.
print("The optimal network used here has " + str(HiddenLayers) + " layers, with " + InnerActivation + " as activation function for all layers, "
      + LossFunction + " or 'mean absolute error' as the loss function, initialized weights based on the " + Initializer +
      " method, and an early stopping loss of " + format(StopLoss, ".0E") + " ")

# Tag used in the name of the prediction file written later in the notebook.
parameters = Initializer + "_" + str(HiddenLayers) + "_" + InnerActivation + "_" + OuterActivation + "_" + LossFunction


## STEP 4. Model Initialization

Creating a neural network model based on the optimal hyperparameters

In [None]:
# Define the number of units in each hidden layer. This has been optimized and determined to be 10.
nUnitsPerHiddenLayer = 10

# Create the model
ourModel = models.Sequential()

# Add the hidden layers. The first layer input are lipid features (acyl chain, # of unsaturation) and SV.
ourModel.add(layers.Dense(units=nUnitsPerHiddenLayer, kernel_initializer=Initializer,
                          input_dim=nLipidFeatures+1, activation=InnerActivation))

# Remaining hidden layers are similar. Their input size is inferred from the
# preceding layer, so no input dimension is passed.
for _ in range(HiddenLayers-1):
    ourModel.add(layers.Dense(units=nUnitsPerHiddenLayer, kernel_initializer=Initializer,
                              activation=InnerActivation))

# Create Output layer with two output units (Glc Peak CoV(or Glc_mu), and Gal Peak CoV(or Gal_mu))
ourModel.add(layers.Dense(units=2, kernel_initializer=Initializer, activation=OuterActivation))

# Add loss function and optimizer to our model
ourModel.compile(optimizer='adam', loss=LossFunction)


class StopAtLossThreshold(tn.keras.callbacks.Callback):
    """Stop training once the epoch's training loss is at or below a threshold."""

    def __init__(self, threshold):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        loss = (logs or {}).get('loss')
        if loss is not None and loss <= self.threshold:
            self.model.stop_training = True


# Train our model on the training dataset. Training runs for at most 2000 epochs
# and ends earlier if the loss reaches StopLoss, the early-stopping loss that is
# defined with the other hyperparameters and announced to the user.
history = ourModel.fit(TrainX, TrainY, epochs=2000,
                       callbacks=[StopAtLossThreshold(StopLoss)])

## STEP 5. Prediction

This code block computes predictions for the prediction inputs (PredX) and stores them in the PredictionSet dataframe, which is written to file in Step 6.

In [None]:
# Make the predictions. Column 0 of the output is mu_Gal and column 1 is mu_Glc,
# matching the column order of TrainY.
PredY = ourModel.predict(PredX)

# Insert into dataframe. The columns are replaced outright rather than written
# through .loc[:, ...]: they were created holding empty strings, and an in-place
# .loc assignment keeps their object dtype on pandas >= 2.0, whereas direct
# assignment yields numeric columns.
PredictionSet['mu_Gal'] = PredY[:, 0].astype(float)
PredictionSet['mu_Glc'] = PredY[:, 1].astype(float)

# Compile model asking for accuracy
# NOTE(review): 'accuracy' is a classification metric; for these continuous CoV
# outputs it is unlikely to be informative - confirm whether it is wanted.
ourModel.compile(loss='MAE', optimizer='adam', metrics=['accuracy'])

# Evaluate the model on the training data. Score output is [the loss (MAE), accuracy]
score = ourModel.evaluate(TrainX, TrainY, verbose=0)

 ## STEP 6. From predicted value, apply Gaussian functions, predict separation and construct confusion matrix

In [None]:
# Describe the SV grid of the mean-sigma sheet: lowest SV, highest SV and the
# spacing between consecutive SV values.
sv_column = meanAllsigma['SV']
minSV, maxSV = sv_column.min(), sv_column.max()
sv_intervals = sv_column.nunique() - 1
stepSV = int((maxSV - minSV) / sv_intervals)

# Lookup tables SV -> mean sigma, one per isomer
sigma_by_sv = meanAllsigma.set_index('SV')
Gal_sigma_dict = sigma_by_sv['sigma_Gal'].to_dict()
Glc_sigma_dict = sigma_by_sv['sigma_Glc'].to_dict()

# Fill the sigma columns of the prediction table from the lookup tables
for sigma_column, sigma_lookup in (("sigma_Gal", Gal_sigma_dict), ("sigma_Glc", Glc_sigma_dict)):
    PredictionSet[sigma_column] = PredictionSet["SV"].map(sigma_lookup)

# Write the completed prediction table; the file name carries the hyperparameter tag
prediction_file = "Module3_Outputs/PredictionSet_" + parameters + ".xlsx"
PredictionSet.to_excel(prediction_file, index=False)

# Work on a copy of the prediction table for the Gaussian analysis below
Gauss_PredictionSet = PredictionSet.copy()

# Empty table that will receive the predicted intersection points
pred_Intersected_gauss = pd.DataFrame()

def gaussian(x, mu, sigma):
    """Evaluate a Gaussian with peak height 1 at ``x``.

    Computes exp(-0.5 * ((x - mu) / sigma) ** 2), i.e. a Gaussian centred on
    ``mu`` with standard deviation ``sigma`` and an amplitude of 1.
    ``x`` may be a scalar or a NumPy array.
    """
    standardized = (x - mu) / sigma
    return np.exp(-0.5 * standardized ** 2)

def solve(mu_Gal, mu_Glc, sig_Gal, sig_Glc):
    """Return the roots of the quadratic describing where two Gaussians cross.

    The coefficients of a*x**2 + b*x + c are built from the two means and the
    two standard deviations, and the roots are obtained with ``np.roots``.
    When both sigmas are equal the leading coefficient is 0 and ``np.roots``
    returns a single root.

    NOTE(review): the ``log(sig_Glc / sig_Gal)`` term in ``c`` corresponds to
    area-normalised normal densities, while ``gaussian`` in this file uses a
    fixed amplitude of 1 - confirm that this combination is intended.
    """
    var_gal = sig_Gal**2
    var_glc = sig_Glc**2

    quadratic = 1/(2*var_gal) - 1/(2*var_glc)
    linear = mu_Glc/var_glc - mu_Gal/var_gal
    constant = mu_Gal**2 / (2*var_gal) - mu_Glc**2 / (2*var_glc) - np.log(sig_Glc/sig_Gal)

    return np.roots([quadratic, linear, constant])

# For every requested lipid and every SV: find where the two predicted Gaussian
# ionograms intersect, record the result, and plot and save the pair of curves.

# CoV axis shared by every plot (matches the x-limits set on the axes below).
x = np.linspace(-10, 10, 10000)
plot_dir = os.path.join(currentpath, 'Module3_Outputs/PredictedSetPlots')

# One single-row dataframe per processed (lipid, SV); concatenated once at the end.
pred_rows = []

for lipid in Predlist_Lipids:
    lipid_label = str(lipid)
    pred_intersect_temp = Gauss_PredictionSet[Gauss_PredictionSet['Lipid Species'] == lipid]

    # Walk through the SV values actually present for this lipid, in ascending
    # order, so a single SV or an unevenly spaced SV grid is handled as well.
    for SV in sorted(pred_intersect_temp['SV'].unique()):
        pred_intersectSV_temp = pred_intersect_temp[pred_intersect_temp['SV'] == SV].copy()

        first_row = pred_intersectSV_temp.iloc[0]
        mu_Gal = float(first_row["mu_Gal"])
        sig_Gal = float(first_row["sigma_Gal"])
        mu_Glc = float(first_row["mu_Glc"])
        sig_Glc = float(first_row["sigma_Glc"])

        # Skip this SV when any of the four Gaussian parameters is missing.
        pred_gauss_features = [mu_Gal, mu_Glc, sig_Gal, sig_Glc]
        if any(pd.isna(value) for value in pred_gauss_features):
            continue

        pred_resultCombined = solve(mu_Gal, mu_Glc, sig_Gal, sig_Glc)
        y_Gal_pred = gaussian(x, mu_Gal, sig_Gal)
        y_Glc_pred = gaussian(x, mu_Glc, sig_Glc)

        # Intersection point is limited to only that lies between mu of Glc and Gal
        mu_range_max = max(mu_Glc, mu_Gal)
        mu_range_min = min(mu_Glc, mu_Gal)
        candidates = [point for point in pred_resultCombined if mu_range_min < point < mu_range_max]

        # "NAN" marks "no intersection between the two peaks"; it is blanked out
        # just before the results are written to file.
        pred_intersect = max(candidates, default="NAN")
        has_intersection = not isinstance(pred_intersect, str)

        if has_intersection:
            y_at_intersect = gaussian(pred_intersect, mu_Gal, sig_Gal)
        else:
            y_at_intersect = pred_intersect

        # An intersection lying below 0.5 on the Gal curve (whose peak height is
        # 1) is recorded as a valley. The column is written for every row, so it
        # exists even when no pair turns out to be separable.
        is_valley = has_intersection and y_at_intersect < 0.5
        pred_intersectSV_temp["Valley CoV"] = pred_intersect if is_valley else np.nan
        pred_intersectSV_temp["Intersection point"] = pred_intersect
        pred_intersectSV_temp["Norm Intensity at Intersection"] = y_at_intersect

        # Draw each pair of curves on its own figure so that curves from earlier
        # iterations can never accumulate on the same axes.
        fig, ax = plt.subplots()
        ax.set_xlim([-10, 10])
        ax.set_ylabel('Predicted Normalized Signal Intensity')
        ax.set_xlabel('CoV')
        ax.set_title('Gaussian distribution plot based on predicted mu and sigma of \n GlcCer(d18:1/'
                     + lipid_label + ') and GalCer(d18:1/' + lipid_label + ') at SV ' + str(SV))
        ax.plot(x, y_Gal_pred, color='green', label='GalCer(d18:1/' + lipid_label + ')')
        ax.plot(x, y_Glc_pred, color='blue', label='GlcCer(d18:1/' + lipid_label + ')')

        if has_intersection:
            ax.plot(pred_intersect, y_at_intersect, 'ro', fillstyle='none', label='Intersection point')

        ax.legend(loc='best')

        # Save before showing: some backends discard the figure once it has been shown.
        plotname = 'Gaussian_Glc' + lipid_label + '_Gal' + lipid_label + '_SV' + str(SV)
        plotname = plotname.replace(":", "-")
        fig.savefig(os.path.join(plot_dir, plotname))
        plt.show()
        plt.close(fig)

        pred_rows.append(pred_intersectSV_temp)

if pred_rows:
    pred_Intersected_gauss = pd.concat(pred_rows, ignore_index=True)
else:
    # Nothing could be processed: keep an empty table that still has every
    # column the following statements rely on.
    pred_Intersected_gauss = pd.DataFrame(columns=[*Gauss_PredictionSet.columns,
                                                   "Valley CoV",
                                                   "Intersection point",
                                                   "Norm Intensity at Intersection"])

pred_Intersected_gauss["Lipid Species"] = pred_Intersected_gauss['chain length'].astype(str) + ':' + pred_Intersected_gauss['degree of unsaturation'].astype(str)

# Binary Separation Column: a row is "Separable" only when a valley was recorded for it.
pred_Intersected_gauss['Sep or InSep'] = np.where(pred_Intersected_gauss['Valley CoV'].isnull(), "Inseparable", "Separable")

# Output intersection point to files
pred_Intersected_gauss = pred_Intersected_gauss.replace('NAN', '')
pred_Intersected_gauss.to_excel('Module3_Outputs/PredictionResult.xlsx')

In [None]:
# Tell the user that the last cell of the notebook has been reached.
print("Finished Module 3.")