<h1>CNN Using Raw Spectra as Inputs</h1>

<p>This notebook is used to train and explore a CNN model that classifies minerals by taking in raw spectra</p>

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as K
from tensorflow.keras.models import Model

In [4]:
import cnnmodel_raw as cnn_raw

# **Use this space below for fiddling with model training**

### **If you need to retrain the model, be sure to restart the runtime**

## **Define Hyperparameters below**

In [5]:
learning_rate=0.0001
batch_size=100
drop_rate=0.7
epochs=50

#don't change the line below
hyperparameters=[learning_rate,batch_size,drop_rate,epochs]

In [6]:
#this string is prepended to all output files for unique identification
id_value='test_1'

#training data file path
fin_path=r'Data/Raw Data/Single/'

#output file path for the trained model data
mout_path=r'Model Data/CNN Model/'

#proportion of the training data allocated to the dev set - a float x: 0<x<1
dev_size=0.2

#random seed used for train/dev split to ensure repeatable outcomes. None 
#provides a psuedorandom value that won't be repeatable
r_state=1

#fast - True for doing quick training that does not output probability weights
#for each sample during training. False for doing deep analysis and tracking
#note if fast is true, model will train in <5 min while false may be >30 min
fast=True

#threshold - float in the range [0.0,1.0) determines the discrimination threshold
threshold=0.5


In [7]:
'''train the model'''
cnn_model=cnn_raw.raw_cnn_model(fin_path,mout_path,dev_size,r_state,hyperparameters,
                          fast,id_value,threshold=threshold)

Master data set shape is (71674, 951) 

 Master data set is
                             150      151      152      153      154      155  \
an_train_015s_5600-0    1462.55  1464.53  1470.13  1472.67  1470.26  1467.80   
an_train_015s_5600-1    1384.08  1387.80  1396.29  1396.04  1373.02  1366.30   
an_train_015s_5600-2    1424.49  1415.20  1414.14  1417.03  1413.58  1410.43   
an_train_015s_5600-3    1542.68  1548.87  1537.69  1527.16  1522.17  1518.86   
an_train_015s_5600-4    1661.08  1649.81  1643.46  1645.90  1644.90  1640.17   
...                         ...      ...      ...      ...      ...      ...   
cal_train_1s_2300-2295  1313.30  1336.07  1397.53  1438.65  1462.09  1475.82   
cal_train_1s_2300-2296  1313.59  1332.70  1385.34  1437.70  1483.00  1480.94   
cal_train_1s_2300-2297  1339.32  1356.20  1416.57  1446.77  1477.90  1488.92   
cal_train_1s_2300-2298  1331.32  1356.17  1421.01  1460.85  1502.31  1482.43   
cal_train_1s_2300-2299  1339.50  1358.85  1416.45  1448.29 

Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50


FileNotFoundError: [Errno 2] No such file or directory: 'Model Data/CNN Model/test_1hist.csv'

# **Test the model! Outside of function scope!**

In [None]:

#test data path - do not include /content/.../ML Spectroscopy/
testin_path=r'Data/Raw Data/Labeled Test/'

#build test set, drops samples with Null values
test_df=h.dfbuilder(testin_path,synth=False,split_df=False,use_trash=False,raw=False,test=True)


#separates features and labels
y_test=test_df[test_df.columns[-1]]
X_test=test_df.drop(test_df.columns[-1],axis=1).values

#roc_out=h.roc_all(cnn_model.predict(X_test),y_test)

if not fast:
  roc_out.to_csv(mout_path+id_value+'roc_data.csv')

#tests model, you can also use model.predict outside of this scope
cnn.test_cnn_model(cnn_model,X_test,y_test,id_value,threshold=0.0)

#this is that old folder we were talking about where CNN.csv files get saved, i just commented it out and pointed it to the folder with the rest of the outputs
#saved_data=r'/content/drive/My Drive/ML Spectroscopy/Programs/Data Processing/Saved Lists/'
saved_data=r'/content/drive/My Drive/ML Spectroscopy/Model Data/CNN Model/'
pd.DataFrame(data=cnn_model.predict(X_test),index=test_df.index.values).to_csv(saved_data+id_value+r'CNN.csv')


# Play space for whatever other training/testing you want to do


Testing/Training Parameters

In [None]:

'''define the hyperparameters here!'''
learning_rate=0.001
batch_size=100
drop_rate=0.55
epochs=50

#don't change the line below
hyperparameters=[learning_rate,batch_size,drop_rate,epochs]


In [None]:

'''define other parameters here!'''
#this string is prepended to all output files for identification
id_value='multi_size_granite_only_update_with_probs'

#training data file path (do not include /content/drive/My Drive/ML Spectroscopy/)
fin_path=r'Data/Preprocessed/Continuous Wavelet Transformation/Labeled/'

#output file path for the trained model data
mout_path=r'Model Data/CNN Model/'

#proportion of the training data allocated to the dev set - a float x: 0<x<1
dev_size=0.2

#random seed used for train/dev split to ensure repeatable outcomes. None 
#provides a psuedorandom value that won't be repeatable
r_state=1

#fast - True for doing quick training that does not output probability weights
#for each sample during training. False for doing deep analysis and tracking
fast=False

#trash - True for using trash data in the model
trash=False

#threshold - float in the range [0.0,1.0) determines the discrimination threshold
threshold=0.0


Build a test set with out of class samples (RRUFF untrained data) and our in class test data set

In [None]:

#test data path
testin_path1=r'Data/Preprocessed/Continuous Wavelet Transformation/Test Set/Labeled/'

#build test set, drops samples with Null values
test_df1=h.dfbuilder(testin_path1,synth=synth,split_df=False)
test_df1.dropna(inplace=True)


#test data path - do not include /content/.../ML Spectroscopy/
testin_path2=r'Data/Preprocessed/Continuous Wavelet Transformation/Test Set/RRUFF_untrained/Labeled/'

#build test set, drops samples with Null values
test_df2=h.dfbuilder(testin_path2,synth=True,split_df=False)
test_df2.dropna(inplace=True)

test_df=pd.concat([test_df1,test_df2])

test_df.loc[test_df['label']==9]=6

y_test=test_df[test_df.columns[-1]]
X_test=test_df.drop(test_df.columns[-1],axis=1).values



Build test set with just granite data

In [None]:

#test data path - do not include /content/.../ML Spectroscopy/
testin_path=r'Data/Preprocessed/Continuous Wavelet Transformation/Test Set/Labeled/'

#build test set, drops samples with Null values
test_df=h.dfbuilder(testin_path,synth=False,split_df=False)
test_df.dropna(inplace=True)

print(test_df)
y_test=test_df[test_df.columns[-1]]
X_test=test_df.drop(test_df.columns[-1],axis=1).values


Running model with limited data sets

In [None]:

fin_path=r'Data/Preprocessed/Continuous Wavelet Transformation/Labeled/Limited Sizes/'

for i in [1,2,4,8,16,32,64,128,256,512,1024,2048,4096]:
  #adjusting some parameters for the size
  hyperparameters[1]=max([1,round(((i*9*4)/5)/411)])
  temp_id=id_value+str(i)
  
  print('\n\n\n\n', 'Model ',i,'\n')
  cnn_model=cwt_cnn_model(fin_path+str(i)+'/',mout_path,dev_size,r_state,hyperparameters,fast,temp_id,use_trash=trash,threshold=threshold)
  print('\nTest CM\n')
  cnn_cwt.test_cnn_model(cnn_model,X_test,y_test,temp_id,test=True,threshold=threshold,fast=fast)



In [None]:
#print(roc_out)

In [None]:
"""
opt_thresh=np.array((roc_out.columns.values.astype(float)))
for i in range(len(roc_out.columns)):
  tpr_ls,fpr_ls,thresh_ls=roc_out[roc_out.columns.values[i]]
  #print(np.subtract(tpr_ls,fpr_ls))
  opt_thresh[i]=thresh_ls[np.argmax(np.subtract(tpr_ls,fpr_ls))]
  print('Optimal Threshold for class:',roc_out.columns.values[i],'\n',opt_thresh[i])
print(opt_thresh.sum()*(1.0/len(opt_thresh)))
"""

In [None]:
"""
#tests model, you can also use model.predict outside of this scope
cnn_cwt.test_cnn_model(cnn_model,X_test,y_test,id_value,.0,fast=True)
"""

In [None]:
#report metrics: precision, recall, f1-score, support for test set
from sklearn.metrics import classification_report
print(classification_report(y_test,cnn_cwt.dec_pred(cnn_model.predict(X_test),threshold)))
