<h1>CNN Using CWT-Processed Spectra as Inputs</h1>

<p>This notebook is used to train and explore a CNN model that classifies minerals from spectra that have undergone a continuous wavelet transformation (CWT) using a Ricker wavelet.</p>

In [1]:
import numpy as np
import pandas as pd
import cnnmodelcwt as cnn_cwt
from tensorflow.keras.models import Model

<h1>Use this space below for fiddling with model training</h1>

<p>If you need to retrain the model, be sure to restart the runtime</p>

<h3>Define your Hyperparameters below</h3>

In [2]:
# Training hyperparameters -- edit the four values below to tune the model.
learning_rate = 0.001
batch_size = 100
drop_rate = 0.55
epochs = 50

# Do not change this line: the values are packed in this fixed order
# (learning rate, batch size, drop rate, epochs) and handed to
# cnn_cwt.cwt_cnn_model as a single list.
hyperparameters = [learning_rate, batch_size, drop_rate, epochs]

In [3]:
# Remaining run settings (everything that is not a hyperparameter).

# Identifier string prepended to every output file name.
id_value = 'new_test'

# Training data folder, given without the
# /content/drive/My Drive/ML Spectroscopy/ prefix.
fin_path = r'Data/CWT Data/Single/'

# Folder that receives the trained model data.
mout_path = r'Model Data/CNN Model/CWT/'

# Share of the training data held out as the dev set: a float x with 0 < x < 1.
dev_size = 0.2

# Seed for the train/dev split so results are repeatable. None gives a
# pseudorandom split that will not be repeatable.
r_state = 1

# fast: True for quick training that does not output per-sample probability
# weights during training; False for deep analysis and tracking.
fast = True

# threshold: discrimination threshold, a float in the range [0.0, 1.0).
threshold = 0.5


In [4]:
# Train the CNN with the paths, split settings and hyperparameters defined in
# the cells above; the returned model is kept in `cnn_model` for later cells.
cnn_model = cnn_cwt.cwt_cnn_model(
    fin_path,
    mout_path,
    dev_size,
    r_state,
    hyperparameters,
    fast,
    id_value,
    threshold=threshold,
)

Master data set shape is (71674, 951) 

 Master data set is
                                150       151       152       153       154  \
albite_train_015s_5250-0 -0.060426  0.364363  0.655384  0.748199  0.667854   
albite_train_015s_5250-1 -0.050979  0.370427  0.661060  0.755651  0.676321   
albite_train_015s_5250-2 -0.065293  0.378984  0.688549  0.793750  0.715698   
albite_train_015s_5250-3 -0.054342  0.375322  0.671679  0.768352  0.688044   
albite_train_015s_5250-4 -0.054718  0.385046  0.688442  0.786872  0.702929   
...                            ...       ...       ...       ...       ...   
qtz_train_015s_625-620    0.003787  0.043815  0.070439  0.077455  0.067289   
qtz_train_015s_625-621    0.004155  0.043077  0.068879  0.075580  0.065538   
qtz_train_015s_625-622    0.003540  0.042567  0.068623  0.075576  0.065634   
qtz_train_015s_625-623    0.003601  0.043714  0.070567  0.077919  0.068031   
qtz_train_015s_625-624    0.004089  0.042137  0.067314  0.073799  0.063936   

  



Unnamed: 0,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,pred_10,pred_11,pred_12,pred_13,pred_14,pred_15
true_0,1987,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
true_1,0,1184,0,0,0,0,0,0,0,0,0,0,0,0,0,0
true_2,1,0,1015,2,0,0,0,0,0,1,0,0,0,0,0,6
true_3,0,0,0,1439,0,0,0,0,0,0,0,0,0,0,0,1
true_4,0,0,0,0,1291,0,0,0,0,0,0,0,0,0,0,0
true_5,0,0,0,0,0,996,0,0,0,0,0,0,0,0,0,0
true_6,0,0,0,0,0,0,1156,0,0,0,0,0,0,0,0,0
true_7,0,0,0,0,0,0,0,1115,0,0,0,0,0,0,0,0
true_8,0,0,0,0,0,0,0,0,1162,0,0,0,0,0,0,2
true_9,0,0,0,0,0,0,0,0,0,1032,0,0,0,0,0,1


# **Test the model! Outside of function scope!**

In [None]:

# Evaluate the trained model on the labelled CWT test set, then save the ROC
# data (only when `fast` is False) and the raw model outputs as CSV files.
#
# NOTE(review): `h`, `synth` and `trash` are not imported or assigned in any
# cell above this one, so this cell raises NameError on a fresh kernel.
# `h` is presumably the project's helper module -- confirm, import it in the
# first cell, and define `synth` / `trash` with the other run parameters.

#test data path - do not include /content/.../ML Spectroscopy/
testin_path=r'Data/Preprocessed/Continuous Wavelet Transformation/Test Set/Labeled/'

#build the test set and drop samples that contain null values
test_df=h.dfbuilder(testin_path,synth=synth,split_df=False,use_trash=trash).dropna()

#separate features and labels (the label is the last column)
y_test=test_df[test_df.columns[-1]]
X_test=test_df.drop(test_df.columns[-1],axis=1).values

#run inference once and reuse the result for both the ROC data and the CSV
#export instead of calling predict twice on the same input
test_pred=cnn_model.predict(X_test)

roc_out=h.roc_all(test_pred,y_test)

if not fast:
  roc_out.to_csv(mout_path+id_value+'roc_data.csv')

#tests model, you can also use model.predict outside of this scope
cnn_cwt.test_cnn_model(cnn_model,X_test,y_test,id_value)

#The CNN.csv files used to be written to the "Saved Lists" folder (kept below,
#commented out); they now go to the folder that holds the other model outputs.
#saved_data=r'/content/drive/My Drive/ML Spectroscopy/Programs/Data Processing/Saved Lists/'
saved_data=r'/content/drive/My Drive/ML Spectroscopy/Model Data/CNN Model/'
pd.DataFrame(data=test_pred,index=test_df.index.values).to_csv(saved_data+id_value+r'CNN.csv')


# Play space for whatever other training/testing you want to do


Testing/Training Parameters

In [None]:

# Hyperparameters for the experiments in this section -- tune as needed.
learning_rate = 0.001
batch_size = 100
drop_rate = 0.55
epochs = 50

# Do not change this line: later cells index into this list (position 1 is
# the batch size), so the order must stay
# learning rate, batch size, drop rate, epochs.
hyperparameters = [learning_rate, batch_size, drop_rate, epochs]


In [None]:

# Remaining run settings for the experiments in this section.

# Identifier string prepended to every output file name.
id_value = 'multi_size_granite_only_update_with_probs'

# Training data folder, given without the
# /content/drive/My Drive/ML Spectroscopy/ prefix.
fin_path = r'Data/Preprocessed/Continuous Wavelet Transformation/Labeled/'

# Folder that receives the trained model data.
mout_path = r'Model Data/CNN Model/'

# Share of the training data held out as the dev set: a float x with 0 < x < 1.
dev_size = 0.2

# Seed for the train/dev split so results are repeatable. None gives a
# pseudorandom split that will not be repeatable.
r_state = 1

# fast: True for quick training that does not output per-sample probability
# weights during training; False for deep analysis and tracking.
fast = False

# trash: True to include the trash data in the model.
trash = False

# threshold: discrimination threshold, a float in the range [0.0, 1.0).
threshold = 0.0


Build a test set that combines out-of-class samples (RRUFF untrained data) with our in-class test data set

In [None]:
# DISABLED CELL: everything below is wrapped in a string literal, so none of
# it executes. The text builds two frames with h.dfbuilder (the in-class test
# set and the RRUFF_untrained set), drops rows with nulls, concatenates them,
# and splits the result into y_test (last column) and X_test (the rest).
# NOTE(review): `test_df.loc[test_df['label']==9]=6` assigns 6 to EVERY column
# of the matching rows, features included. If the intent is only to relabel
# class 9 as 6, it should be `test_df.loc[test_df['label']==9,'label']=6` --
# confirm before re-enabling.
# NOTE(review): `h` and `synth` are not defined anywhere in this notebook.
"""
#test data path - do not include /content/.../ML Spectroscopy/
testin_path1=r'Data/Preprocessed/Continuous Wavelet Transformation/Test Set/Labeled/'

#build test set, drops samples with Null values
test_df1=h.dfbuilder(testin_path1,synth=synth,split_df=False)
test_df1.dropna(inplace=True)


#test data path - do not include /content/.../ML Spectroscopy/
testin_path2=r'Data/Preprocessed/Continuous Wavelet Transformation/Test Set/RRUFF_untrained/Labeled/'

#build test set, drops samples with Null values
test_df2=h.dfbuilder(testin_path2,synth=True,split_df=False)
test_df2.dropna(inplace=True)

test_df=pd.concat([test_df1,test_df2])

test_df.loc[test_df['label']==9]=6

y_test=test_df[test_df.columns[-1]]
X_test=test_df.drop(test_df.columns[-1],axis=1).values
"""


Build test set with just granite data

In [None]:

# Folder holding the labelled test data (relative to .../ML Spectroscopy/).
testin_path = r'Data/Preprocessed/Continuous Wavelet Transformation/Test Set/Labeled/'

# Load the test frame and discard every sample that contains a null value.
# NOTE(review): `h` is not imported in any cell of this notebook -- it is
# presumably the project's helper module; confirm and import it up top.
test_df = h.dfbuilder(testin_path, synth=False, split_df=False).dropna()

print(test_df)

# The last column holds the labels; every other column is a feature.
X_test = test_df.drop(columns=test_df.columns[-1]).values
y_test = test_df[test_df.columns[-1]]


Running model with limited data sets

In [None]:

# Train and test one model per limited-size training set. Each size has its
# own sub-folder (named after the size) under the folder below.
fin_path=r'Data/Preprocessed/Continuous Wavelet Transformation/Labeled/Limited Sizes/'

for i in [1,2,4,8,16,32,64,128,256,512,1024,2048,4096]:
  #work on a per-run copy so the shared hyperparameters list keeps the batch
  #size configured above and re-running this cell gives the same result
  run_hyperparameters=list(hyperparameters)

  #scale the batch size (index 1 of the list) with the data-set size, never
  #letting it fall below 1
  #NOTE(review): the constants 9, 4, 5 and 411 are undocumented -- confirm
  #what they represent
  run_hyperparameters[1]=max([1,round(((i*9*4)/5)/411)])
  temp_id=id_value+str(i)

  print('\n\n\n\n', 'Model ',i,'\n')
  #the trainer lives in the cnn_cwt module; calling it unqualified raises
  #NameError because only `cnn_cwt` itself is imported
  cnn_model=cnn_cwt.cwt_cnn_model(fin_path+str(i)+'/',mout_path,dev_size,r_state,run_hyperparameters,fast,temp_id,use_trash=trash,threshold=threshold)
  print('\nTest CM\n')
  cnn_cwt.test_cnn_model(cnn_model,X_test,y_test,temp_id,test=True,threshold=threshold,fast=fast)



In [None]:
#print(roc_out)

In [None]:
# DISABLED CELL: the code below is wrapped in a string literal and does not
# run. For each column of roc_out it unpacks (tpr, fpr, thresholds), picks the
# threshold at which tpr - fpr is largest, prints it, and finally prints the
# mean of those per-class thresholds.
# NOTE(review): relies on `roc_out`, which is only created by the test cell
# further up -- run that cell first if this is re-enabled.
"""
opt_thresh=np.array((roc_out.columns.values.astype(float)))
for i in range(len(roc_out.columns)):
  tpr_ls,fpr_ls,thresh_ls=roc_out[roc_out.columns.values[i]]
  #print(np.subtract(tpr_ls,fpr_ls))
  opt_thresh[i]=thresh_ls[np.argmax(np.subtract(tpr_ls,fpr_ls))]
  print('Optimal Threshold for class:',roc_out.columns.values[i],'\n',opt_thresh[i])
print(opt_thresh.sum()*(1.0/len(opt_thresh)))
"""

In [None]:
# DISABLED CELL: the call below is wrapped in a string literal and does not
# run.
# NOTE(review): `.0` is passed as the fifth positional argument, whereas the
# other test_cnn_model calls in this notebook pass test/threshold/fast by
# keyword -- confirm which parameter it binds to before re-enabling.
"""
#tests model, you can also use model.predict outside of this scope
cnn_cwt.test_cnn_model(cnn_model,X_test,y_test,id_value,.0,fast=True)
"""

In [None]:
# Print precision, recall, F1-score and support for the test set, using the
# model outputs converted to class decisions by cnn_cwt.dec_pred at the
# configured threshold.
from sklearn.metrics import classification_report

print(
    classification_report(
        y_test,
        cnn_cwt.dec_pred(cnn_model.predict(X_test), threshold),
    )
)
