<h1>CNN Using Raw Spectra as Inputs</h1>

<p>This notebook trains and explores a CNN model that classifies minerals using raw spectra as input.</p>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as K
from tensorflow.keras.models import Model

In [2]:
import dev_helper as h
import cnnmodel_raw as cnn_raw

# **Use this space below for fiddling with model training**

### **If you need to retrain the model, be sure to restart the runtime**

In [3]:
# Training hyperparameters -- edit the four values below to tune a run.
learning_rate = 0.0001
batch_size = 100
drop_rate = 0.7
epochs = 50

# Packed in the fixed order [learning_rate, batch_size, drop_rate, epochs];
# leave the following line unchanged.
hyperparameters = [learning_rate, batch_size, drop_rate, epochs]

In [4]:
# Run configuration -- everything other than the hyperparameters.

# Prefix added to every output file name so each run can be identified.
id_value = 'test_1'

# Location of the training data.
fin_path = r'Data/Raw Data/'

# Location that receives the trained-model output.
mout_path = r'Model Data/CNN Model/'

# Share of the training data held out as the dev set (a float x with 0 < x < 1).
dev_size = 0.2

# Seed for the train/dev split so results are repeatable. None gives a
# pseudorandom split that cannot be reproduced.
r_state = 1

# fast: True for quick training that does not output probability weights for
# each sample during training; False for deep analysis and tracking.
fast = True

# trash: True to use trash data in the model.
trash = False

# synth: True for RRUFF data.
synth = False

# threshold: discrimination threshold, a float in the range [0.0, 1.0).
threshold = 0.5


In [5]:
# Train the raw-spectra CNN with the configuration defined in the cells above.
# NOTE(review): the recorded run of this cell ended in
# "ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object
# type int)". Presumably the feature or label array reaches TensorFlow with an
# object dtype -- verify inside cnnmodel_raw / dev_helper.
cnn_model = cnn_raw.raw_cnn_model(
    fin_path,
    mout_path,
    dev_size,
    r_state,
    hyperparameters,
    fast,
    id_value,
    use_trash=trash,
    threshold=threshold,
    raw=True,
)

                                150      151      152      153      154  \
mc_train_015s_5600-0        2966.89  2967.19  2987.83  3008.36  3018.51   
mc_train_015s_5600-1        2592.87  2599.11  2610.90  2617.07  2623.17   
mc_train_015s_5600-2        2538.25  2539.41  2554.91  2567.23  2564.36   
mc_train_015s_5600-3        2216.50  2201.93  2203.68  2218.75  2232.76   
mc_train_015s_5600-4        2116.35  2108.02  2111.91  2141.61  2164.73   
...                             ...      ...      ...      ...      ...   
qtzalb_train_60s_3000-2995  4493.94  4513.53  4572.71  4602.57  4638.74   
qtzalb_train_60s_3000-2996  4305.30  4345.35  4428.01  4455.20  4480.20   
qtzalb_train_60s_3000-2997  4588.90  4632.70  4707.32  4725.48  4743.04   
qtzalb_train_60s_3000-2998  4165.44  4215.77  4286.21  4312.87  4342.73   
qtzalb_train_60s_3000-2999  4293.21  4330.90  4394.62  4421.67  4447.26   

                                155      156      157      158      159  ...  \
mc_train_015s_5600-

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

# **Test the model! Outside of function scope!**

In [None]:

# Evaluate the trained model on the labeled raw test set and save its
# per-sample predictions.

#test data path - do not include /content/.../ML Spectroscopy/
testin_path=r'Data/Raw Data/Labeled Test/'

#build test set
# NOTE(review): no dropna() is applied here; dfbuilder is presumably expected
# to drop samples with Null values itself -- confirm.
test_df=h.dfbuilder(testin_path,synth=False,split_df=False,use_trash=False,raw=False,test=True)

#separates features (every column but the last) and labels (the last column)
y_test=test_df[test_df.columns[-1]]
X_test=test_df.drop(test_df.columns[-1],axis=1).values

#predict once and reuse the result for both the ROC data and the saved CSV
test_probs=cnn_model.predict(X_test)

#ROC data is only built and written for non-fast runs. It is computed inside
#the branch so roc_out is always defined before it is used (previously the
#assignment was commented out, giving a NameError whenever fast was False).
if not fast:
  roc_out=h.roc_all(test_probs,y_test)
  roc_out.to_csv(mout_path+id_value+'roc_data.csv')

#tests model, you can also use model.predict outside of this scope
#(the model module is imported as cnn_raw; the name `cnn` is never defined)
cnn_raw.test_cnn_model(cnn_model,X_test,y_test,id_value,threshold=0.0)

#CNN.csv files used to be saved in the old folder below; they now go to the
#folder with the rest of the outputs
#saved_data=r'/content/drive/My Drive/ML Spectroscopy/Programs/Data Processing/Saved Lists/'
saved_data=r'/content/drive/My Drive/ML Spectroscopy/Model Data/CNN Model/'
pd.DataFrame(data=test_probs,index=test_df.index.values).to_csv(saved_data+id_value+r'CNN.csv')


# Play space for whatever other training/testing you want to do


Testing/Training Parameters

In [None]:

# Training hyperparameters for the play-space experiments -- edit as needed.
learning_rate = 0.001
batch_size = 100
drop_rate = 0.55
epochs = 50

# Packed in the fixed order [learning_rate, batch_size, drop_rate, epochs];
# leave the following line unchanged.
hyperparameters = [learning_rate, batch_size, drop_rate, epochs]


In [None]:

# Run configuration for the play-space experiments.

# Prefix added to every output file name for identification.
id_value = 'multi_size_granite_only_update_with_probs'

# Location of the training data
# (do not include /content/drive/My Drive/ML Spectroscopy/).
fin_path = r'Data/Preprocessed/Continuous Wavelet Transformation/Labeled/'

# Location that receives the trained-model output.
mout_path = r'Model Data/CNN Model/'

# Share of the training data held out as the dev set (a float x with 0 < x < 1).
dev_size = 0.2

# Seed for the train/dev split so results are repeatable. None gives a
# pseudorandom split that cannot be reproduced.
r_state = 1

# fast: True for quick training that does not output probability weights for
# each sample during training; False for deep analysis and tracking.
fast = False

# trash: True to use trash data in the model.
trash = False

# threshold: discrimination threshold, a float in the range [0.0, 1.0).
threshold = 0.0


Build a test set that combines out-of-class samples (RRUFF untrained data) with our in-class test data set

In [None]:

# Build one test frame from two labeled sources, relabel class 9 as class 6,
# then split it into features and labels.

#test data path
testin_path1=r'Data/Preprocessed/Continuous Wavelet Transformation/Test Set/Labeled/'

#build test set, drops samples with Null values
test_df1=h.dfbuilder(testin_path1,synth=synth,split_df=False).dropna()


#test data path - do not include /content/.../ML Spectroscopy/
testin_path2=r'Data/Preprocessed/Continuous Wavelet Transformation/Test Set/RRUFF_untrained/Labeled/'

#build test set, drops samples with Null values
test_df2=h.dfbuilder(testin_path2,synth=True,split_df=False).dropna()

test_df=pd.concat([test_df1,test_df2])

#relabel class 9 as class 6. The column selector is required: without it,
#.loc assigns 6 to EVERY column of the matching rows and wipes their features.
test_df.loc[test_df['label']==9,'label']=6

#separates features (every column but the last) and labels (the last column)
y_test=test_df[test_df.columns[-1]]
X_test=test_df.drop(test_df.columns[-1],axis=1).values



Build test set with just granite data

In [None]:

# Labeled test-set location - do not include /content/.../ML Spectroscopy/
testin_path=r'Data/Preprocessed/Continuous Wavelet Transformation/Test Set/Labeled/'

# Load the test frame and discard every sample that contains a Null value
test_df=h.dfbuilder(testin_path,synth=False,split_df=False).dropna()

print(test_df)

# The last column holds the labels; the remaining columns are the features
label_col=test_df.columns[-1]
y_test=test_df[label_col]
X_test=test_df.drop(label_col,axis=1).values


Running model with limited data sets

In [None]:

# Root folder; each training-set size has its own sub-folder named after it
fin_path=r'Data/Preprocessed/Continuous Wavelet Transformation/Labeled/Limited Sizes/'

# NOTE(review): `cwt_cnn_model` and `cnn_cwt` are not imported or defined in
# any cell shown in this notebook -- confirm where they come from before
# running this cell on a fresh kernel.

# Train and test one model per size: the powers of two from 1 up to 4096
for i in [2**power for power in range(13)]:
  # hyperparameters[1] is the batch size; scale it with the size, never below 1
  hyperparameters[1]=max(1,round((i*9*4)/5/411))
  temp_id=id_value+str(i)

  print('\n\n\n\n', 'Model ',i,'\n')
  cnn_model=cwt_cnn_model(
    fin_path+str(i)+'/',
    mout_path,
    dev_size,
    r_state,
    hyperparameters,
    fast,
    temp_id,
    use_trash=trash,
    threshold=threshold,
  )
  print('\nTest CM\n')
  cnn_cwt.test_cnn_model(
    cnn_model,
    X_test,
    y_test,
    temp_id,
    test=True,
    threshold=threshold,
    fast=fast,
  )



In [None]:
#print(roc_out)

In [None]:
"""
opt_thresh=np.array((roc_out.columns.values.astype(float)))
for i in range(len(roc_out.columns)):
  tpr_ls,fpr_ls,thresh_ls=roc_out[roc_out.columns.values[i]]
  #print(np.subtract(tpr_ls,fpr_ls))
  opt_thresh[i]=thresh_ls[np.argmax(np.subtract(tpr_ls,fpr_ls))]
  print('Optimal Threshold for class:',roc_out.columns.values[i],'\n',opt_thresh[i])
print(opt_thresh.sum()*(1.0/len(opt_thresh)))
"""

In [None]:
"""
#tests model, you can also use model.predict outside of this scope
cnn_cwt.test_cnn_model(cnn_model,X_test,y_test,id_value,.0,fast=True)
"""

In [None]:
#report metrics: precision, recall, f1-score, support for test set
from sklearn.metrics import classification_report
print(classification_report(y_test,cnn_cwt.dec_pred(cnn_model.predict(X_test),threshold)))


In [None]:
# Disabled one-off script: everything below is wrapped in a string literal, so
# it is not executed. It copies the 'label' value from the CWT test frame onto
# the matching rows of the raw test frame (matched by index) and writes the
# result to a CSV under 'Data/Raw Data/Labeled Test/'.
# NOTE(review): before re-enabling, `ignore_ls` must be initialised (it is only
# referenced, never assigned) and `directory` must be defined -- neither is set
# in the cells shown in this notebook. The bare `except:` also hides the reason
# an index is skipped.
'''
#Used this to label the raw test data using the labels from the CWT test data
fin_path='Data/Raw Data/'
use_trash=False
raw=True
test=True

test_raw=h.dfbuilder(fin_path=fin_path,directory=directory,split_df=False,use_trash=use_trash,raw=raw,test=test)

fin_path='Data/Preprocessed/Continuous Wavelet Transformation/Test Set/Labeled/'
use_trash=False
raw=False
test=True

test_labeled=h.dfbuilder(fin_path=fin_path,directory=directory,split_df=False,use_trash=use_trash,raw=raw,test=test)

ignore_ls
final_df=pd.DataFrame()
for idx,row in test_raw.iterrows():
  try:
    row['label']=test_labeled.loc[idx,'label']
    final_df[idx]=row
  except:
    ignore_ls.append(idx)
final_df_t=final_df.transpose()
display(final_df_t)

final_df_t.to_csv(directory+'Data/Raw Data/Labeled Test/L_raw_gabbro_granite_0dust_test_015s_937_4840.csv')
'''