This notebook will:
1. Load the data for one configuration
2. Pre-process the data
3. Train ANNs on the data
4. Save the results back to Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem so the dataset files are
# reachable under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# List the dataset directory on the mounted drive (long format, including hidden files, oldest first).
!ls -lart /content/gdrive/MyDrive/GEM5_GLSVLSI_2022/GEM5-GLSVLSI_2022/

total 18632831
-rw------- 1 root root       1086 Feb  1 23:04 submit-convertGLVLSI.sh
-rw------- 1 root root       1147 Feb  2 02:26 convertGLVLSI.py
-rw------- 1 root root 1588901725 Feb  2 04:33 config_3p1_all.npz
-rw------- 1 root root 1587724566 Feb  2 04:43 config_3p2_all.npz
-rw------- 1 root root 1599821713 Feb  2 04:52 config_3p3_all.npz
-rw------- 1 root root 1586160348 Feb  2 05:02 config_3p4_all.npz
-rw------- 1 root root 1597561046 Feb  2 05:11 config_4p1_all.npz
-rw------- 1 root root 1584649757 Feb  2 05:21 config_4p2_all.npz
-rw------- 1 root root 1587983791 Feb  2 05:31 config_4p3_all.npz
-rw------- 1 root root 1588961462 Feb  2 05:40 config_4p4_all.npz
-rw------- 1 root root 1588669616 Feb  2 05:50 config_5p1_all.npz
-rw------- 1 root root 1588576891 Feb  2 05:59 config_5p2_all.npz
-rw------- 1 root root 1587796087 Feb  2 06:09 config_5p3_all.npz
-rw------- 1 root root 1593198510 Feb  2 06:19 config_5p4_all.npz
-rw------- 1 root root       6436 Feb  3 05:29 Example.ipy

In [None]:
import numpy as np
import pandas as pd

import tensorflow
import tensorflow.keras as keras
from keras.models import Sequential, load_model
#from keras.layers.core import Dense, Activation, Dropout ##This was for FNN before 19/11/21
from keras.layers import Dense, Activation, Dropout, LSTM, GRU, Bidirectional, Conv1D, MaxPooling1D, Flatten,AveragePooling1D
from keras.utils import np_utils
from keras.callbacks import CSVLogger, TensorBoard, ModelCheckpoint, EarlyStopping
from keras.layers import BatchNormalization

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler, StandardScaler

from sys import getsizeof

In [None]:
## Load the data
dirPath = "/content/gdrive/MyDrive/GEM5_GLSVLSI_2022/GEM5-GLSVLSI_2022/"
configNum = "3p1"
## Strip a trailing slash before joining so the path is valid whether or not
## dirPath ends with "/" (and never contains "//").
npzData = dirPath.rstrip("/") + "/config_" + configNum + "_all.npz"

##Note this should be "Xt", but since I am using lower sized RAM, 12Gb, I cannot load train which is 20Gb.
## Open the archive once and close it as soon as both arrays are read;
## np.load on an .npz returns a lazy NpzFile that keeps a file handle open.
with np.load(npzData) as npzFile:
  x_train = npzFile["Xd"]
  y_train = npzFile["yd"]

##Convert y_train to one-hot
n_classes = 256
y_train_oh = np_utils.to_categorical(y_train, n_classes)

print("config= %s, x_train shape= %s, y_train_oh shape= %s" %(configNum, x_train.shape, y_train_oh.shape))

config= 3p1, x_train shape= (256000, 1500), y_train_oh shape= (256000, 256)


In [None]:

## Size of the raw feature array in bytes. x_trainOrig was previously used
## without being defined in this notebook, which fails on a fresh kernel.
## ndarray.nbytes counts the element data itself, whereas sys.getsizeof only
## reports the small object header when the array does not own its buffer.
x_trainOrig = x_train.nbytes
print("x_train size before pre-processing is %s Mb" %(x_trainOrig/1024/1024))
print("data type is %s" %(x_train.dtype))

[0.08567 0.09672 0.07224 ... 0.10435 0.16901 0.0953 ]
x_train size before pre-processing is 0.0001068115234375 Mb
data type is float64


In [None]:
## Standard scaling produces tiny values (e.g. e-17) that numpy prints in scientific notation.
## suppress=False (numpy's default) keeps scientific notation for such values;
## set suppress=True to print them as plain fixed-point numbers instead.
## This affects only the print options and not storage.
np.set_printoptions(suppress=False)

In [None]:
##Pre-processing
## Fit a per-feature scaler on x_train and replace x_train with the scaled copy.

processingType = "stdScale" ##or robustScale

if(processingType == "robustScale"):
  transformer = RobustScaler().fit(x_train)
elif(processingType == "stdScale"):
  transformer = StandardScaler().fit(x_train)
else:
  ## Fail loudly: merely printing here would fall through to transform() and
  ## either raise a NameError or silently reuse a transformer left over from
  ## an earlier run of this cell.
  raise ValueError("Give correct processing type, got %r" % (processingType,))

x_train = transformer.transform(x_train)
##Print may look like all 0, but we have non-zero values quite a bit
print(x_train[0])
## Shape of the non-zero entries of the first sample
print(x_train[0][x_train[0]!=0].shape)


[ 5.55111512e-17  5.55111512e-17 -2.77555756e-17 ... -4.16333634e-17
 -8.32667268e-17  4.16333634e-17]
(1455,)


In [None]:
## Report the footprint of the scaled array as seen by sys.getsizeof,
## converted from bytes to Mb.
x_trainPost = getsizeof(x_train)
x_trainPostMb = x_trainPost/1024/1024
print("x_train size after pre-processing is %s Mb" %(x_trainPostMb))

x_train size after pre-processing is 2929.6876068115234 Mb


In [None]:
## Optional clean-up (currently disabled): zero out entries whose magnitude is below 1e-7.
#x_train[np.abs(x_train)<1e-7]=0
## Show ten feature values (columns 1200-1209) of the first sample.
x_train[0,1200:1210]

array([-7.31541843e-01,  0.00000000e+00,  1.39345463e+00,  7.08905334e-01,
       -7.15802550e-01, -4.16333634e-17,  1.43449980e+00,  2.89459383e+00,
       -4.16333634e-17,  9.70438125e-01])

In [None]:
## Re-measure the footprint of x_train (bytes -> Mb) after the inspection cell above.
x_trainPost = getsizeof(x_train)
x_trainPostMb = x_trainPost/1024/1024
print("x_train size after pre-processing is %s Mb" %(x_trainPostMb))

x_train size after pre-processing is 2929.6876068115234 Mb


In [None]:
## Show the element dtype of x_train after scaling.
x_train.dtype

dtype('float64')

In [None]:
##Create ANNs
###----------------FNN---------------###
## Feed-forward network: two ReLU hidden layers (64 and 32 units), each
## followed by 10% dropout, then a softmax layer with one unit per class.
## The input width and the number of output units are taken from the data
## (x_train.shape[1], n_classes) instead of being repeated as literals, so the
## model stays consistent with the loaded arrays and the one-hot labels.
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(x_train.shape[1],)))
model.add(Dropout(0.1))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(n_classes, activation='softmax'))

###----------------FNN---------------###

###----------------CNN---------------###
###----------------CNN---------------###


###----------------RNN---------------###
###----------------RNN---------------###

In [None]:
## Compile for multi-class classification on one-hot labels.
model.compile(loss='categorical_crossentropy', metrics=['categorical_accuracy'], optimizer='adam')
## summary() prints the layer table itself and returns None, so wrapping it in
## print() only appends a stray "None" to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               768512    
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               65664     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 256)               33024     
                                                                 
Total params: 867,200
Trainable params: 867,200
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
##Train the model, use CV for this
## 3-fold stratified cross-validation. Every fold trains a freshly initialised
## copy of the network: reusing one model across folds would let folds 2 and 3
## validate on samples the model was already trained on, inflating the scores.
skf = StratifiedKFold(n_splits=3)
foldScores = []
for foldCount, (train_idx, dev_idx) in enumerate(skf.split(x_train, y_train), start=1):
  print('##--------------Fold{}---------------##'.format(foldCount))

  ## clone_model rebuilds the same architecture with newly initialised weights;
  ## the clone is uncompiled, so compile it again (this also resets the optimizer state).
  ## `model` is rebound so that, after the loop, it refers to the last fold's trained network.
  model = keras.models.clone_model(model)
  model.compile(loss='categorical_crossentropy', metrics=['categorical_accuracy'], optimizer='adam')

  history = model.fit(x_train[train_idx], y_train_oh[train_idx],
                      validation_data=(x_train[dev_idx], y_train_oh[dev_idx]),
                      batch_size=1024, epochs=5)

  ##Evaluate the fold, probably save the model, but will do it later
  ## score is [loss, categorical_accuracy]; index 1 is the accuracy metric.
  score = model.evaluate(x=x_train[dev_idx], y=y_train_oh[dev_idx], batch_size=1024)
  foldScores.append(score[1])
  print("Score of Fold%s, is %s" %(foldCount, score[1]))
  print('##--------------Fold{}---------------##\n\n'.format(foldCount))


##--------------Fold1---------------##
Epoch 1/5