In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install xgboost



In [3]:
!pip install imblearn



In [4]:
# Basic data handling libraries
import numpy as np
import pandas as pd
np.random.seed(1)

# Cross validation and hyperparameter tuning libraries
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

from tensorflow.keras.utils import to_categorical
from keras import regularizers
from keras.layers import Input, Conv1D,MaxPooling1D, Flatten,AveragePooling1D
from keras.layers import Conv2D, GlobalMaxPooling1D, BatchNormalization
from keras.layers.core import Reshape
from keras.layers import Dense, Dropout
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from tensorflow.keras.models import Model

# Machine learning classifiers
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

In [5]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()

In [6]:
def CNNFeatureExtractor(num_features,num_labels):

  inputs = Input(shape=(num_features,1))

  conv1 = Conv1D(32,3,activation='relu',padding='same',kernel_initializer='random_normal')(inputs)
  conv1 = Conv1D(32,3,activation='relu',padding='same',kernel_initializer='random_normal')(conv1)
  conv1 = BatchNormalization()(conv1)
  maxpool1 = MaxPooling1D(3)(conv1)

  conv2 = Conv1D(64,3,activation='relu',padding='same',kernel_initializer='random_normal')(maxpool1)
  conv2 = Conv1D(64,3,activation='relu',padding='same',kernel_initializer='random_normal')(conv2)
  conv2 = BatchNormalization()(conv2)
  maxpool2 = MaxPooling1D(3)(conv2)

  conv3 = Conv1D(128,3,activation='relu',padding='same',kernel_initializer='random_normal')(maxpool2)
  conv3 = Conv1D(128,3,activation='relu',padding='same',kernel_initializer='random_normal')(conv3)
  conv3 = BatchNormalization()(conv3)
  maxpool3 = MaxPooling1D(3)(conv3)
  
  conv4 = Conv1D(256,3,activation='relu',padding='same',kernel_initializer='random_normal')(maxpool3)
  conv4 = Conv1D(256,3,activation='relu',padding='same',kernel_initializer='random_normal')(conv4)
  conv4 = BatchNormalization()(conv4)

  dropout1 = Dropout(0.5)(conv4)

  maxpool4 = MaxPooling1D(3)(dropout1)

  flatten = Flatten()(maxpool4)
  
  dense1 = Dense(1024, activation = 'relu')(flatten)
  dropout2 = Dropout(0.2)(dense1)
  dense2 = Dense(128, activation = 'relu')(dense1)
  dropout3 = Dropout(0.2)(dense2)
  dense3 = Dense(num_labels,activation='softmax')(dropout3)
  
  model_without_classifier = Model(inputs=inputs,outputs=flatten)
  model = Model(inputs=inputs,outputs=dense3)

  model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
  
  return model,model_without_classifier

In [7]:
def train_model_for_a_class(x,Label,K,use_smote):
  cvscores_cnn = []
  cvscores_xgboost = []
  avg_score_cnn = 0.0
  avg_score_xgboost = 0.0
  kfold = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

  for train, test in kfold.split(x, Label):
      scl = StandardScaler()

      if use_smote:
        X_train,y_train = smote.fit_resample(x[train],Label[train])
      else:
        X_train,y_train = x[train],Label[train]

      scl = scl.fit(X_train)
      X_train = scl.transform(X_train)
      X_test = scl.transform(x[test])
      y_test = Label[test]

      y_train_categorical = to_categorical(y_train,num_classes=len(np.unique(Label)))
      y_test_categorical = to_categorical(y_test,num_classes=len(np.unique(Label)))
      
      num_features = X_train.shape[1]
      # print("Num features:{}".format(num_features))

      model, feature_extractor = CNNFeatureExtractor(num_features, len(np.unique(Label)))
      
      y_pred = model(X_train)
      # print(y_pred.shape)
      # print(y_train_categorical.shape)

      model.fit(X_train, y_train_categorical, batch_size=32 , epochs=50 , validation_data=(X_test, y_test_categorical),verbose=0)

      score = model.evaluate(X_test, y_test_categorical, verbose = 0)
      cvscores_cnn.append(score[1])
      avg_score_cnn = avg_score_cnn + score[1]

      xgboost_x_train = feature_extractor(X_train)
      xgboost_x_test = feature_extractor(X_test)

      # print(xgboost_x_train.shape)
      # print(xgboost_x_test.shape)

      if len(np.unique(Label)) == 2:
        model = XGBClassifier(objective = "binary:logistic",seed=42)
      else:
        model = XGBClassifier(objective = "multi:softmax",seed=42)

      model.fit(xgboost_x_train, y_train)
      score = model.score(xgboost_x_test, y_test)
      cvscores_xgboost.append(score)
      avg_score_xgboost = avg_score_xgboost + score

  avg_score_cnn = avg_score_cnn/K
  avg_score_xgboost = avg_score_xgboost/K
  return avg_score_cnn, cvscores_cnn, avg_score_xgboost, cvscores_xgboost

In [8]:
def train_model(data_file_path,use_smote=False): 

  data = np.loadtxt(data_file_path, delimiter=",")

  num_samples = data.shape[1]
  num_features = data.shape[0]-3

  x = np.transpose(data[0:num_features:])
  Label_1 = np.transpose(data[num_features:num_features+1,:]); Label_1 = Label_1.astype(int);
  Label_2 = np.transpose(data[num_features+1:num_features+2,:]); Label_2 = Label_2.astype(int);
  Label_3 = np.transpose(data[num_features+2:num_features+3,:]); Label_3 = Label_3.astype(int);

  # print("Number of points in the dataset: {}".format(num_samples))
  # print("Number of features in each datapoint: {}\n".format(num_features))

  # print("Training model for 2 class")
  # avg_score_2_class_cnn, cvscores_2_class_cnn, avg_score_2_class_xgboost, cvscores_2_class_xgboost = train_model_for_a_class(x,Label_1,5,use_smote)
  # print("Results CNN:\n Cross validation scores: {} \n Average accuracy: {}\n".format(cvscores_2_class_cnn, avg_score_2_class_cnn))
  # print("Results XGBoost:\n Cross validation scores: {} \n Average accuracy: {}\n".format(cvscores_2_class_xgboost, avg_score_2_class_xgboost))

  # print("Training model for 4 class")
  # avg_score_4_class_cnn, cvscores_4_class_cnn, avg_score_4_class_xgboost, cvscores_4_class_xgboost = train_model_for_a_class(x,Label_2,5,use_smote)
  # print("Results CNN:\n Cross validation scores: {} \n Average accuracy: {}\n".format(cvscores_4_class_cnn, avg_score_4_class_cnn))
  # print("Results XGBoost:\n Cross validation scores: {} \n Average accuracy: {}\n".format(cvscores_4_class_xgboost, avg_score_4_class_xgboost))

  print("Training model for 10 class")
  avg_score_10_class_cnn, cvscores_10_class_cnn, avg_score_10_class_xgboost, cvscores_10_class_xgboost = train_model_for_a_class(x,Label_3,5,use_smote)
  print("Results CNN:\n Cross validation scores: {} \n Average accuracy: {}\n".format(cvscores_10_class_cnn, avg_score_10_class_cnn))
  print("Results XGBoost:\n Cross validation scores: {} \n Average accuracy: {}\n".format(cvscores_10_class_xgboost, avg_score_10_class_xgboost))

In [None]:
train_model("/content/drive/MyDrive/CEERI Project - Kalit/data/L=1e7/L+H/M=1024/data.csv",use_smote=True)

Number of points in the dataset: 227
Number of features in each datapoint: 1026

Training model for 2 class
Results CNN:
 Cross validation scores: [0.97826087474823, 1.0, 0.7111111283302307, 1.0, 0.35555556416511536] 
 Average accuracy: 0.8089855134487152

Results XGBoost:
 Cross validation scores: [1.0, 0.9565217391304348, 1.0, 1.0, 1.0] 
 Average accuracy: 0.9913043478260869

Training model for 4 class
Results CNN:
 Cross validation scores: [0.97826087474823, 0.54347825050354, 0.7111111283302307, 0.7111111283302307, 0.7555555701255798] 
 Average accuracy: 0.7399033904075623

Results XGBoost:
 Cross validation scores: [0.9782608695652174, 0.9782608695652174, 0.9555555555555556, 1.0, 1.0] 
 Average accuracy: 0.9824154589371981

Training model for 10 class
Results CNN:
 Cross validation scores: [0.41304346919059753, 0.45652174949645996, 0.4000000059604645, 0.5777778029441833, 0.35555556416511536] 
 Average accuracy: 0.4405797183513641

Results XGBoost:
 Cross validation scores: [0.91304

In [None]:
train_model("/content/drive/MyDrive/CEERI Project - Kalit/data/L=1e7/L+H/M=2048/data.csv",use_smote=True)

Number of points in the dataset: 227
Number of features in each datapoint: 2050

Training model for 2 class
Results CNN:
 Cross validation scores: [1.0, 0.3913043439388275, 1.0, 0.6000000238418579, 0.5111111402511597] 
 Average accuracy: 0.7004831016063691

Results XGBoost:
 Cross validation scores: [1.0, 1.0, 1.0, 1.0, 1.0] 
 Average accuracy: 1.0

Training model for 4 class
Results CNN:
 Cross validation scores: [0.47826087474823, 0.6521739363670349, 0.6222222447395325, 0.6222222447395325, 0.6222222447395325] 
 Average accuracy: 0.5994203090667725

Results XGBoost:
 Cross validation scores: [1.0, 1.0, 1.0, 0.9777777777777777, 1.0] 
 Average accuracy: 0.9955555555555555

Training model for 10 class
Results CNN:
 Cross validation scores: [0.3695652186870575, 0.3695652186870575, 0.4444444477558136, 0.42222222685813904, 0.4000000059604645] 
 Average accuracy: 0.4011594235897064

Results XGBoost:
 Cross validation scores: [0.9347826086956522, 0.9130434782608695, 0.9777777777777777, 0.9555

In [None]:
train_model("/content/drive/MyDrive/CEERI Project - Kalit/data/L=1e7/L+H/M=4096/data.csv",use_smote=True)

Number of points in the dataset: 227
Number of features in each datapoint: 4098

Training model for 2 class
Results CNN:
 Cross validation scores: [1.0, 0.8260869383811951, 1.0, 1.0, 1.0] 
 Average accuracy: 0.965217387676239

Results XGBoost:
 Cross validation scores: [1.0, 1.0, 1.0, 1.0, 1.0] 
 Average accuracy: 1.0

Training model for 4 class
Results CNN:
 Cross validation scores: [0.6739130616188049, 0.6304348111152649, 0.7111111283302307, 0.3777777850627899, 0.7555555701255798] 
 Average accuracy: 0.6297584712505341

Results XGBoost:
 Cross validation scores: [0.9782608695652174, 1.0, 1.0, 1.0, 1.0] 
 Average accuracy: 0.9956521739130434

Training model for 10 class
Results CNN:
 Cross validation scores: [0.45652174949645996, 0.3913043439388275, 0.5333333611488342, 0.4888888895511627, 0.4444444477558136] 
 Average accuracy: 0.4628985583782196

Results XGBoost:
 Cross validation scores: [0.9565217391304348, 0.9130434782608695, 1.0, 0.9777777777777777, 1.0] 
 Average accuracy: 0.969

In [None]:
train_model("/content/drive/MyDrive/CEERI Project - Kalit/data/L=1e6/L+H/M=1024/data.csv",use_smote=True)

Number of points in the dataset: 2270
Number of features in each datapoint: 1026

Training model for 2 class
Results CNN:
 Cross validation scores: [1.0, 1.0, 1.0, 1.0, 1.0] 
 Average accuracy: 1.0

Results XGBoost:
 Cross validation scores: [1.0, 1.0, 1.0, 1.0, 1.0] 
 Average accuracy: 1.0

Training model for 4 class
Results CNN:
 Cross validation scores: [0.8480176329612732, 0.9096916317939758, 0.8832598924636841, 0.9030836820602417, 0.7224669456481934] 
 Average accuracy: 0.8533039569854737

Results XGBoost:
 Cross validation scores: [1.0, 0.9933920704845814, 0.9977973568281938, 0.9977973568281938, 0.9933920704845814] 
 Average accuracy: 0.9964757709251101

Training model for 10 class
Results CNN:
 Cross validation scores: [0.7400881052017212, 0.7290748953819275, 0.7268722653388977, 0.8215858936309814, 0.7202643156051636] 
 Average accuracy: 0.7475770950317383

Results XGBoost:
 Cross validation scores: [0.9889867841409692, 0.9911894273127754, 0.9779735682819384, 0.986784140969163, 

In [None]:
train_model("/content/drive/MyDrive/CEERI Project - Kalit/data/L=1e6/L+H/M=2048/data.csv",use_smote=True)

Number of points in the dataset: 2270
Number of features in each datapoint: 2050

Training model for 2 class
Results CNN:
 Cross validation scores: [1.0, 1.0, 1.0, 1.0, 1.0] 
 Average accuracy: 1.0

Results XGBoost:
 Cross validation scores: [1.0, 1.0, 0.9955947136563876, 1.0, 1.0] 
 Average accuracy: 0.9991189427312775

Training model for 4 class
Results CNN:
 Cross validation scores: [0.8854625821113586, 0.9273127913475037, 0.9052863717079163, 0.8766520023345947, 0.8061674237251282] 
 Average accuracy: 0.8801762342453003

Results XGBoost:
 Cross validation scores: [0.9977973568281938, 0.9933920704845814, 0.9977973568281938, 1.0, 0.9977973568281938] 
 Average accuracy: 0.9973568281938325

Training model for 10 class
Results CNN:
 Cross validation scores: [0.6277533173561096, 0.6519823670387268, 0.7026431560516357, 0.7488986849784851, 0.7268722653388977] 
 Average accuracy: 0.691629958152771

Results XGBoost:
 Cross validation scores: [0.9823788546255506, 0.9669603524229075, 0.98678414

In [None]:
train_model("/content/drive/MyDrive/CEERI Project - Kalit/data/L=1e6/L+H/M=4096/data.csv",use_smote=True)

Number of points in the dataset: 2270
Number of features in each datapoint: 4098

Training model for 2 class
Results CNN:
 Cross validation scores: [1.0, 1.0, 1.0, 1.0, 1.0] 
 Average accuracy: 1.0

Results XGBoost:
 Cross validation scores: [0.9977973568281938, 1.0, 0.9977973568281938, 1.0, 1.0] 
 Average accuracy: 0.9991189427312775

Training model for 4 class
Results CNN:
 Cross validation scores: [0.8898678421974182, 0.8964757919311523, 0.933920681476593, 0.9383260011672974, 0.9052863717079163] 
 Average accuracy: 0.9127753376960754

Results XGBoost:
 Cross validation scores: [0.9977973568281938, 0.9955947136563876, 0.9977973568281938, 0.9977973568281938, 0.9977973568281938] 
 Average accuracy: 0.9973568281938325

Training model for 10 class


KeyboardInterrupt: ignored

In [9]:
train_model("/content/drive/MyDrive/CEERI Project - Kalit/data/L=1e6/L+H/M=4096/data.csv",use_smote=True)

Training model for 10 class
Results CNN:
 Cross validation scores: [0.5, 0.5462555289268494, 0.6321585774421692, 0.7114537358283997, 0.8193832635879517] 
 Average accuracy: 0.641850221157074

Results XGBoost:
 Cross validation scores: [0.9845814977973568, 0.9823788546255506, 0.9911894273127754, 0.9669603524229075, 0.9823788546255506] 
 Average accuracy: 0.9814977973568283

