In [1]:
import numpy as np                                     # linear algebra
import pandas as pd                                    # data processing, CSV file I/O (e.g. pd.read_csv)
import copy                                            #to copy list
from sklearn.model_selection import train_test_split   #to split dataset into train and test set
from sklearn.svm import SVC                            #to create svc instance
from sklearn.metrics import classification_report      #to create report for precision,recall,f1-score,accuracy
from sklearn import metrics                            #to get accuracy
from sklearn.model_selection import GridSearchCV       #to optimise the hyper-parameter
from sklearn.ensemble import RandomForestClassifier    #to create rf instance

In [2]:
# Load the protein dataset (one row per chain: sequence, 8-state and 3-state
# secondary structure, length, non-standard-residue flag) and preview it.
dataset_path = 'Dataset1.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,pdb_id,chain_code,seq,sst8,sst3,len,has_nonstd_aa
0,1A30,C,EDL,CBC,CEC,3,False
1,1B05,B,KCK,CBC,CEC,3,False
2,1B0H,B,KAK,CBC,CEC,3,False
3,1B1H,B,KFK,CBC,CEC,3,False
4,1B2H,B,KAK,CBC,CEC,3,False


In [3]:
# Partition the chains by length; chains containing non-standard amino acids
# are dropped from both partitions.
maxlen_seq = 10
only_standard_aa = ~df['has_nonstd_aa']
is_short_chain = df['len'] <= maxlen_seq
is_long_chain = df['len'] > maxlen_seq
columns = ['seq', 'sst8']

# Short chains (len <= maxlen_seq) feed the SVC classifier.
input_seqs_svc, target_seqs_svc = df.loc[is_short_chain & only_standard_aa, columns].values.T
print("Total SVC Dataset :",len(input_seqs_svc))
print(input_seqs_svc[0:5])
print(" ")

# Longer chains (len > maxlen_seq) feed the random forest classifier.
input_seqs_rf, target_seqs_rf = df.loc[is_long_chain & only_standard_aa, columns].values.T
print("Total RF Dataset :",len(input_seqs_rf))
print(input_seqs_rf[0:5])

Total SVC Dataset : 3140
['EDL' 'KCK' 'KAK' 'KFK' 'KAK']
 
Total RF Dataset : 383193
['EPQYEEIPIYL' 'EPQYEEIPIYL' 'IKENLKDCGLF' 'GVQSLKRRRCF' 'SVLYTAVQPNE']


In [4]:
# Show the first few 8-state target strings and the row count of each partition
# (SVC partition first, then RF partition).
for targets in (target_seqs_svc, target_seqs_rf):
    print(targets[0:5])
    print(targets.size)

['CBC' 'CBC' 'CBC' 'CBC' 'CBC']
3140
['CCSSCCCCCCC' 'CCSSCCSCCCC' 'CCHHHHTTTCC' 'CCCEEEEEEEC' 'CCCBCCEECCC']
383193


In [5]:
# Sanity check: every secondary-structure string must be exactly as long as
# its primary sequence. Any offending row is printed; no output means all match.
for row, (primary, secondary) in enumerate(zip(input_seqs_svc, target_seqs_svc)):
    if len(secondary) != len(primary):
        print("(",row,") Secondary_Structure ->", secondary," Primary_Structure -> ",primary)

for row, (primary, secondary) in enumerate(zip(input_seqs_rf, target_seqs_rf)):
    if len(secondary) != len(primary):
        print("(",row,") Secondary_Structure ->", secondary," Primary_Structure -> ",primary)

In [6]:
# Total number of characters (residues / structure labels) in each partition.
# Rows whose two strings differ in length are reported before the totals.
secondary_count_svc = sum(len(secondary) for secondary in target_seqs_svc)
primary_count_svc = sum(len(primary) for primary in input_seqs_svc)
for row, (primary, secondary) in enumerate(zip(input_seqs_svc, target_seqs_svc)):
    if len(secondary) != len(primary):
        print("(",row,") Secondary_Structure ->", secondary," Primary_Structure -> ",primary)

print("count of secondary structure for SVC : ",secondary_count_svc)
print("count of primary structure for SVC : ",primary_count_svc)
print(" ")


secondary_count_rf = sum(len(secondary) for secondary in target_seqs_rf)
primary_count_rf = sum(len(primary) for primary in input_seqs_rf)
for row, (primary, secondary) in enumerate(zip(input_seqs_rf, target_seqs_rf)):
    if len(secondary) != len(primary):
        print("(",row,") Secondary_Structure ->", secondary," Primary_Structure -> ",primary)

print("count of secondary structure for RF : ",secondary_count_rf)
print("count of primary structure for RF : ",primary_count_rf)

count of secondary structure for SVC :  24906
count of primary structure for SVC :  24906
 
count of secondary structure for RF :  101297641
count of primary structure for RF :  101297641


In [7]:
def split(sequence):
    """Return the items of ``sequence`` as a new list (one entry per character for a string)."""
    return list(sequence)

In [8]:
# Break every chain (and its label string) into a list of single characters,
# and record the longest chain seen in each partition.
primary_split_svc = [split(chain) for chain in input_seqs_svc]
secondary_split_svc = [split(labels) for labels in target_seqs_svc]
svc_sz = max((len(chain) for chain in input_seqs_svc), default=0)

print("Number of Dataset for SVC :",len(secondary_split_svc))
print("Maximum length of Data for SVC :",svc_sz)
print(" ")


# Only the first 1/80 of the RF partition (in file order) is used.
rf_rows = int(len(target_seqs_rf)/80)
primary_split_rf = [split(chain) for chain in input_seqs_rf[:rf_rows]]
secondary_split_rf = [split(labels) for labels in target_seqs_rf[:rf_rows]]
rf_sz = max((len(chain) for chain in input_seqs_rf[:rf_rows]), default=0)

print("Number of Dataset for RF :",len(secondary_split_rf))
print("Maximum length of Data for RF :",rf_sz)

Number of Dataset for SVC : 3140
Maximum length of Data for SVC : 10
 
Number of Dataset for RF : 4789
Maximum length of Data for RF : 20


In [9]:
def orthogonal_primary(arg):
    """One-hot encode a one-letter amino-acid code.

    Parameters
    ----------
    arg : str
        One of the 20 standard amino-acid letters.

    Returns
    -------
    numpy.ndarray or None
        A fresh length-20 integer vector with a single 1 at the residue's
        position, or ``None`` when ``arg`` is not one of the 20 known letters.
    """
    # The position of a letter in this string is the index of its hot bit.
    residue_order = "ACEDGFIHKMLNQPSRTWVY"
    hot_index = {residue: position for position, residue in enumerate(residue_order)}.get(arg)
    if hot_index is None:
        return None

    one_hot = np.zeros(len(residue_order), dtype=int)
    one_hot[hot_index] = 1
    return one_hot

def orthogonal_secondary(arg):
    """Map an 8-state secondary-structure letter to its integer class id.

    Class ids:
        H -> 0  (alpha-helix)
        C -> 1  (loops and irregular elements)
        E -> 2  (beta-strand)
        B -> 3  (beta-bridge)
        G -> 4  (3-helix)
        I -> 5  (pi-helix)
        T -> 6  (turn)
        S -> 7  (bend)

    Returns ``None`` when ``arg`` is not one of these letters.
    """
    # The position of a letter in this string is its class id.
    state_order = "HCEBGITS"
    class_ids = {state: class_id for class_id, state in enumerate(state_order)}
    return class_ids.get(arg)

In [10]:
# Replace every residue letter with its one-hot vector, in place.
# NOTE: this cell is not re-runnable without first re-running the split cell:
# on a second run the entries are already arrays, which cannot be used as
# dictionary keys inside orthogonal_primary.
for split_rows in (primary_split_svc, primary_split_rf):
    for sequence in split_rows:
        sequence[:] = [orthogonal_primary(residue) for residue in sequence]

In [11]:
# Replace every secondary-structure letter with its integer class id, in place.
# NOTE: re-running this cell without re-running the split cell silently turns
# every label into None, because the already-encoded integers are not keys of
# the letter -> id table in orthogonal_secondary.
for split_rows in (secondary_split_svc, secondary_split_rf):
    for sequenceS in split_rows:
        sequenceS[:] = [orthogonal_secondary(state) for state in sequenceS]

In [12]:
# Preview the one-hot encoded residues of the first five SVC chains.
primary_split_svc[:5]

[[array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])],
 [array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])],
 [array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])],
 [array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])],
 [array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  

In [13]:
# Preview the one-hot encoded residues of the first five RF chains.
primary_split_rf[:5]

[[array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),
  array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])],
 [array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
  arr

In [14]:
# Preview the integer class ids of the first five SVC chains.
secondary_split_svc[:5]

[[1, 3, 1], [1, 3, 1], [1, 3, 1], [1, 3, 1], [1, 3, 1]]

In [15]:
# Preview the integer class ids of the first five RF chains.
secondary_split_rf[:5]

[[1, 1, 7, 7, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 7, 7, 1, 1, 7, 1, 1, 1, 1],
 [1, 1, 0, 0, 0, 0, 6, 6, 6, 1, 1],
 [1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1],
 [1, 1, 1, 3, 1, 1, 2, 2, 1, 1, 1]]

In [16]:
def graph_sum2(seq1,seq2):
    """Element-wise sum of two vectors, returned as a list.

    The result has ``len(seq1)`` entries; ``seq2`` must be at least that long
    (any extra trailing entries of ``seq2`` are ignored).
    """
    return [seq1[col] + seq2[col] for col in range(len(seq1))]

def graph_sum3(seq1,seq2,seq3):
    """Element-wise sum of three vectors, returned as a list.

    The result has ``len(seq1)`` entries; ``seq2`` and ``seq3`` must be at
    least that long (any extra trailing entries are ignored).
    """
    return [seq1[col] + seq2[col] + seq3[col] for col in range(len(seq1))]

In [17]:
# Neighbour aggregation for the SVC chains: each residue's feature vector is
# the sum of its own one-hot vector and those of its immediate neighbours in
# the chain (two terms at either end of the chain, three elsewhere).
graph_input_svc = []
for sequence in primary_split_svc:
    last = len(sequence) - 1
    aggregated = [graph_sum2(sequence[0], sequence[1])]
    aggregated.extend(
        graph_sum3(sequence[col - 1], sequence[col], sequence[col + 1])
        for col in range(1, last)
    )
    aggregated.append(graph_sum2(sequence[last], sequence[last - 1]))
    graph_input_svc.append(aggregated)

graph_input_svc[0:5]

[[[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 [[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 [[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 [[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 [[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]

In [18]:
# Neighbour aggregation for the RF chains: each residue's feature vector is
# the sum of its own one-hot vector and those of its immediate neighbours in
# the chain (two terms at either end of the chain, three elsewhere).
graph_input_rf = []
for sequence in primary_split_rf:
    last = len(sequence) - 1
    aggregated = [graph_sum2(sequence[0], sequence[1])]
    aggregated.extend(
        graph_sum3(sequence[col - 1], sequence[col], sequence[col + 1])
        for col in range(1, last)
    )
    aggregated.append(graph_sum2(sequence[last], sequence[last - 1]))
    graph_input_rf.append(aggregated)

graph_input_rf[0:5]

[[[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1],
  [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
  [0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1]],
 [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1],
  [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [19]:
def targetY(data_list):
    """Flatten a list of per-chain label lists into one flat list, preserving order."""
    Y = []
    for chain_labels in data_list:
        Y.extend(chain_labels)
    return Y

In [20]:
# One label per residue: flatten the per-chain class-id lists of each partition.
y_label_svc, y_label_rf = (
    targetY(chain_labels) for chain_labels in (secondary_split_svc, secondary_split_rf)
)

In [21]:
# Number of per-residue labels and the first few of them, SVC then RF.
for labels in (y_label_svc, y_label_rf):
    print(len(labels))
    print(labels[0:5])

24906
[1, 3, 1, 1, 3]
68123
[1, 1, 7, 7, 1]


In [22]:
def window_padding_data(size, sequence, feature_dim=20, verbose=True):
    """Cut every chain into sliding windows of ``size`` residues, zero-padded at both ends.

    Each chain is padded with ``size // 2`` all-zero vectors on either side and
    then every run of ``size`` consecutive entries is emitted as one window.
    For an odd ``size`` this yields exactly one window per residue, centred on
    that residue. For an even ``size`` each chain yields one window more than
    it has residues.

    Unlike the previous implementation, the input is NOT modified: padding is
    applied to a copy, so calling the function (or re-running the calling
    cell) repeatedly always gives the same result.

    Parameters
    ----------
    size : int
        Window length in residues.
    sequence : list
        List of chains; each chain is a list of per-residue feature vectors.
    feature_dim : int, optional
        Length of the zero vector used for padding (default 20, one entry per
        amino acid).
    verbose : bool, optional
        When true (default), print the length of the first few chains (at most
        five) before and after padding.

    Returns
    -------
    list
        Flat list of windows over all chains in order; each window is a list
        of ``size`` feature vectors.
    """
    half_window = size // 2
    # Preview at most five chains; fewer if the input is shorter than that.
    preview_count = min(5, len(sequence))

    if verbose:
        for i in range(preview_count):
            print("Before :",len(sequence[i]))
        print("")

    zeros = np.zeros(feature_dim, dtype=int)
    padding = [zeros] * half_window
    # Build padded copies rather than appending to the caller's lists.
    padded_chains = [padding + list(chain) + padding for chain in sequence]

    if verbose:
        for i in range(preview_count):
            print("After :",len(padded_chains[i]))

    X = []
    for chain in padded_chains:
        for start in range(len(chain) - (size - 1)):
            X.append(chain[start:start + size])

    return X

In [23]:
# Build 11-residue sliding windows over the aggregated SVC features.
X_svc = window_padding_data(size=11, sequence=graph_input_svc)
print(len(X_svc))
X_svc[:5]

Before : 3
Before : 3
Before : 3
Before : 3
Before : 3

After : 13
After : 13
After : 13
After : 13
After : 13


[[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])],
 [array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0,

In [24]:
# Build 11-residue sliding windows over the aggregated RF features.
X_rf = window_padding_data(size=11, sequence=graph_input_rf)
print(len(X_rf))
X_rf[:5]

Before : 11
Before : 11
Before : 11
Before : 11
Before : 11

After : 21
After : 21
After : 21
After : 21
After : 21


[[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1],
  [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
  [0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 [array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [25]:
# Print arrays in full (no "..." summarisation) from here on.
np.set_printoptions(threshold=np.inf)

# Flatten each window of 11 residues x 20 features into one 220-wide row.
n_windows_svc = len(X_svc)
X_svc = np.array(X_svc).reshape(n_windows_svc, 11 * 20)
y_label_svc = np.array(y_label_svc)
print(X_svc[:5])

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0
  0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 

In [26]:
# Flatten each window of 11 residues x 20 features into one 220-wide row.
n_windows_rf = len(X_rf)
X_rf = np.array(X_rf).reshape(n_windows_rf, 11 * 20)
y_label_rf = np.array(y_label_rf)
print(X_rf[:5])

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
  0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 0 1 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0
  0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 2 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0
  0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 

In [27]:
# The feature matrix and the label vector must have the same number of rows.
for rows in (X_svc, y_label_svc):
    print(len(rows))

24906
24906


In [180]:
# Hold out 20% of the windows of each partition for testing; the fixed seed
# keeps the split reproducible.
# NOTE(review): the split is made per window, so overlapping windows from the
# same chain can end up on both sides of the split - confirm this is intended.
split_options = dict(test_size=0.20, random_state=54)

x_train_svc, x_test_svc, y_train_svc, y_test_svc = train_test_split(
    X_svc, y_label_svc, **split_options
)

x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(
    X_rf, y_label_rf, **split_options
)

In [181]:
import time

# Time the training and prediction of both models together.
start_time = time.time()

# Short chains: RBF-kernel support vector classifier.
svc = SVC(kernel='rbf', gamma = 0.1, C=1.5)
svc.fit(x_train_svc, y_train_svc)
y_pred_svc = svc.predict(x_test_svc)

# Longer chains: random forest.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train_rf, y_train_rf)
y_pred_rf = rf.predict(x_test_rf)

print("--- %s seconds ---" % (time.time() - start_time))
print(" ")

# Pool the two test sets (SVC rows first, then RF rows) so that a single
# accuracy figure and report cover both models. y_true holds the same values
# as y_test.
y_test = np.array(list(y_test_svc) + list(y_test_rf))
y_pred = np.array(list(y_pred_svc) + list(y_pred_rf))
y_true = y_test.copy()


print("Accuracy = ",metrics.accuracy_score(y_test, y_pred)*100)
print(classification_report(y_true,y_pred))

--- 192.83130168914795 seconds ---
 
Accuracy =  86.00526683506207
              precision    recall  f1-score   support

           0       0.88      0.87      0.88      2608
           1       0.87      0.96      0.91     11198
           2       0.89      0.79      0.84      1935
           3       0.81      0.54      0.65       376
           4       0.69      0.59      0.64       342
           6       0.69      0.44      0.54       941
           7       0.75      0.54      0.63      1207

    accuracy                           0.86     18607
   macro avg       0.80      0.68      0.73     18607
weighted avg       0.85      0.86      0.85     18607

