# Pre-Processing


## Installing required packages

In [1]:
!pip install biopython

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


## Loading libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from sklearn.metrics import accuracy_score
from Bio.Align import AlignInfo
from Bio import AlignIO


## getting the data

In [3]:
# Input locations, kept together so they are easy to adjust.
TRAIN_CSV = 'mlba_train.csv'  # Edit the path for your data here
VALID_CSV = 'valid.csv'

training = pd.read_csv(TRAIN_CSV)
validation = pd.read_csv(VALID_CSV)

In [4]:
training.describe()  # summarise the dataset to spot anomalies, if any (e.g. repeated sequences)

Unnamed: 0,ID,Type,Sequence
count,3049,3049,3049
unique,3049,2,3048
top,D2750,NDNA,MKIIAYGARVDEIQYFKQWAKDTGNTLEYHTEFLDENTVEWAKGFD...
freq,1,1750,2


### Deleting the duplicated sequence

In [5]:
# Row label 2119 is presumably one of the two rows that share the duplicated
# sequence reported by describe() - NOTE(review): confirm against the data.
#
# reset_index() stores the old labels in a new 'index' column. Use that column as a
# "this cell already ran" marker: without the guard, re-running the cell would drop
# whichever row now carries label 2119 and then fail because 'index' already exists.
if 'index' not in training.columns:
    training = training.drop(index=2119).reset_index()

In [6]:
training.head()

Unnamed: 0,index,ID,Type,Sequence
0,0,D1001,DNA,EPATILLIDDHPMLRTGVKQLISMAPDITVVGEASNGEQGIELAES...
1,1,D1005,DNA,MKRESHKHAEQARRNRLAVALHELASLIPAEWKQQNVSAAPSKATT...
2,2,D1008,DNA,RPYACPVESCDRRFSRSADLTRHIRIHTG
3,3,D1010,DNA,GPYLVIVEQPKQRGFRFRYGCEGPSHGGLPGASSEKGRKTYPTVKI...
4,4,D1015,DNA,ALTNAQILAVIDSWEETVGQFPVITHHVPLGGGLQGTLHCYEIPLA...


### Encoding the labels as integers for the classifiers (DNA → 1, NDNA → -1)

In [7]:
# Replace each class name with its integer code. Exact-match comparison means
# 'NDNA' rows are untouched by the 'DNA' pass, and re-running the cell is a no-op.
label_codes = {'DNA': 1, 'NDNA': -1}
for class_name, code in label_codes.items():
    training.loc[training[' Type'] == class_name, ' Type'] = code
training.head()

Unnamed: 0,index,ID,Type,Sequence
0,0,D1001,1,EPATILLIDDHPMLRTGVKQLISMAPDITVVGEASNGEQGIELAES...
1,1,D1005,1,MKRESHKHAEQARRNRLAVALHELASLIPAEWKQQNVSAAPSKATT...
2,2,D1008,1,RPYACPVESCDRRFSRSADLTRHIRIHTG
3,3,D1010,1,GPYLVIVEQPKQRGFRFRYGCEGPSHGGLPGASSEKGRKTYPTVKI...
4,4,D1015,1,ALTNAQILAVIDSWEETVGQFPVITHHVPLGGGLQGTLHCYEIPLA...


### Converting each sequence to its amino acid composition

In [8]:
import collections

def merge_dicts(dicts):
    """Collect the values of several dicts into one dict of lists.

    Args:
        dicts: iterable of mappings.

    Returns:
        A plain dict mapping every key seen to the list of its values, in the
        order the input dicts were visited. A key that is absent from some dict
        contributes nothing for that dict, so the lists may differ in length.
    """
    combined = {}
    for mapping in dicts:
        for key, value in mapping.items():
            # Create the list on first sight of the key, then append to it.
            combined.setdefault(key, []).append(value)
    return combined

# One composition dict per training sequence, in row order.
seq_dictionary = [
  ProteinAnalysis(sequence).get_amino_acids_percent()
  for sequence in training[' Sequence']
]

# Turn the per-sequence dicts into per-residue columns.
merged = merge_dicts(seq_dictionary)
percent_comp = pd.DataFrame(merged)
percent_comp.head()

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
0,0.094203,0.0,0.07971,0.094203,0.007246,0.072464,0.021739,0.050725,0.043478,0.173913,0.043478,0.028986,0.050725,0.021739,0.028986,0.065217,0.043478,0.072464,0.0,0.007246
1,0.174603,0.015873,0.0,0.079365,0.0,0.015873,0.063492,0.031746,0.063492,0.079365,0.015873,0.047619,0.031746,0.079365,0.095238,0.079365,0.047619,0.047619,0.015873,0.015873
2,0.068966,0.068966,0.068966,0.034483,0.034483,0.034483,0.068966,0.068966,0.0,0.034483,0.0,0.0,0.068966,0.0,0.206897,0.103448,0.068966,0.034483,0.0,0.034483
3,0.054348,0.021739,0.027174,0.065217,0.027174,0.086957,0.032609,0.043478,0.081522,0.092391,0.027174,0.021739,0.076087,0.059783,0.070652,0.081522,0.038043,0.070652,0.0,0.021739
4,0.061728,0.049383,0.024691,0.030864,0.024691,0.117284,0.067901,0.037037,0.024691,0.08642,0.0,0.061728,0.080247,0.049383,0.04321,0.030864,0.067901,0.08642,0.030864,0.024691


In [9]:
training.tail()

Unnamed: 0,index,ID,Type,Sequence
3043,3044,N5408,-1,MANMLYFSLLALLFMTGIASEGTISSGLASLKAKIDAKRPSGKQLF...
3044,3045,N5410,-1,MTFRKSFDCYDFYDRAKVGEKCTQDDWDLMKIPMKAMELKQKYGLD...
3045,3046,N5412,-1,MSPYFKLSSALIFLAITMEALCSPIENTSTSNKDNDKETEHIEISA...
3046,3047,N5414,-1,MGRICPVNSRARRLRARPGRPSGDSLPYHQLQGGAPRLWSPDPGRP...
3047,3048,N5416,-1,MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNI...


In [10]:
# Place the composition columns next to the original columns; rows are matched on the index.
final_training = pd.concat(objs=[training, percent_comp], axis='columns')
final_training.head()

Unnamed: 0,index,ID,Type,Sequence,A,C,D,E,F,G,...,M,N,P,Q,R,S,T,V,W,Y
0,0,D1001,1,EPATILLIDDHPMLRTGVKQLISMAPDITVVGEASNGEQGIELAES...,0.094203,0.0,0.07971,0.094203,0.007246,0.072464,...,0.043478,0.028986,0.050725,0.021739,0.028986,0.065217,0.043478,0.072464,0.0,0.007246
1,1,D1005,1,MKRESHKHAEQARRNRLAVALHELASLIPAEWKQQNVSAAPSKATT...,0.174603,0.015873,0.0,0.079365,0.0,0.015873,...,0.015873,0.047619,0.031746,0.079365,0.095238,0.079365,0.047619,0.047619,0.015873,0.015873
2,2,D1008,1,RPYACPVESCDRRFSRSADLTRHIRIHTG,0.068966,0.068966,0.068966,0.034483,0.034483,0.034483,...,0.0,0.0,0.068966,0.0,0.206897,0.103448,0.068966,0.034483,0.0,0.034483
3,3,D1010,1,GPYLVIVEQPKQRGFRFRYGCEGPSHGGLPGASSEKGRKTYPTVKI...,0.054348,0.021739,0.027174,0.065217,0.027174,0.086957,...,0.027174,0.021739,0.076087,0.059783,0.070652,0.081522,0.038043,0.070652,0.0,0.021739
4,4,D1015,1,ALTNAQILAVIDSWEETVGQFPVITHHVPLGGGLQGTLHCYEIPLA...,0.061728,0.049383,0.024691,0.030864,0.024691,0.117284,...,0.0,0.061728,0.080247,0.049383,0.04321,0.030864,0.067901,0.08642,0.030864,0.024691


### drop sequence column

In [11]:
# Feature matrix: everything except the identifier, label, raw sequence and the old row index.
X = final_training.drop(columns=[' Sequence', 'ID', ' Type', 'index'])
# Target as a 1-D float32 Series. A one-column DataFrame would be a column vector,
# which makes scikit-learn's fit() emit a DataConversionWarning.
y = final_training[' Type'].astype(np.float32)

In [None]:
X.head()

In [None]:
y.head()

## split the training data set

In [None]:
# Fixed seed so the split (and every accuracy reported below) is reproducible;
# stratify keeps the class ratio the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
X_train.describe()

# Processing

## trying SVM

In [None]:
# fit() returns the estimator itself, so construction and training can be chained.
svm_classifier = SVC().fit(X_train, y_train)
svm_classifier

In [None]:
y_pred_svm = svm_classifier.predict(X_test)

## trying k-nearest neighbors

In [None]:
# NOTE(review): with an even number of neighbours a two-class vote can tie - consider an odd value.
knn_classifier = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
knn_classifier

In [None]:
y_pred_knn = knn_classifier.predict(X_test)

## trying naive bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training can be chained.
nb_classifier = GaussianNB().fit(X_train, y_train)
nb_classifier

In [None]:
y_pred_nb = nb_classifier.predict(X_test) 

# Post-Processing

In [None]:
# Fraction of held-out samples the SVM labelled correctly.
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f"SVM accuracy :{svm_accuracy}")

In [None]:
# Fraction of held-out samples Naive Bayes labelled correctly.
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(f"Naive Bayes accuracy :{nb_accuracy}")

In [None]:
# Fraction of held-out samples KNN labelled correctly.
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print(f"KNN accuracy :{knn_accuracy}")

### Working on test data

In [None]:
validation.head()

In [None]:
print(validation[' Sequence'].shape)

### Converting sequence to amino acid composition

In [None]:
# One composition dict per validation sequence, in row order.
# No temporary named `X` is used here: that name holds the training feature
# matrix, and rebinding it would break any later re-run of the split/fit cells.
seq_dictionary_valid = [
  ProteinAnalysis(sequence).get_amino_acids_percent()
  for sequence in validation[' Sequence']
]

# Turn the per-sequence dicts into per-residue columns.
merged_validation = merge_dicts(seq_dictionary_valid)
percent_comp_validation = pd.DataFrame(merged_validation)
percent_comp_validation.head()

In [None]:
# Place the composition columns next to the validation columns; rows are matched on the index.
final_validation = pd.concat(objs=[validation, percent_comp_validation], axis='columns')
print(final_validation.shape)
final_validation.head()

In [None]:
# Drop the raw sequence and the identifier; whatever remains is fed to the classifiers.
non_feature_columns = [' Sequence', 'ID']
X_val = final_validation.drop(columns=non_feature_columns)
X_val.head()

### Prediction generation

In [None]:
y_pred_svm_valid = svm_classifier.predict(X_val).astype(int)

### Saving the output file to csv

In [None]:
def get_label_csv(predictions,name_of_file):
  """Write the validation IDs next to `predictions` as a CSV file.

  Args:
    predictions: predicted labels, one per row of the global `final_validation`.
    name_of_file: path of the CSV file to create.

  The IDs are read from the module-level `final_validation` frame; the file has
  the columns 'ID' and 'Lable' and no index column.
  """
  id_column = final_validation['ID']
  # NOTE(review): 'Lable' looks like a misspelling of 'Label'; kept because the
  # header is part of the output format - confirm what the consumer expects.
  label_column = pd.DataFrame({'Lable':predictions})
  pd.concat([id_column, label_column], axis=1).to_csv(name_of_file, index=False)

In [None]:
get_label_csv(y_pred_svm_valid,'svm_result_valid.csv')

In [None]:
# The classifiers were trained on float labels, so predict() returns 1.0 / -1.0.
# Cast to int, as the SVM predictions are, so every result file uses 1 / -1.
y_pred_knn_valid = knn_classifier.predict(X_val).astype(int)

In [None]:
get_label_csv(y_pred_knn_valid,'knn_result_valid.csv')

In [None]:
# The classifiers were trained on float labels, so predict() returns 1.0 / -1.0.
# Cast to int, as the SVM predictions are, so every result file uses 1 / -1.
y_pred_nb_val = nb_classifier.predict(X_val).astype(int)

In [None]:
get_label_csv(y_pred_nb_val,'nb_result_valid.csv')