In [137]:
import pandas as pd
import seaborn as sb
import numpy as np
import warnings
warnings.simplefilter("ignore")
import sklearn.metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
import importlib.util
from sklearn.metrics import confusion_matrix
from collections import defaultdict

In [138]:
dataset=pd.read_csv("newHIV-1_data/1625Data.txt", delimiter=",",names=["Peptides", "Result"])
dataset.head()

Unnamed: 0,Peptides,Result
0,SLNLRETN,1
1,AECFRIFD,1
2,HLVEALYL,1
3,TQIMFETF,1
4,AEELAEIF,1


In [139]:
dataset.shape

(1625, 2)

In [140]:
dataset.columns

Index(['Peptides', 'Result'], dtype='object')

In [141]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Result,1625.0,-0.538462,0.842909,-1.0,-1.0,-1.0,-1.0,1.0


In [142]:
dataset.sample()

Unnamed: 0,Peptides,Result
1411,NVLPQGWK,-1


In [143]:
dataset.isnull().sum()

Peptides    0
Result      0
dtype: int64

In [144]:
# Seperate all amino acids
peptides = np.array([[dataset["Peptides"][i][j] for i in range(dataset.shape[0])] for j in range(8)])
peptides.shape

(8, 1625)

In [145]:
# Store the seperated amino acids into a dataframe
dataset1= pd.DataFrame(peptides.T, columns=list('ABCDEFGH'))
dataset1.shape

(1625, 8)

In [146]:
# dataset = dataset.join(dataset_One)
dataset = pd.concat([dataset, dataset1], axis=1)
dataset.head()

Unnamed: 0,Peptides,Result,A,B,C,D,E,F,G,H
0,SLNLRETN,1,S,L,N,L,R,E,T,N
1,AECFRIFD,1,A,E,C,F,R,I,F,D
2,HLVEALYL,1,H,L,V,E,A,L,Y,L
3,TQIMFETF,1,T,Q,I,M,F,E,T,F
4,AEELAEIF,1,A,E,E,L,A,E,I,F


In [147]:
dataset = dataset.drop(columns="Peptides", axis=0)
dataset.head()

Unnamed: 0,Result,A,B,C,D,E,F,G,H
0,1,S,L,N,L,R,E,T,N
1,1,A,E,C,F,R,I,F,D
2,1,H,L,V,E,A,L,Y,L
3,1,T,Q,I,M,F,E,T,F
4,1,A,E,E,L,A,E,I,F


In [148]:
# rearrange col dataset
dataset = dataset[['A','B','C','D','E','F','G','H','Result']]
dataset.head()

Unnamed: 0,A,B,C,D,E,F,G,H,Result
0,S,L,N,L,R,E,T,N,1
1,A,E,C,F,R,I,F,D,1
2,H,L,V,E,A,L,Y,L,1
3,T,Q,I,M,F,E,T,F,1
4,A,E,E,L,A,E,I,F,1


In [149]:
print("dataset has {} rows and {} Columns".format(dataset.shape[0],dataset.shape[1]))

dataset has 1625 rows and 9 Columns


In [150]:
dataset.head()

Unnamed: 0,A,B,C,D,E,F,G,H,Result
0,S,L,N,L,R,E,T,N,1
1,A,E,C,F,R,I,F,D,1
2,H,L,V,E,A,L,Y,L,1
3,T,Q,I,M,F,E,T,F,1
4,A,E,E,L,A,E,I,F,1


In [151]:
#separate dataset into x_train and y_train
from sklearn.model_selection import train_test_split
train,test=train_test_split(dataset,test_size=0.20,random_state=0)

In [152]:
train.shape,test.shape

((1300, 9), (325, 9))

In [153]:
test.to_csv('test.csv',index=False,encoding='utf-8')

In [154]:
#split the train data into train and cross validation
train_data,cv_data=train_test_split(train,test_size=0.20,random_state=0)

In [155]:
train_data.shape,cv_data.shape

((1040, 9), (260, 9))

In [156]:
x_train=train_data.iloc[:,:-1].values
y_train=train_data.iloc[:,8].values

In [157]:
x_train.shape,y_train.shape

((1040, 8), (1040,))

In [158]:
x_cv=cv_data.iloc[:,:-1].values
y_cv=cv_data.iloc[:,8].values

In [159]:
x_cv.shape,y_cv.shape

((260, 8), (260,))

In [160]:
type(x_cv)

numpy.ndarray

In [161]:
x_train = pd.DataFrame(x_train)
type(x_train)

pandas.core.frame.DataFrame

In [162]:
x_cv=pd.DataFrame(x_cv)

In [163]:
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
class DecisionTree:

  
    def onehotcode(self,x_data):
        # from collections import defaultdict
        d = defaultdict(LabelEncoder)
        # Encoding the variable
        fit = x_data.apply(lambda x: d[x.name].fit_transform(x))
        # Inverse the encoded
        fit.apply(lambda x: d[x.name].inverse_transform(x))
        # Using the dictionary to label future data
        x_data.apply(lambda x: d[x.name].transform(x))
        one_hot_encode = OneHotEncoder()
        one_hot_encode.fit(x_data)
        x_data=one_hot_encode.transform(x_data).toarray()
        return x_data
    
    def label_encoding(sel,y_train):
        #Encoding the dependent variable
        labelencoder_y=LabelEncoder()
        # print(labelencoder_y)
        y_train=labelencoder_y.fit_transform(y_train)
        # print(y_train)
        return y_train
    
    def feature_scaling(self,dataset):
        #feature scaling
        sc=StandardScaler()
        x_train=sc.fit_transform(dataset)
        return x_train,sc


In [164]:
obj=DecisionTree()
dataset=obj.onehotcode(x_train)
# print(dataset)
y_train=obj.label_encoding(y_train)
# print("label data on y_train",y_train)
x_train,sc_train=obj.feature_scaling(dataset)
# print(x_train)

In [165]:
x_cv=obj.onehotcode(x_cv)
# print(x_cv)
y_cv=obj.label_encoding(y_cv)
# print("label data on y_cv data",y_cv)
x_cv,sc_cv=obj.feature_scaling(x_cv)
# print(x_train)

In [166]:
#fitting the Desicion Tree to the training set
from sklearn.tree import DecisionTreeClassifier
classifier=DecisionTreeClassifier(criterion='entropy',random_state=0)
classifier.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

In [167]:
#predicting the train set result
y_pred=classifier.predict(x_train)
dataset=pd.DataFrame({'Actual':y_train,'Predicted':y_pred})
dataset.head()

Unnamed: 0,Actual,Predicted
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [168]:
#predicting the cross validation result
y_predict=classifier.predict(x_cv)
dataset=pd.DataFrame({'Actual':y_cv,'Predicted':y_predict})
dataset.head()

Unnamed: 0,Actual,Predicted
0,1,1
1,0,0
2,0,0
3,0,0
4,0,0


In [169]:
#making the confusion matrix
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_train,y_pred)
cm

array([[785,   0],
       [  0, 255]])

In [170]:
#check accuracy for X train data
Acc_Train=sklearn.metrics.precision_score(y_pred,y_train)*100
print("accurancy for train data:=",Acc_Train)

accurancy for train data:= 100.0


In [171]:
#check accuracy for x_cv data
Acc_cv=sklearn.metrics.accuracy_score(y_predict,y_cv)*100
print("accurancy for cross validation:=",Acc_cv)

accurancy for cross validation:= 90.38461538461539


In [172]:
# create model with pickle file
import pickle
file = open('training.pkl', 'wb')
pickle.dump(classifier,file)
pickle.dump(sc_train,file)
file.close()