# AUTOMATED GENE FINDER
# Introduction to Bioinformatics

# By:
#    Muhammad Hamza
#    P15-6148

# To:
#    Dr. Hafeez
    


## LIBRARIES USED

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import keras
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import Flatten
from keras.layers import LeakyReLU
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

## ENCODING FUNCTION 

In [3]:
def encode(data, num_classes=5):
    """One-hot encode integer class labels, reporting the shape before and after.

    Parameters
    ----------
    data : array-like of int
        Integer class indices; must expose a ``.shape`` attribute because the
        shape is printed before encoding.
    num_classes : int, optional
        Width of the one-hot axis that is appended. Defaults to 5, the value
        that was previously hard-coded, so existing ``encode(data)`` calls
        behave exactly as before.

    Returns
    -------
    The array produced by ``to_categorical(data, num_classes)``.
    """
    print('Shape of data (BEFORE encode): %s' % str(data.shape))
    encoded = to_categorical(data, num_classes)
    print('Shape of data (AFTER  encode): %s\n' % str(encoded.shape))
    return encoded

## IMPORTING DATA

In [4]:
# Load the raw dataset with read_csv defaults.
# NOTE(review): the path is an absolute, machine-specific Windows location;
# confirm it exists (or point it at a configurable data directory) before re-running.
data = pd.read_csv('E:\\data')
# Overwrite whatever column labels read_csv produced with the three names
# used throughout the rest of the notebook.
data.columns = ['ind', 'seq', 'outputClass']
# Last expression of the cell: displays the resulting column labels.
data.columns.values


array(['ind', 'seq', 'outputClass'], dtype=object)

# PREPROCESSING

In [5]:
# The 'ind' column is not used afterwards; keep only the sequence and its label,
# then show a preview of the rows and the remaining column names.
data = data.drop(columns=['ind'])
print(data.head(), data.columns.values, sep='\n')

                                                 seq  outputClass
0  CGCCTGTAATCCCAGCACTCTGGGAGGCAGAGGTGGGCCGATCACT...            1
1  GGCAGACTCCCAATCCTTATTCATTGGGTAAATGGAGAGAACAGTG...            1
2  ACACTTCCGTTTCCGGTCCGTGCCCTTGGGGCTCCGTGTCCTGCTG...            1
3  GGCTCCCACACCACTGCCTCGTGTGGGGTTGTTCGCCCGTGAAGGG...            1
4  AGACCGCGGTGACGTCTCCACCGCGCCAAACTCACTGAAAATCAAA...            1
['seq' 'outputClass']


## LENGTH OF LONGEST SEQUENCE

In [6]:
# Length of every sequence string, reduced to the largest one.
data['seq'].map(len).max()


29980

## SEPARATING FEATURES AND OUTPUT CLASS

In [7]:
# X holds the sequence column (model input), Y the class-label column (target).
X, Y = data['seq'], data['outputClass']
print(Y.head(), X.head(), sep='\n')


0    1
1    1
2    1
3    1
4    1
Name: outputClass, dtype: int64
0    CGCCTGTAATCCCAGCACTCTGGGAGGCAGAGGTGGGCCGATCACT...
1    GGCAGACTCCCAATCCTTATTCATTGGGTAAATGGAGAGAACAGTG...
2    ACACTTCCGTTTCCGGTCCGTGCCCTTGGGGCTCCGTGTCCTGCTG...
3    GGCTCCCACACCACTGCCTCGTGTGGGGTTGTTCGCCCGTGAAGGG...
4    AGACCGCGGTGACGTCTCCACCGCGCCAAACTCACTGAAAATCAAA...
Name: seq, dtype: object


## SPLITTING DATA INTO TRAINING AND TESTING SETS

In [8]:
# Hold out 20% of the rows for testing. The fixed random_state makes the shuffle,
# and therefore every downstream preview and training run, repeatable after a
# kernel restart; without it each run produced a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [9]:
# Show the first rows of each split (features, then labels) followed by the
# number of samples in that split.
for title, features, labels in (
    ("X_train", X_train, y_train),
    ("X_test", X_test, y_test),
):
    print("\n%s:\n" % title)
    print(features.head())
    print(labels.head())

    print(features.shape)


X_train:

391     GGATGGCTCTGAAGTGGACTTCAGTTCTTCTGCTGATACATCTCGG...
5784    GCGGAGCGGTGCACGCCGGGAGTTGCAGTTCCCGGGCGAGGCAGTT...
6478    GGCGGAGAGGGCTGAGGGCGGGTCGCTTCCGGCGGCTTTTCCCGCA...
3756    CCCGCCTCCCTAGGCGTGGAGGAGGGGGGGCGGCTCAGCCCCGCGC...
6374    GATTGGCCCCTGGGCCAGGGGCGGGGCACTCGCGGCGGAGGCAAGC...
Name: seq, dtype: object
391     1
5784    0
6478    0
3756    0
6374    0
Name: outputClass, dtype: int64
(5246,)

X_test:

3506    CCCTCCCCTGCTCTCGCAAGGAGAAAGCGGGCGACGAGCGCTCGCA...
4008    CTCCTTCCTGCAAGAAGCGTTGCCCGTTGGCTAGCTGCTCGGTGGG...
2510    CAGGAACCGCGGCTGCTGGACAAGAGGGGTGCGGTGGATACTGACC...
506     TCTTCACGTCCCAGCGCGGGTGGGCGCCGGCGGCTCCTCTTAACCA...
3647    ACAGACTGCCAAATGGAACAGACAAGCAGGTTGTCTTGGTAAGCAA...
Name: seq, dtype: object
3506    0
4008    0
2510    1
506     1
3647    0
Name: outputClass, dtype: int64
(1312,)


In [10]:
# Turn every sequence string into a list of its individual characters.
# Series.map builds a new Series in one pass, replacing the label-by-label
# assignment loop; X_train / X_test are rebound to the new objects, so the
# Series they were split from is never written through.
X_train = X_train.map(list)
X_test = X_test.map(list)
    

In [11]:
# Preview the first rows of both splits, training set first.
print(X_train.head(), X_test.head(), sep='\n')


391     [G, G, A, T, G, G, C, T, C, T, G, A, A, G, T, ...
5784    [G, C, G, G, A, G, C, G, G, T, G, C, A, C, G, ...
6478    [G, G, C, G, G, A, G, A, G, G, G, C, T, G, A, ...
3756    [C, C, C, G, C, C, T, C, C, C, T, A, G, G, C, ...
6374    [G, A, T, T, G, G, C, C, C, C, T, G, G, G, C, ...
Name: seq, dtype: object
3506    [C, C, C, T, C, C, C, C, T, G, C, T, C, T, C, ...
4008    [C, T, C, C, T, T, C, C, T, G, C, A, A, G, A, ...
2510    [C, A, G, G, A, A, C, C, G, C, G, G, C, T, G, ...
506     [T, C, T, T, C, A, C, G, T, C, C, C, A, G, C, ...
3647    [A, C, A, G, A, C, T, G, C, C, A, A, A, T, G, ...
Name: seq, dtype: object


# ONE HOT VECTOR ENCODING 

In [18]:
def one_hot_encode_sequences(sequences, alphabet):
    """One-hot encode every sequence against a single shared alphabet.

    Fitting a separate encoder on each sequence makes both the number of
    columns and the meaning of each column depend on which symbols happen to
    occur in that sequence. Using one alphabet for all sequences guarantees
    that column ``j`` always stands for ``alphabet[j]``.

    Parameters
    ----------
    sequences : pandas.Series
        Each element is an iterable of single symbols (a string or a list of
        characters).
    alphabet : sequence of str
        Ordered, distinct symbols that may occur in the sequences.

    Returns
    -------
    pandas.Series
        Same index and name as ``sequences``; element ``i`` is a float array of
        shape ``(len(sequences[i]), len(alphabet))``.

    Raises
    ------
    KeyError
        If a sequence contains a symbol that is not in ``alphabet``.
    """
    column_of = {symbol: column for column, symbol in enumerate(alphabet)}
    identity = np.eye(len(alphabet))
    # An explicit object array keeps each matrix as one element even when all
    # matrices happen to share the same shape.
    encoded = np.empty(len(sequences), dtype=object)
    for position, sequence in enumerate(sequences):
        columns = np.fromiter(
            (column_of[symbol] for symbol in sequence), dtype=np.intp
        )
        # Selecting rows of the identity matrix yields the one-hot rows.
        encoded[position] = identity[columns]
    return pd.Series(encoded, index=sequences.index, name=sequences.name)


# Build the alphabet once from every symbol seen in either split. Sorting gives
# the same column order a LabelEncoder assigns when all symbols are present.
alphabet = sorted(set().union(*X_train, *X_test))

X_test = one_hot_encode_sequences(X_test, alphabet)
X_train = one_hot_encode_sequences(X_train, alphabet)

In [17]:
# Preview the first rows of X_test.
print(X_test.head())

3506    [[0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0....
4008    [[0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0....
2510    [[0.0, 1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0....
506     [[0.0, 0.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0....
3647    [[1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0....
Name: seq, dtype: object


In [14]:
def pad_and_stack(encoded, max_len=None, dtype=np.float32):
    """Zero-pad per-sequence one-hot matrices and stack them into one 3-D array.

    Parameters
    ----------
    encoded : iterable of 2-D array-like
        One ``(sequence_length, n_symbols)`` matrix per sample.
    max_len : int, optional
        Length of the second axis. Longer sequences are truncated and shorter
        ones are padded with all-zero rows. Defaults to the longest sequence
        in ``encoded``.
    dtype : numpy dtype, optional
        Element type of the result. float32 needs half the memory of the
        float64 matrices; pass a smaller ``max_len`` if memory is still tight.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, max_len, n_symbols)``. ``n_symbols`` is
        the widest matrix found; narrower matrices are zero-padded on the
        right.
        NOTE(review): this assumes column j means the same symbol in every
        matrix -- verify against the upstream encoding step.
    """
    matrices = [np.asarray(matrix) for matrix in encoded]
    if max_len is None:
        max_len = max((matrix.shape[0] for matrix in matrices), default=0)
    width = max((matrix.shape[1] for matrix in matrices), default=0)

    stacked = np.zeros((len(matrices), max_len, width), dtype=dtype)
    for row, matrix in enumerate(matrices):
        length = min(matrix.shape[0], max_len)
        stacked[row, :length, :matrix.shape[1]] = matrix[:length]
    return stacked


# Replace the Series of variable-length matrices with a single dense tensor
# whose dimensions are derived from the data rather than hard-coded.
X_train = pad_and_stack(X_train)
X_train.shape

MemoryError: 

# NEURAL NETWORK MODEL

In [16]:
# Feed-forward binary classifier over one-hot encoded sequences.
model = Sequential()
# Dense layers act only on the last axis, so a (length, n_symbols) sample must
# be collapsed into one feature vector first; otherwise the network emits one
# prediction per sequence position instead of one per sequence.
model.add(Flatten())
model.add(Dense(units=100))
model.add(LeakyReLU(alpha=0.1))
model.add(Dense(units=75))
model.add(LeakyReLU(alpha=0.1))
model.add(Dense(units=50))
model.add(LeakyReLU(alpha=0.1))
# NOTE(review): this layer has no activation, unlike the hidden layers above;
# confirm that is intentional.
model.add(Dense(units=25))
# Single sigmoid unit: output in (0, 1) for the two-valued outputClass label.
model.add(Dense(units=1, activation='sigmoid'))



In [17]:

# Configure training: mean-squared-error loss, the SGD optimizer, and accuracy
# reported as an additional metric.
# NOTE(review): with a single sigmoid output, 'binary_crossentropy' is the more
# common loss choice -- consider whether MSE is intentional.
model.compile(loss='mean_squared_error',optimizer='sgd',metrics=['accuracy'])


In [15]:
# Train on the training split for 5 epochs using mini-batches of 32 samples.
model.fit(X_train, y_train, epochs=5, batch_size=32)