In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [44]:
# Read the training and test splits from the local dataset/ directory.
train, test = map(pd.read_csv, ('dataset/train.csv', 'dataset/test.csv'))

In [45]:
# Show the training frame read from dataset/train.csv (the rendered output is a truncated preview).
train

Unnamed: 0,NCBIGeneID,Symbol,Description,GeneType,GeneGroupIdentifier,GeneGroupMethod,NucleotideSequence
0,124900924,LOC124900924,small nucleolar RNA U3,PSEUDO,,,<GCGAGGAGGAGGCACAGGGTTCTCCCCTGAGAGCCAGGCCAGCTC...
1,106479936,RNU6-837P,"RNA, U6 small nuclear 837, pseudogene",PSEUDO,,,<GTGTTCACTTCAGGGGTATACATAGTAAAATTGGAACAATACAAA...
2,100422964,MIR3150A,microRNA 3150a,ncRNA,,,<GGGAAGCAGGCCAACCTCGACGATCTCCTCAGCACCTGAACGCCA...
3,121740704,LOC121740704,Sharpr-MPRA regulatory region 12172,BIOLOGICAL_REGION,,,<CAGGGTCTGGGTGTCGTCGCCTAGCAGCTGCCCTTGGTAGATGAG...
4,724022,MIR652,microRNA 652,ncRNA,,,<ACGAATGGCTATGCACTGCACAACCCTAGGAGAGGGTGCCATTCA...
...,...,...,...,...,...,...,...
13726,100419750,DTWD1P2,DTWD1 pseudogene 2,PSEUDO,,,<GAAAGAAAGTATTTCAAATGTAGCAGTTCAAGAATGTTCTTCAAG...
13727,114004374,LOC114004374,Sharpr-MPRA regulatory region 15222,BIOLOGICAL_REGION,,,<GATAACTACAGGCCCCCTCTCTCCCCAGCAGGTTGACTAACCTAG...
13728,100126800,SNAR-A12,small NF90 (ILF3) associated RNA A12,snRNA,,,<CCGGAGCCATTGTGGCTCAGGCAGGTTGCGCCTGCCCTCGGGCCC...
13729,124906674,LOC124906674,putative uncharacterized protein FLJ92257,PROTEIN_CODING,,,<TCAGGCCCATGGGGCTCATTCCTCACAACGGCCTTTCCAGGCCCA...


In [46]:
# Count missing values per column in the training frame.
train.isna().sum()

NCBIGeneID                 0
Symbol                     0
Description                0
GeneType                   0
GeneGroupIdentifier    13459
GeneGroupMethod        13459
NucleotideSequence         0
dtype: int64

In [47]:
# Show the test frame read from dataset/test.csv (the rendered output is a truncated preview).
test

Unnamed: 0,NCBIGeneID,Symbol,Description,GeneType,GeneGroupIdentifier,GeneGroupMethod,NucleotideSequence
0,113748396,LOC113748396,Sharpr-MPRA regulatory region 1133,BIOLOGICAL_REGION,,,<CACTGCAGGCCGGAGCCCCCTGTTCCCCCGCATCCTCCCCGCCGT...
1,124900888,LOC124900888,U7 small nuclear RNA,PSEUDO,,,<AAAAGCAGCTCTTTCAGAATTTGTCTAGCAGGATTTCTAGTTTTC...
2,124174249,LOC124174249,Sharpr-MPRA regulatory region 5408,BIOLOGICAL_REGION,,,<TTGTGATTATGTTATTTTACATGGCAAAAGGGACTTTGCAGCGGT...
3,28685,TRAV8-1,T cell receptor alpha variable 8-1,OTHER,,,<ATGCTCCTGTTGCTCATACCAGTGCTGGGGATGATTTTTGCCCTG...
4,730061,RPL34P33,ribosomal protein L34 pseudogene 33,PSEUDO,,,<CTCTTCCGAGACATTGTCTGCAGGCACTCAGAATGGTCCAGCGTT...
...,...,...,...,...,...,...,...
4573,120893147,LOC120893147,Sharpr-MPRA regulatory region 2220,BIOLOGICAL_REGION,,,<AATTCTGTATGATTCTACTTTTATGAGGTACTCTATGAGTGGATA...
4574,28385,IGHV6-1,immunoglobulin heavy variable 6-1,OTHER,,,<ATGTCTGTCTCCTTCCTCATCTTCCTGCCCGTGCTGGGCCTCCCA...
4575,442160,RPL21P62,ribosomal protein L21 pseudogene 62,PSEUDO,,,<CCTTTCGGCGGGAACCGCCATCTTCCAGTAATTTGCCAAGATGAC...
4576,113839573,LOC113839573,Sharpr-MPRA regulatory region 4009,BIOLOGICAL_REGION,,,<GACATTTCCTTGAAACTGCTGGAGCTGAAAGTTTGTGAAATTCTG...


In [48]:
# Count missing values per column in the test frame.
test.isna().sum()

NCBIGeneID                0
Symbol                    0
Description               0
GeneType                  0
GeneGroupIdentifier    4491
GeneGroupMethod        4491
NucleotideSequence        0
dtype: int64

In [49]:
# GeneGroupIdentifier is null in 13459 training rows (see the null counts above),
# which is presumably why it is discarded.
# Non-mutating drop with errors='ignore' keeps this cell safe to re-run: the
# previous in-place drop raised KeyError on a second execution because the
# column was already gone.
train = train.drop(columns='GeneGroupIdentifier', errors='ignore')

In [50]:
# GeneGroupIdentifier is null in 4491 test rows (see the null counts above),
# which is presumably why it is discarded.
# Non-mutating drop with errors='ignore' keeps this cell safe to re-run: the
# previous in-place drop raised KeyError on a second execution because the
# column was already gone.
test = test.drop(columns='GeneGroupIdentifier', errors='ignore')

In [51]:
# Turn GeneGroupMethod into a 0/1 flag: 1 where the value is exactly
# 'NCBI Ortholog', 0 for everything else (including missing values).
# NOTE: not idempotent - once encoded, the column holds integers, so running
# this cell a second time maps every row to 0.
for frame in (train, test):
    is_ortholog = frame['GeneGroupMethod'].eq('NCBI Ortholog')
    frame['GeneGroupMethod'] = is_ortholog.astype('int64')

In [52]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.optimizers import Adam 

In [53]:
# Integer codes for the four canonical DNA bases, built once rather than on
# every call. Lowercase letters are included so soft-masked bases receive the
# same code as their uppercase form instead of falling through to "unknown".
_BASE_CODES = {
    'A': 0, 'C': 1, 'G': 2, 'T': 3,
    'a': 0, 'c': 1, 'g': 2, 't': 3,
}
# Code given to every symbol that is not A/C/G/T (for example 'N', or the '<'
# that prefixes the sequences in the previews above).
_UNKNOWN_BASE_CODE = 4


def encode_sequence(sequence):
    """Convert a nucleotide sequence into a list of integer codes.

    Args:
        sequence: Iterable of single-character symbols, typically a DNA string.

    Returns:
        list[int]: One code per symbol, in input order. A=0, C=1, G=2, T=3
        (case-insensitive); any other symbol is encoded as 4. An empty
        sequence yields an empty list.
    """
    return [_BASE_CODES.get(base, _UNKNOWN_BASE_CODE) for base in sequence]

In [54]:
# Add an EncodedSequence column (a list of integer codes per row) to both splits.
for frame in (train, test):
    frame['EncodedSequence'] = frame['NucleotideSequence'].map(encode_sequence)


In [55]:
# Pad every encoded sequence on the right ('post') up to the longest sequence
# found in either split, so both arrays share the same width.
train_lengths = train['EncodedSequence'].map(len)
test_lengths = test['EncodedSequence'].map(len)
max_sequence_length = max(train_lengths.max(), test_lengths.max())

# NOTE(review): pad_sequences fills with 0 by default, which is also the code
# used for base 'A', so padding cannot be told apart from real 'A' positions.
# Consider a dedicated pad value once the consuming model's vocabulary size is
# confirmed.
padding_options = dict(maxlen=max_sequence_length, padding='post')
train_padded_sequences = pad_sequences(train['EncodedSequence'], **padding_options)
test_padded_sequences = pad_sequences(test['EncodedSequence'], **padding_options)

In [56]:
# Columns that are label-encoded and used as the non-sequence features.
categorical_features = [
    'Symbol',
    'Description',
    'GeneType',
]

In [57]:
# Stack the categorical columns of both splits so each encoder fit sees every
# label that occurs in either train or test.
combined_data = pd.concat(
    [frame[categorical_features] for frame in (train, test)],
    axis=0,
)

# One LabelEncoder instance is refit per column, so after the loop `encoder`
# only remembers the classes of the last column in categorical_features.
# NOTE: not idempotent - the columns are overwritten with integer codes, so a
# second run would re-encode those integers (as strings) instead of the labels.
encoder = LabelEncoder()
for feature in categorical_features:
    encoder.fit(combined_data[feature].astype(str))
    for frame in (train, test):
        frame[feature] = encoder.transform(frame[feature].astype(str))


In [58]:
# Pull the label-encoded categorical columns out of each split as NumPy arrays.
X_train_other_features, X_test_other_features = (
    frame[categorical_features].to_numpy() for frame in (train, test)
)

In [59]:
# Standardise the non-sequence features. The scaler statistics are learned
# from the training matrix only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train_other_features)
X_train_other_features = scaler.transform(X_train_other_features)
X_test_other_features = scaler.transform(X_test_other_features)

In [60]:
# Inputs and targets are bound to the very same padded arrays (no copy is made),
# i.e. the target of each sequence is the sequence itself.
# NOTE(review): presumably intended for a reconstruction-style model - confirm
# against the model definition.
X_train_sequences = y_train_sequences = train_padded_sequences
X_test_sequences = y_test_sequences = test_padded_sequences