In [1]:
import numpy as np
import pandas as pd
from Bio import SeqIO

In [15]:
def read_fasta_file(fasta_file):
    sequence = ""

    for record in SeqIO.parse(fasta_file, "fasta"):
        sequence += str(record.seq)

    return sequence

In [16]:
def parse_gtf_file(gtf_file):
    exons = {}

    with open(gtf_file, 'r') as f:
        for line in f:
            if not line.startswith('#'):
                cols = line.strip().split('\t')
                feature_type = cols[2]
                if feature_type == 'exon':
                    chromosome = cols[0]
                    start = int(cols[3])
                    end = int(cols[4])
                    gene_id = cols[8].split(';')[0].split('"')[1]
                    if gene_id not in exons:
                        exons[gene_id] = []
                    exons[gene_id].append((start, end))

    return exons

In [17]:
fasta_0_Coli = './data/samples/sample_0/reference_genome_545778205.fasta'
fasta_0_Subtilis = './data/samples/sample_5/reference_genome_255767013.fasta'
fasta_1_Coli = './data/samples/sample_3/reference_genome_682117612.fasta'
fasta_1_Subtilis = './data/samples/sample_6/reference_genome_1678549200.fasta'

fasta_0_Coli = read_fasta_file(fasta_0_Coli)
fasta_0_Subtilis = read_fasta_file(fasta_0_Subtilis)
fasta_1_Coli = read_fasta_file(fasta_1_Coli)
fasta_1_Subtilis = read_fasta_file(fasta_1_Subtilis)

In [18]:
gtf_0_Coli = './data/samples/sample_0/545778205_gtf_merged.gtf'
gtf_0_Subtilis = './data/samples/sample_5/255767013_gtf_merged.gtf'
gtf_1_Coli = './data/samples/sample_3/682117612_gtf_merged.gtf'
gtf_1_Subtilis = './data/samples/sample_6/1678549200_gtf_merged.gtf'

gtf_0_Coli = parse_gtf_file(gtf_0_Coli)
gtf_0_Subtilis = parse_gtf_file(gtf_0_Subtilis)
gtf_1_Coli = parse_gtf_file(gtf_1_Coli)
gtf_1_Subtilis = parse_gtf_file(gtf_1_Subtilis)

In [19]:
n_pad = 300
n_annotate = 300
n_window = 900

dict_ = []

for sequence, exon in zip([fasta_0_Coli, fasta_1_Coli, fasta_0_Subtilis, fasta_1_Subtilis],
                           [gtf_0_Coli, gtf_1_Coli, gtf_0_Subtilis, gtf_1_Subtilis]):
    for j in range(0, len(sequence), 100):
        befor_start = j - n_pad
        annotate_start = j
        after_start = j + n_annotate + n_pad

        befor_end = j
        annotate_end = min(j + n_annotate, len(sequence))
        after_end = j + n_window

        window = []

        if befor_start < 0:
            window += ['N'] * abs(befor_start)
            window += sequence[0 : befor_end]
        else:
            window += sequence[befor_start : befor_end]

        window += sequence[annotate_start : annotate_end]

        if after_end > len(sequence):
            window += sequence[after_start : len(sequence)]
            window += ['N'] * abs(n_window - len(window))
        else:
            window += sequence[after_start : after_end]

        it = 0
        for k in range(annotate_start, annotate_end):
            it += 1 if any(start <= k < end for transcript_exons in exon.values() for start, end in transcript_exons) else 0

        label = 1 if it / n_annotate > 0.66 else 0

        iteration_dict = {
            'seq_overlap': ''.join(window),
            'seq_med': sequence[annotate_start : annotate_end],
            'label': label
        }

        dict_.append(iteration_dict)



In [21]:
df = pd.DataFrame(dict_)
df['label'].value_counts()

label
1    95587
0    81415
Name: count, dtype: int64

In [22]:
def fix_window(row):
    overlap_len = len(row['seq_overlap'])
    med_len = len(row['seq_med'])

    if overlap_len < 900:
        row['seq_overlap'] += 'N' * (900 - overlap_len)

    if med_len < 300:
        row['seq_med'] += 'N' * (300 - med_len)

    return row

df = df.apply(fix_window, axis = 1)

In [23]:
def dismiss_N(row):
    if 'N' in row['seq_overlap'] or 'N' in row['seq_med']:
        return False
    else:
        return True
    

df = df[df.apply(dismiss_N, axis=1)]
df['label'].value_counts()

label
1    95555
0    81399
Name: count, dtype: int64

In [24]:
df_label_0 = df[df['label'] == 0]
df_label_1 = df[df['label'] == 1]

df_label_0.to_csv('df_label_0.csv', index = False)
df_label_1.to_csv('df_label_1.csv', index = False)

In [18]:
def write_fasta(df, label, filename_prefix):
    with open(f'{filename_prefix}_{label}.fasta', 'w') as f:
        for idx, row in df.iterrows():
            sequence = row['seq_med']
            f.write(f'>BA000007.3_{idx}\n')
            f.write(f'{sequence}\n')


write_fasta(df_label_0, 0, 'class_0')
write_fasta(df_label_1, 1, 'class_1')

In [None]:
!python3 ../../Soft/MathFeature/preprocessing/preprocessing.py -i class_0_0.fasta -o class_0_0_preprocess.fasta
!python3 ../../Soft/MathFeature/preprocessing/preprocessing.py -i class_1_1.fasta -o class_1_1_preprocess.fasta

In [None]:
!python3 ../../Soft/MathFeature/methods/ExtractionTechniques.py -i class_0_0_preprocess.fasta -o class_0_feature_1.csv -l DNA -t DNC -seq 1
!python3 ../../Soft/MathFeature/methods/ExtractionTechniques.py -i class_1_1_preprocess.fasta -o class_1_feature_1.csv -l DNA -t DNC -seq 1

In [None]:
!python3 ../../Soft/MathFeature/methods/ExtractionTechniques.py -i class_0_0_preprocess.fasta -o class_0_feature_2.csv -l DNA -t TNC -seq 1
!python3 ../../Soft/MathFeature/methods/ExtractionTechniques.py -i class_1_1_preprocess.fasta -o class_1_feature_2.csv -l DNA -t TNC -seq 1

In [None]:
!python3 ../../Soft/MathFeature/methods/FourierClass.py -i class_0_0_preprocess.fasta -o class_0_feature_3.csv -l mRNA -r 3
!python3 ../../Soft/MathFeature/methods/FourierClass.py -i class_1_1_preprocess.fasta -o class_1_feature_3.csv -l mRNA -r 3

In [None]:
!python3 ../../Soft/MathFeature/methods/EntropyClass.py -i class_0_0_preprocess.fasta -o class_0_feature_4.csv -l mRNA -k 10 -e Shannon
!python3 ../../Soft/MathFeature/methods/EntropyClass.py -i class_1_1_preprocess.fasta -o class_1_feature_4.csv -l mRNA -k 10 -e Shannon

In [None]:
!python3 ../../Soft/MathFeature/methods/AccumulatedNucleotideFrequency.py -n 1 -o class_0_feature_5.csv -r 2

In [None]:
!python3 ../../Soft/MathFeature/methods/AccumulatedNucleotideFrequency.py -n 1 -o class_1_feature_5.csv -r 2

In [None]:
!python3 ../../Soft/MathFeature/methods/CodingClass.py -i class_0_0_preprocess.fasta -o class_0_feature_6.csv -l lncRNA
!python3 ../../Soft/MathFeature/methods/CodingClass.py -i class_1_1_preprocess.fasta -o class_1_feature_6.csv -l lncRNA

In [None]:
!python3 ../../Soft/MathFeature/methods/FickettScore.py -i class_0_0_preprocess.fasta -o class_0_feature_7.csv -l lncRNA -seq 1
!python3 ../../Soft/MathFeature/methods/FickettScore.py -i class_1_1_preprocess.fasta -o class_1_feature_7.csv -l lncRNA -seq 1

In [8]:
for i in range(1, 8): 
    for j in ['class_0_feature_', 'class_1_feature_']:
        df_temp = pd.read_csv(open(j + str(i) + '.csv', 'rb'))
        df_temp = df_temp.drop_duplicates(keep = 'first')
        df_temp.to_csv(j + str(i) + '.csv', index = False)

In [20]:
df_0 = pd.read_csv(open('df_label_0.csv', 'rb'))
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81399 entries, 0 to 81398
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   seq_overlap  81399 non-null  object
 1   seq_med      81399 non-null  object
 2   label        81399 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.9+ MB


In [22]:
df_add = pd.read_csv(open('class_0_feature_1.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81399 entries, 0 to 81398
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AA      81399 non-null  float64
 1   AC      81399 non-null  float64
 2   AG      81399 non-null  float64
 3   AT      81399 non-null  float64
 4   CA      81399 non-null  float64
 5   CC      81399 non-null  float64
 6   CG      81399 non-null  float64
 7   CT      81399 non-null  float64
 8   GA      81399 non-null  float64
 9   GC      81399 non-null  float64
 10  GG      81399 non-null  float64
 11  GT      81399 non-null  float64
 12  TA      81399 non-null  float64
 13  TC      81399 non-null  float64
 14  TG      81399 non-null  float64
 15  TT      81399 non-null  float64
dtypes: float64(16)
memory usage: 9.9 MB


In [24]:
df_0 = pd.concat([df_0, df_add], axis = 1)
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81399 entries, 0 to 81398
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   seq_overlap  81399 non-null  object 
 1   seq_med      81399 non-null  object 
 2   label        81399 non-null  int64  
 3   AA           81399 non-null  float64
 4   AC           81399 non-null  float64
 5   AG           81399 non-null  float64
 6   AT           81399 non-null  float64
 7   CA           81399 non-null  float64
 8   CC           81399 non-null  float64
 9   CG           81399 non-null  float64
 10  CT           81399 non-null  float64
 11  GA           81399 non-null  float64
 12  GC           81399 non-null  float64
 13  GG           81399 non-null  float64
 14  GT           81399 non-null  float64
 15  TA           81399 non-null  float64
 16  TC           81399 non-null  float64
 17  TG           81399 non-null  float64
 18  TT           81399 non-null  float64
dtypes: f

In [25]:
df_add = pd.read_csv(open('class_0_feature_2.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81399 entries, 0 to 81398
Data columns (total 64 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAA     81399 non-null  float64
 1   AAC     81399 non-null  float64
 2   AAG     81399 non-null  float64
 3   AAT     81399 non-null  float64
 4   ACA     81399 non-null  float64
 5   ACC     81399 non-null  float64
 6   ACG     81399 non-null  float64
 7   ACT     81399 non-null  float64
 8   AGA     81399 non-null  float64
 9   AGC     81399 non-null  float64
 10  AGG     81399 non-null  float64
 11  AGT     81399 non-null  float64
 12  ATA     81399 non-null  float64
 13  ATC     81399 non-null  float64
 14  ATG     81399 non-null  float64
 15  ATT     81399 non-null  float64
 16  CAA     81399 non-null  float64
 17  CAC     81399 non-null  float64
 18  CAG     81399 non-null  float64
 19  CAT     81399 non-null  float64
 20  CCA     81399 non-null  float64
 21  CCC     81399 non-null  float64
 22

In [26]:
df_0 = pd.concat([df_0, df_add], axis = 1)
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81399 entries, 0 to 81398
Data columns (total 83 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   seq_overlap  81399 non-null  object 
 1   seq_med      81399 non-null  object 
 2   label        81399 non-null  int64  
 3   AA           81399 non-null  float64
 4   AC           81399 non-null  float64
 5   AG           81399 non-null  float64
 6   AT           81399 non-null  float64
 7   CA           81399 non-null  float64
 8   CC           81399 non-null  float64
 9   CG           81399 non-null  float64
 10  CT           81399 non-null  float64
 11  GA           81399 non-null  float64
 12  GC           81399 non-null  float64
 13  GG           81399 non-null  float64
 14  GT           81399 non-null  float64
 15  TA           81399 non-null  float64
 16  TC           81399 non-null  float64
 17  TG           81399 non-null  float64
 18  TT           81399 non-null  float64
 19  AAA 

In [29]:
df_add = pd.read_csv(open('class_0_feature_3.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1).add_suffix('_Fourier')
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81399 entries, 0 to 81398
Data columns (total 19 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   average_Fourier                        81399 non-null  float64
 1   median_Fourier                         81399 non-null  float64
 2   maximum_Fourier                        81399 non-null  float64
 3   minimum_Fourier                        81399 non-null  float64
 4   peak_Fourier                           81399 non-null  float64
 5   none_levated_peak_Fourier              81399 non-null  float64
 6   sample_standard_deviation_Fourier      81399 non-null  float64
 7   population_standard_deviation_Fourier  81399 non-null  float64
 8   percentile15_Fourier                   81399 non-null  float64
 9   percentile25_Fourier                   81399 non-null  float64
 10  percentile50_Fourier                   81399 non-null  float64
 11  pe

In [30]:
df_0 = pd.concat([df_0, df_add], axis = 1)
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81399 entries, 0 to 81398
Columns: 102 entries, seq_overlap to kurtosis_Fourier
dtypes: float64(99), int64(1), object(2)
memory usage: 63.3+ MB


In [32]:
df_add = pd.read_csv(open('class_0_feature_4.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81399 entries, 0 to 81398
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   k1      81399 non-null  float64
 1   k2      81399 non-null  float64
 2   k3      81399 non-null  float64
 3   k4      81399 non-null  float64
 4   k5      81399 non-null  float64
 5   k6      81399 non-null  float64
 6   k7      81399 non-null  float64
 7   k8      81399 non-null  float64
 8   k9      81399 non-null  float64
 9   k10     81399 non-null  float64
dtypes: float64(10)
memory usage: 6.2 MB


In [33]:
df_0 = pd.concat([df_0, df_add], axis = 1)
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81399 entries, 0 to 81398
Columns: 112 entries, seq_overlap to k10
dtypes: float64(109), int64(1), object(2)
memory usage: 69.6+ MB


In [37]:
df_add = pd.read_csv(open('class_0_feature_5.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1).add_suffix("_Accumulated")
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81399 entries, 0 to 81398
Data columns (total 19 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   average_Accumulated                        81399 non-null  float64
 1   median_Accumulated                         81399 non-null  float64
 2   maximum_Accumulated                        81399 non-null  float64
 3   minimum_Accumulated                        81399 non-null  float64
 4   peak_Accumulated                           81399 non-null  float64
 5   none_levated_peak_Accumulated              81399 non-null  float64
 6   sample_standard_deviation_Accumulated      81399 non-null  float64
 7   population_standard_deviation_Accumulated  81399 non-null  float64
 8   percentile15_Accumulated                   81399 non-null  float64
 9   percentile25_Accumulated                   81399 non-null  float64
 10  percentile50_Accumulat

In [38]:
df_0 = pd.concat([df_0, df_add], axis = 1)
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81399 entries, 0 to 81398
Columns: 131 entries, seq_overlap to kurtosis_Accumulated
dtypes: float64(128), int64(1), object(2)
memory usage: 81.4+ MB


In [39]:
df_add = pd.read_csv(open('class_0_feature_6.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81399 entries, 0 to 81398
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   maximum_ORF_length      81399 non-null  int64  
 1   minimum_ORF_length      81399 non-null  int64  
 2   std_ORF_length          81399 non-null  float64
 3   average_ORF_length      81399 non-null  float64
 4   cv_ORF_length           81399 non-null  float64
 5   maximum_GC_content_ORF  81399 non-null  float64
 6   minimum_GC_content_ORF  81399 non-null  float64
 7   std_GC_content_ORF      81399 non-null  float64
 8   average_GC_content_ORF  81399 non-null  float64
 9   cv_GC_content_ORF       81399 non-null  float64
dtypes: float64(8), int64(2)
memory usage: 6.2 MB


In [40]:
df_0 = pd.concat([df_0, df_add], axis = 1)
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81399 entries, 0 to 81398
Columns: 141 entries, seq_overlap to cv_GC_content_ORF
dtypes: float64(136), int64(3), object(2)
memory usage: 87.6+ MB


In [41]:
df_add = pd.read_csv(open('class_0_feature_7.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81399 entries, 0 to 81398
Data columns (total 2 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   fickett_score-ORF            81399 non-null  float64
 1   fickett_score-full-sequence  81399 non-null  float64
dtypes: float64(2)
memory usage: 1.2 MB


In [42]:
df_0 = pd.concat([df_0, df_add], axis = 1)
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81399 entries, 0 to 81398
Columns: 143 entries, seq_overlap to fickett_score-full-sequence
dtypes: float64(138), int64(3), object(2)
memory usage: 88.8+ MB


In [43]:
df_0.to_csv("df_lable_0_merged.csv", index = False)

In [45]:
df_1 = pd.read_csv(open('df_label_1.csv', 'rb'))

df_add = pd.read_csv(open('class_1_feature_1.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_1 = pd.concat([df_1, df_add], axis = 1)

df_add = pd.read_csv(open('class_1_feature_2.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_1 = pd.concat([df_1, df_add], axis = 1)

df_add = pd.read_csv(open('class_1_feature_3.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1).add_suffix('_Fourier')
df_1 = pd.concat([df_1, df_add], axis = 1)

df_add = pd.read_csv(open('class_1_feature_4.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_1 = pd.concat([df_1, df_add], axis = 1)

df_add = pd.read_csv(open('class_1_feature_5.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1).add_suffix("_Accumulated")
df_1 = pd.concat([df_1, df_add], axis = 1)

df_add = pd.read_csv(open('class_1_feature_6.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_1 = pd.concat([df_1, df_add], axis = 1)

df_add = pd.read_csv(open('class_1_feature_7.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_1 = pd.concat([df_1, df_add], axis = 1)

df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95555 entries, 0 to 95554
Columns: 143 entries, seq_overlap to fickett_score-full-sequence
dtypes: float64(138), int64(3), object(2)
memory usage: 104.3+ MB


In [46]:
df_1.to_csv("df_lable_1_merged.csv", index = False)

In [50]:
df_all = pd.concat([df_0, df_1], axis = 0).reset_index(drop = True)
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176954 entries, 0 to 176953
Columns: 143 entries, seq_overlap to fickett_score-full-sequence
dtypes: float64(138), int64(3), object(2)
memory usage: 193.1+ MB


In [51]:
df_all.sample(frac = 1).to_csv("df_whole.csv", index = False)