In [2]:
import numpy as np
import pandas as pd
from Bio import SeqIO

In [2]:
def read_fasta_file(fasta_file):
    sequence = ""

    for record in SeqIO.parse(fasta_file, "fasta"):
        sequence += str(record.seq)

    return sequence

In [3]:
def parse_gtf_file(gtf_file):
    exons = {}

    with open(gtf_file, 'r') as f:
        for line in f:
            if not line.startswith('#'):
                cols = line.strip().split('\t')
                feature_type = cols[2]
                if feature_type == 'exon':
                    chromosome = cols[0]
                    start = int(cols[3])
                    end = int(cols[4])
                    gene_id = cols[8].split(';')[0].split('"')[1]
                    if gene_id not in exons:
                        exons[gene_id] = []
                    exons[gene_id].append((start, end))

    return exons

In [4]:
fasta_0_Coli = './data/samples/sample_0/reference_genome_545778205.fasta'
fasta_0_Subtilis = './data/samples/sample_5/reference_genome_255767013.fasta'
fasta_1_Coli = './data/samples/sample_3/reference_genome_682117612.fasta'
fasta_1_Subtilis = './data/samples/sample_6/reference_genome_1678549200.fasta'

fasta_0_Coli = read_fasta_file(fasta_0_Coli)
fasta_0_Subtilis = read_fasta_file(fasta_0_Subtilis)
fasta_1_Coli = read_fasta_file(fasta_1_Coli)
fasta_1_Subtilis = read_fasta_file(fasta_1_Subtilis)

In [5]:
gtf_0_Coli = './data/samples/sample_0/545778205_gtf_merged.gtf'
gtf_0_Subtilis = './data/samples/sample_5/255767013_gtf_merged.gtf'
gtf_1_Coli = './data/samples/sample_3/682117612_gtf_merged.gtf'
gtf_1_Subtilis = './data/samples/sample_6/1678549200_gtf_merged.gtf'

gtf_0_Coli = parse_gtf_file(gtf_0_Coli)
gtf_0_Subtilis = parse_gtf_file(gtf_0_Subtilis)
gtf_1_Coli = parse_gtf_file(gtf_1_Coli)
gtf_1_Subtilis = parse_gtf_file(gtf_1_Subtilis)

In [18]:
n_pad = 300
n_annotate = 300
n_window = 900

dict_ = []

for sequence, exon in zip([fasta_0_Coli, fasta_1_Coli, fasta_0_Subtilis, fasta_1_Subtilis],
                           [gtf_0_Coli, gtf_1_Coli, gtf_0_Subtilis, gtf_1_Subtilis]):
    for j in range(0, len(sequence), 100):
        befor_start = j - n_pad
        annotate_start = j
        after_start = j + n_annotate + n_pad

        befor_end = j
        annotate_end = min(j + n_annotate, len(sequence))
        after_end = j + n_window

        window = []

        if befor_start < 0:
            window += ['N'] * abs(befor_start)
            window += sequence[0 : befor_end]
        else:
            window += sequence[befor_start : befor_end]

        window += sequence[annotate_start : annotate_end]

        if after_end > len(sequence):
            window += sequence[after_start : len(sequence)]
            window += ['N'] * abs(n_window - len(window))
        else:
            window += sequence[after_start : after_end]

        it = 0
        n_start = 0
        p_start = True
        n_end = 0
        p_end = True
        for k in range(annotate_start, annotate_end):
            exon_transcript = 1 if any(start - 1 <= k < end - 1 for transcript_exons in exon.values() for start, end in transcript_exons) else 0

            it += exon_transcript 
            if it > 0 and p_start:
                n_start = k - annotate_start
                p_start = False

            if exon_transcript and p_end:
                n_end = k - annotate_start

            if n_start and p_end and not exon_transcript:
                p_end = False

        label = 1 if it / n_annotate > 0.66 else 0
        n_start, n_end = (0, 0) if not label else (n_start, n_end)

        iteration_dict = {
            'seq_overlap': ''.join(window),
            'seq_med': sequence[annotate_start : annotate_end],
            'label': label,
            'n_start' : n_start,
            'n_end' : n_end
        }

        dict_.append(iteration_dict)



In [19]:
df = pd.DataFrame(dict_)
df['label'].value_counts()

label
1    95590
0    81412
Name: count, dtype: int64

In [20]:
def fix_window(row):
    overlap_len = len(row['seq_overlap'])
    med_len = len(row['seq_med'])

    if overlap_len < 900:
        row['seq_overlap'] += 'N' * (900 - overlap_len)

    if med_len < 300:
        row['seq_med'] += 'N' * (300 - med_len)

    return row

df = df.apply(fix_window, axis = 1)

In [21]:
def dismiss_N(row):
    if 'N' in row['seq_overlap'] or 'N' in row['seq_med']:
        return False
    else:
        return True
    

df = df[df.apply(dismiss_N, axis=1)]
df['label'].value_counts()

label
1    95558
0    81396
Name: count, dtype: int64

In [22]:
df_label_0 = df[df['label'] == 0]
df_label_1 = df[df['label'] == 1]

df_label_0.to_csv('df_label_0.csv', index = False)
df_label_1.to_csv('df_label_1.csv', index = False)

In [23]:
def write_fasta(df, label, filename_prefix):
    with open(f'{filename_prefix}_{label}.fasta', 'w') as f:
        for idx, row in df.iterrows():
            sequence = row['seq_med']
            f.write(f'>BA000007.3_{idx}\n')
            f.write(f'{sequence}\n')


write_fasta(df_label_0, 0, 'class_0')
write_fasta(df_label_1, 1, 'class_1')

In [None]:
!python3 ../../Soft/MathFeature/preprocessing/preprocessing.py -i class_0_0.fasta -o class_0_0_preprocess.fasta
!python3 ../../Soft/MathFeature/preprocessing/preprocessing.py -i class_1_1.fasta -o class_1_1_preprocess.fasta

In [None]:
!python3 ../../Soft/MathFeature/methods/ExtractionTechniques.py -i class_0_0_preprocess.fasta -o class_0_feature_1.csv -l DNA -t DNC -seq 1
!python3 ../../Soft/MathFeature/methods/ExtractionTechniques.py -i class_1_1_preprocess.fasta -o class_1_feature_1.csv -l DNA -t DNC -seq 1

In [None]:
!python3 ../../Soft/MathFeature/methods/ExtractionTechniques.py -i class_0_0_preprocess.fasta -o class_0_feature_2.csv -l DNA -t TNC -seq 1
!python3 ../../Soft/MathFeature/methods/ExtractionTechniques.py -i class_1_1_preprocess.fasta -o class_1_feature_2.csv -l DNA -t TNC -seq 1

In [None]:
!python3 ../../Soft/MathFeature/methods/FourierClass.py -i class_0_0_preprocess.fasta -o class_0_feature_3.csv -l mRNA -r 3
!python3 ../../Soft/MathFeature/methods/FourierClass.py -i class_1_1_preprocess.fasta -o class_1_feature_3.csv -l mRNA -r 3

In [None]:
!python3 ../../Soft/MathFeature/methods/EntropyClass.py -i class_0_0_preprocess.fasta -o class_0_feature_4.csv -l mRNA -k 10 -e Shannon
!python3 ../../Soft/MathFeature/methods/EntropyClass.py -i class_1_1_preprocess.fasta -o class_1_feature_4.csv -l mRNA -k 10 -e Shannon

In [None]:
!python3 ../../Soft/MathFeature/methods/AccumulatedNucleotideFrequency.py -n 1 -o class_0_feature_5.csv -r 2

In [None]:
!python3 ../../Soft/MathFeature/methods/AccumulatedNucleotideFrequency.py -n 1 -o class_1_feature_5.csv -r 2

In [None]:
!python3 ../../Soft/MathFeature/methods/CodingClass.py -i class_0_0_preprocess.fasta -o class_0_feature_6.csv -l lncRNA
!python3 ../../Soft/MathFeature/methods/CodingClass.py -i class_1_1_preprocess.fasta -o class_1_feature_6.csv -l lncRNA

In [None]:
!python3 ../../Soft/MathFeature/methods/FickettScore.py -i class_0_0_preprocess.fasta -o class_0_feature_7.csv -l lncRNA -seq 1
!python3 ../../Soft/MathFeature/methods/FickettScore.py -i class_1_1_preprocess.fasta -o class_1_feature_7.csv -l lncRNA -seq 1

In [4]:
for i in range(1, 8): 
    for j in ['class_0_feature_', 'class_1_feature_']:
        df_temp = pd.read_csv(open(j + str(i) + '.csv', 'rb'))
        df_temp = df_temp.drop_duplicates(keep = 'first')
        df_temp.to_csv(j + str(i) + '.csv', index = False)
        print(df_temp.duplicated().sum())

0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [5]:
df_0 = pd.read_csv(open('df_label_0.csv', 'rb'))
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81396 entries, 0 to 81395
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   seq_overlap  81396 non-null  object
 1   seq_med      81396 non-null  object
 2   label        81396 non-null  int64 
 3   n_start      81396 non-null  int64 
 4   n_end        81396 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 3.1+ MB


In [6]:
df_add = pd.read_csv(open('class_0_feature_1.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81396 entries, 0 to 81395
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AA      81396 non-null  float64
 1   AC      81396 non-null  float64
 2   AG      81396 non-null  float64
 3   AT      81396 non-null  float64
 4   CA      81396 non-null  float64
 5   CC      81396 non-null  float64
 6   CG      81396 non-null  float64
 7   CT      81396 non-null  float64
 8   GA      81396 non-null  float64
 9   GC      81396 non-null  float64
 10  GG      81396 non-null  float64
 11  GT      81396 non-null  float64
 12  TA      81396 non-null  float64
 13  TC      81396 non-null  float64
 14  TG      81396 non-null  float64
 15  TT      81396 non-null  float64
dtypes: float64(16)
memory usage: 9.9 MB


In [7]:
df_0 = pd.concat([df_0, df_add], axis = 1)
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81396 entries, 0 to 81395
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   seq_overlap  81396 non-null  object 
 1   seq_med      81396 non-null  object 
 2   label        81396 non-null  int64  
 3   n_start      81396 non-null  int64  
 4   n_end        81396 non-null  int64  
 5   AA           81396 non-null  float64
 6   AC           81396 non-null  float64
 7   AG           81396 non-null  float64
 8   AT           81396 non-null  float64
 9   CA           81396 non-null  float64
 10  CC           81396 non-null  float64
 11  CG           81396 non-null  float64
 12  CT           81396 non-null  float64
 13  GA           81396 non-null  float64
 14  GC           81396 non-null  float64
 15  GG           81396 non-null  float64
 16  GT           81396 non-null  float64
 17  TA           81396 non-null  float64
 18  TC           81396 non-null  float64
 19  TG  

In [8]:
df_add = pd.read_csv(open('class_0_feature_2.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81396 entries, 0 to 81395
Data columns (total 64 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAA     81396 non-null  float64
 1   AAC     81396 non-null  float64
 2   AAG     81396 non-null  float64
 3   AAT     81396 non-null  float64
 4   ACA     81396 non-null  float64
 5   ACC     81396 non-null  float64
 6   ACG     81396 non-null  float64
 7   ACT     81396 non-null  float64
 8   AGA     81396 non-null  float64
 9   AGC     81396 non-null  float64
 10  AGG     81396 non-null  float64
 11  AGT     81396 non-null  float64
 12  ATA     81396 non-null  float64
 13  ATC     81396 non-null  float64
 14  ATG     81396 non-null  float64
 15  ATT     81396 non-null  float64
 16  CAA     81396 non-null  float64
 17  CAC     81396 non-null  float64
 18  CAG     81396 non-null  float64
 19  CAT     81396 non-null  float64
 20  CCA     81396 non-null  float64
 21  CCC     81396 non-null  float64
 22

In [9]:
df_0 = pd.concat([df_0, df_add], axis = 1)
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81396 entries, 0 to 81395
Data columns (total 85 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   seq_overlap  81396 non-null  object 
 1   seq_med      81396 non-null  object 
 2   label        81396 non-null  int64  
 3   n_start      81396 non-null  int64  
 4   n_end        81396 non-null  int64  
 5   AA           81396 non-null  float64
 6   AC           81396 non-null  float64
 7   AG           81396 non-null  float64
 8   AT           81396 non-null  float64
 9   CA           81396 non-null  float64
 10  CC           81396 non-null  float64
 11  CG           81396 non-null  float64
 12  CT           81396 non-null  float64
 13  GA           81396 non-null  float64
 14  GC           81396 non-null  float64
 15  GG           81396 non-null  float64
 16  GT           81396 non-null  float64
 17  TA           81396 non-null  float64
 18  TC           81396 non-null  float64
 19  TG  

In [10]:
df_add = pd.read_csv(open('class_0_feature_3.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1).add_suffix('_Fourier')
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81396 entries, 0 to 81395
Data columns (total 19 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   average_Fourier                        81396 non-null  float64
 1   median_Fourier                         81396 non-null  float64
 2   maximum_Fourier                        81396 non-null  float64
 3   minimum_Fourier                        81396 non-null  float64
 4   peak_Fourier                           81396 non-null  float64
 5   none_levated_peak_Fourier              81396 non-null  float64
 6   sample_standard_deviation_Fourier      81396 non-null  float64
 7   population_standard_deviation_Fourier  81396 non-null  float64
 8   percentile15_Fourier                   81396 non-null  float64
 9   percentile25_Fourier                   81396 non-null  float64
 10  percentile50_Fourier                   81396 non-null  float64
 11  pe

In [11]:
df_0 = pd.concat([df_0, df_add], axis = 1)
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81396 entries, 0 to 81395
Columns: 104 entries, seq_overlap to kurtosis_Fourier
dtypes: float64(99), int64(3), object(2)
memory usage: 64.6+ MB


In [12]:
df_add = pd.read_csv(open('class_0_feature_4.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81396 entries, 0 to 81395
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   k1      81396 non-null  float64
 1   k2      81396 non-null  float64
 2   k3      81396 non-null  float64
 3   k4      81396 non-null  float64
 4   k5      81396 non-null  float64
 5   k6      81396 non-null  float64
 6   k7      81396 non-null  float64
 7   k8      81396 non-null  float64
 8   k9      81396 non-null  float64
 9   k10     81396 non-null  float64
dtypes: float64(10)
memory usage: 6.2 MB


In [13]:
df_0 = pd.concat([df_0, df_add], axis = 1)
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81396 entries, 0 to 81395
Columns: 114 entries, seq_overlap to k10
dtypes: float64(109), int64(3), object(2)
memory usage: 70.8+ MB


In [14]:
df_add = pd.read_csv(open('class_0_feature_5.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1).add_suffix("_Accumulated")
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81396 entries, 0 to 81395
Data columns (total 19 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   average_Accumulated                        81396 non-null  float64
 1   median_Accumulated                         81396 non-null  float64
 2   maximum_Accumulated                        81396 non-null  float64
 3   minimum_Accumulated                        81396 non-null  float64
 4   peak_Accumulated                           81396 non-null  float64
 5   none_levated_peak_Accumulated              81396 non-null  float64
 6   sample_standard_deviation_Accumulated      81396 non-null  float64
 7   population_standard_deviation_Accumulated  81396 non-null  float64
 8   percentile15_Accumulated                   81396 non-null  float64
 9   percentile25_Accumulated                   81396 non-null  float64
 10  percentile50_Accumulat

In [15]:
df_0 = pd.concat([df_0, df_add], axis = 1)
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81396 entries, 0 to 81395
Columns: 133 entries, seq_overlap to kurtosis_Accumulated
dtypes: float64(128), int64(3), object(2)
memory usage: 82.6+ MB


In [16]:
df_add = pd.read_csv(open('class_0_feature_6.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81396 entries, 0 to 81395
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   maximum_ORF_length      81396 non-null  int64  
 1   minimum_ORF_length      81396 non-null  int64  
 2   std_ORF_length          81396 non-null  float64
 3   average_ORF_length      81396 non-null  float64
 4   cv_ORF_length           81396 non-null  float64
 5   maximum_GC_content_ORF  81396 non-null  float64
 6   minimum_GC_content_ORF  81396 non-null  float64
 7   std_GC_content_ORF      81396 non-null  float64
 8   average_GC_content_ORF  81396 non-null  float64
 9   cv_GC_content_ORF       81396 non-null  float64
dtypes: float64(8), int64(2)
memory usage: 6.2 MB


In [17]:
df_0 = pd.concat([df_0, df_add], axis = 1)
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81396 entries, 0 to 81395
Columns: 143 entries, seq_overlap to cv_GC_content_ORF
dtypes: float64(136), int64(5), object(2)
memory usage: 88.8+ MB


In [18]:
df_add = pd.read_csv(open('class_0_feature_7.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81396 entries, 0 to 81395
Data columns (total 2 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   fickett_score-ORF            81396 non-null  float64
 1   fickett_score-full-sequence  81396 non-null  float64
dtypes: float64(2)
memory usage: 1.2 MB


In [19]:
df_0 = pd.concat([df_0, df_add], axis = 1)
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81396 entries, 0 to 81395
Columns: 145 entries, seq_overlap to fickett_score-full-sequence
dtypes: float64(138), int64(5), object(2)
memory usage: 90.0+ MB


In [20]:
df_0.to_csv("df_lable_0_merged.csv", index = False)

In [21]:
df_1 = pd.read_csv(open('df_label_1.csv', 'rb'))

df_add = pd.read_csv(open('class_1_feature_1.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_1 = pd.concat([df_1, df_add], axis = 1)

df_add = pd.read_csv(open('class_1_feature_2.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_1 = pd.concat([df_1, df_add], axis = 1)

df_add = pd.read_csv(open('class_1_feature_3.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1).add_suffix('_Fourier')
df_1 = pd.concat([df_1, df_add], axis = 1)

df_add = pd.read_csv(open('class_1_feature_4.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_1 = pd.concat([df_1, df_add], axis = 1)

df_add = pd.read_csv(open('class_1_feature_5.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1).add_suffix("_Accumulated")
df_1 = pd.concat([df_1, df_add], axis = 1)

df_add = pd.read_csv(open('class_1_feature_6.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_1 = pd.concat([df_1, df_add], axis = 1)

df_add = pd.read_csv(open('class_1_feature_7.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df_1 = pd.concat([df_1, df_add], axis = 1)

df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95558 entries, 0 to 95557
Columns: 145 entries, seq_overlap to fickett_score-full-sequence
dtypes: float64(138), int64(5), object(2)
memory usage: 105.7+ MB


In [22]:
df_1.to_csv("df_lable_1_merged.csv", index = False)

In [23]:
df_all = pd.concat([df_0, df_1], axis = 0).reset_index(drop = True)
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176954 entries, 0 to 176953
Columns: 145 entries, seq_overlap to fickett_score-full-sequence
dtypes: float64(138), int64(5), object(2)
memory usage: 195.8+ MB


In [24]:
df_all.sample(frac = 1).to_csv("df_whole.csv", index = False)