In [3]:
import numpy as np
import pandas as pd
from Bio import SeqIO

In [4]:
def read_fasta_file(fasta_file):
    sequence = ""

    for record in SeqIO.parse(fasta_file, "fasta"):
        sequence += str(record.seq)

    return sequence

In [10]:
def parse_gtf_file(gtf_file):
    exons = {}

    with open(gtf_file, 'r') as f:
        for line in f:
            if not line.startswith('#'):
                cols = line.strip().split('\t')
                feature_type = cols[2]
                if feature_type == 'exon':
                    chromosome = cols[0]
                    start = int(cols[3])
                    end = int(cols[4])
                    gene_id = cols[8].split(';')[0].split('"')[1]
                    if gene_id not in exons:
                        exons[gene_id] = []
                    exons[gene_id].append((start, end))

    return exons

In [9]:
fasta_Coli = './data/samples/sample_0/reference_genome_545778205.fasta'
fasta_Subtilis = './data/samples/sample_5/reference_genome_255767013.fasta'

fasta_Coli = read_fasta_file(fasta_Coli)
fasta_Subtilis = read_fasta_file(fasta_Subtilis)

In [16]:
gtf_Coli = './data/samples/sample_0/545778205_gtf_merged.gtf'
gtf_Subtilis = './data/samples/sample_5/255767013_gtf_merged.gtf'

gtf_Coli = parse_gtf_file(gtf_Coli)
gtf_Subtilis = parse_gtf_file(gtf_Subtilis)

In [72]:
n_pad = 300
n_annotate = 300
n_window = 900

dict_ = []

for sequence, exon in zip([fasta_Coli, fasta_Subtilis], [gtf_Coli, gtf_Subtilis]):
    for j in range(0, len(sequence), 150):
        befor_start = j - n_pad
        annotate_start = j
        after_start = j + n_annotate + n_pad

        befor_end = j
        annotate_end = min(j + n_annotate, len(sequence))
        after_end = j + n_window

        window = []

        if befor_start < 0:
            window += ['N'] * abs(befor_start)
            window += sequence[0 : befor_end]
        else:
            window += sequence[befor_start : befor_end]

        window += sequence[annotate_start : annotate_end]

        if after_end > len(sequence):
            window += sequence[after_start : len(sequence)]
            window += ['N'] * abs(n_window - len(window))
        else:
            window += sequence[after_start : after_end]

        it = 0
        for k in range(annotate_start, annotate_end):
            it += 1 if any(start <= k < end for transcript_exons in exon.values() for start, end in transcript_exons) else 0

        label = 1 if it / n_annotate > 0.5 else 0

        iteration_dict = {
            'seq_overlap': ''.join(window),
            'seq_med': sequence[annotate_start : annotate_end],
            'label': label
        }

        dict_.append(iteration_dict)



In [73]:
df = pd.DataFrame(dict_)
df['label'].value_counts()

label
0    30235
1    28815
Name: count, dtype: int64

In [79]:
def fix_window(row):
    overlap_len = len(row['seq_overlap'])
    med_len = len(row['seq_med'])

    if overlap_len < 900:
        row['seq_overlap'] += 'N' * (900 - overlap_len)

    if med_len < 300:
        row['seq_med'] += 'N' * (300 - med_len)

    return row

df = df.apply(fix_window, axis = 1)

In [82]:
df_label_0 = df[df['label'] == 0]
df_label_1 = df[df['label'] == 1]

df_label_0.to_csv('df_label_0.csv', index = False)
df_label_1.to_csv('df_label_1.csv', index = False)

In [83]:
def write_fasta(df, label, filename_prefix):
    with open(f'{filename_prefix}_{label}.fasta', 'w') as f:
        for idx, row in df.iterrows():
            sequence = row['seq_overlap']
            f.write(f'>BA000007.3_{idx}\n')
            f.write(f'{sequence}\n')


write_fasta(df_label_0, 0, 'class_0')
write_fasta(df_label_1, 1, 'class_1')

In [None]:
!python3 ../../Soft/MathFeature/preprocessing/preprocessing.py -i class_0_0.fasta -o class_0_0_preprocess.fasta
!python3 ../../Soft/MathFeature/preprocessing/preprocessing.py -i class_1_1.fasta -o class_1_1_preprocess.fasta

In [None]:
!python3 ../../Soft/MathFeature/methods/ExtractionTechniques.py -i class_0_0_preprocess.fasta -o class_0_feature_1.csv -l DNA -t DNC -seq 1
!python3 ../../Soft/MathFeature/methods/ExtractionTechniques.py -i class_1_1_preprocess.fasta -o class_1_feature_1.csv -l DNA -t DNC -seq 1

In [None]:
!python3 ../../Soft/MathFeature/methods/ExtractionTechniques.py -i class_0_0_preprocess.fasta -o class_0_feature_2.csv -l DNA -t TNC -seq 1
!python3 ../../Soft/MathFeature/methods/ExtractionTechniques.py -i class_1_1_preprocess.fasta -o class_1_feature_2.csv -l DNA -t TNC -seq 1

In [None]:
!python3 ../../Soft/MathFeature/methods/FourierClass.py -i class_0_0_preprocess.fasta -o class_0_feature_3.csv -l mRNA -r 3
!python3 ../../Soft/MathFeature/methods/FourierClass.py -i class_1_1_preprocess.fasta -o class_1_feature_3.csv -l mRNA -r 3

In [None]:
!python3 ../../Soft/MathFeature/methods/EntropyClass.py -i class_0_0_preprocess.fasta -o class_0_feature_4.csv -l mRNA -k 10 -e Shannon
!python3 ../../Soft/MathFeature/methods/EntropyClass.py -i class_1_1_preprocess.fasta -o class_1_feature_4.csv -l mRNA -k 10 -e Shannon

In [None]:
!python3 ../../Soft/MathFeature/methods/AccumulatedNucleotideFrequency.py -n 1 -o class_0_feature_5.csv -r 2

In [None]:
!python3 ../../Soft/MathFeature/methods/AccumulatedNucleotideFrequency.py -n 1 -o class_1_feature_5.csv -r 2

In [None]:
!python3 ../../Soft/MathFeature/methods/CodingClass.py -i class_0_0_preprocess.fasta -o class_0_feature_6.csv -l lncRNA
!python3 ../../Soft/MathFeature/methods/CodingClass.py -i class_1_1_preprocess.fasta -o class_1_feature_6.csv -l lncRNA

In [None]:
!python3 ../../Soft/MathFeature/methods/FickettScore.py -i class_0_0_preprocess.fasta -o class_0_feature_7.csv -l lncRNA -seq 1
!python3 ../../Soft/MathFeature/methods/FickettScore.py -i class_1_1_preprocess.fasta -o class_1_feature_7.csv -l lncRNA -seq 1

In [6]:
for i in range(1, 8): 
    for j in ['class_0_feature_', 'class_1_feature_']:
        df_temp = pd.read_csv(open(j + str(i) + '.csv', 'rb'))
        df_temp = df_temp.drop_duplicates(keep = 'first')
        df_temp.to_csv(j + str(i) + '.csv', index = False)