In [1]:
import numpy as np
import pandas as pd
from Bio import SeqIO

In [2]:
def read_fasta_file(fasta_file):
    sequence = ""

    for record in SeqIO.parse(fasta_file, "fasta"):
        sequence += str(record.seq)

    return sequence

def parse_gtf_file(gtf_file):
    exons = {}

    with open(gtf_file, 'r') as f:
        for line in f:
            if not line.startswith('#'):
                cols = line.strip().split('\t')
                feature_type = cols[2]
                if feature_type == 'exon':
                    chromosome = cols[0]
                    start = int(cols[3])
                    end = int(cols[4])
                    gene_id = cols[8].split(';')[0].split('"')[1]
                    if gene_id not in exons:
                        exons[gene_id] = []
                    exons[gene_id].append((start, end))

    return exons

In [3]:
fasta_0_Coli = './data/samples/sample_0/reference_genome_545778205.fasta'
fasta_0_Coli = read_fasta_file(fasta_0_Coli)

gtf_0_Coli = './data/samples/sample_0/545778205_gtf_merged.gtf'
gtf_0_Coli = parse_gtf_file(gtf_0_Coli)

In [4]:
n_pad = 300
n_annotate = 300
n_window = 900

dict_ = []

for sequence, exon in zip([fasta_0_Coli],
                           [gtf_0_Coli]):
    for j in range(0, len(sequence), 100):
        befor_start = j - n_pad
        annotate_start = j
        after_start = j + n_annotate + n_pad

        befor_end = j
        annotate_end = min(j + n_annotate, len(sequence))
        after_end = j + n_window

        window = []

        if befor_start < 0:
            window += ['N'] * abs(befor_start)
            window += sequence[0 : befor_end]
        else:
            window += sequence[befor_start : befor_end]

        window += sequence[annotate_start : annotate_end]

        if after_end > len(sequence):
            window += sequence[after_start : len(sequence)]
            window += ['N'] * abs(n_window - len(window))
        else:
            window += sequence[after_start : after_end]

        it = 0
        n_start = 0
        p_start = True
        n_end = 0
        p_end = True

        for k in range(annotate_start, annotate_end):
            exon_transcript = 1 if any(start - 1 <= k < end - 1 for transcript_exons in exon.values() for start, end in transcript_exons) else 0

            it += exon_transcript 
            if it > 0 and p_start:
                n_start = k - annotate_start
                p_start = False

            if exon_transcript and p_end:
                n_end = k - annotate_start

            if n_start and p_end and not exon_transcript:
                p_end = False

        label = 1 if it / n_annotate > 0.66 else 0
        n_start, n_end = (0, 0) if not label else (n_start, n_end)

        iteration_dict = {
            'seq_overlap': ''.join(window),
            'seq_med': sequence[annotate_start : annotate_end],
            'label': label,
            'n_start' : n_start,
            'n_end' : n_end,
            'bais' : annotate_start
        }

        dict_.append(iteration_dict)



In [5]:
df = pd.DataFrame(dict_)
df['label'].value_counts()

label
0    24496
1    21921
Name: count, dtype: int64

In [6]:
def fix_window(row):
    overlap_len = len(row['seq_overlap'])
    med_len = len(row['seq_med'])

    if overlap_len < 900:
        row['seq_overlap'] += 'N' * (900 - overlap_len)

    if med_len < 300:
        row['seq_med'] += 'N' * (300 - med_len)

    return row

df = df.apply(fix_window, axis = 1)

In [7]:
def dismiss_N(row):
    if 'N' in row['seq_overlap'] or 'N' in row['seq_med']:
        return False
    else:
        return True
    

df = df[df.apply(dismiss_N, axis=1)]
df['label'].value_counts()

label
0    24494
1    21911
Name: count, dtype: int64

In [8]:
df.to_csv('test_df_label_0.csv', index = False)

In [None]:
def write_fasta(df):
    with open(f'test.fasta', 'w') as f:
        for idx, row in df.iterrows():
            sequence = row['seq_med']
            f.write(f'>BA000007.3_{idx}\n')
            f.write(f'{sequence}\n')


write_fasta(df)

In [None]:
!python3 ../../Soft/MathFeature/preprocessing/preprocessing.py -i test.fasta -o test_preproces.fasta
!python3 ../../Soft/MathFeature/methods/ExtractionTechniques.py -i test_preproces.fasta -o test_feature_1.csv -l DNA -t DNC -seq 1
!python3 ../../Soft/MathFeature/methods/ExtractionTechniques.py -i test_preproces.fasta -o test_feature_2.csv -l DNA -t TNC -seq 1
!python3 ../../Soft/MathFeature/methods/FourierClass.py -i test_preproces.fasta -o test_feature_3.csv -l mRNA -r 3
!python3 ../../Soft/MathFeature/methods/EntropyClass.py -i test_preproces.fasta -o test_feature_4.csv -l mRNA -k 10 -e Shannon

!python3 ../../Soft/MathFeature/methods/CodingClass.py -i test_preproces.fasta -o test_feature_6.csv -l lncRNA
!python3 ../../Soft/MathFeature/methods/FickettScore.py -i test_preproces.fasta -o test_feature_7.csv -l lncRNA -seq 1

In [None]:
!python3 ../../Soft/MathFeature/methods/AccumulatedNucleotideFrequency.py -n 1 -o test_feature_5.csv -r 2

In [4]:
df = pd.read_csv(open('test_df_label_0.csv', 'rb'))

df_add = pd.read_csv(open('test_feature_1.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df = pd.concat([df, df_add], axis = 1)

df_add = pd.read_csv(open('test_feature_2.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df = pd.concat([df, df_add], axis = 1)

df_add = pd.read_csv(open('test_feature_3.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1).add_suffix('_Fourier')
df = pd.concat([df, df_add], axis = 1)

df_add = pd.read_csv(open('test_feature_4.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df = pd.concat([df, df_add], axis = 1)

df_add = pd.read_csv(open('test_feature_5.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1).add_suffix("_Accumulated")
df = pd.concat([df, df_add], axis = 1)

df_add = pd.read_csv(open('test_feature_6.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df = pd.concat([df, df_add], axis = 1)

df_add = pd.read_csv(open('test_feature_7.csv', 'rb'))
df_add = df_add.drop(['nameseq', 'label'], axis = 1)
df = pd.concat([df, df_add], axis = 1)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46405 entries, 0 to 46404
Columns: 146 entries, seq_overlap to fickett_score-full-sequence
dtypes: float64(138), int64(6), object(2)
memory usage: 51.7+ MB


In [6]:
df.to_csv("df_test.csv")