~/DP/data/chromosomes/Canis_lupus_familiaris_with_introns_simplified_chr_only_no_duplicates.txt



In [1]:
import os
import pandas as pd
print(os.getcwd())

/home/lieberze/DP


In [2]:
root_folder = ''
chr_folder = os.path.abspath(os.path.join(root_folder, 'data/chromosomes/'))
fna_folder = os.path.abspath(os.path.join(root_folder, 'data/'))

Get filenames for future use

In [31]:
var = !ls ~/DP/data
names = []
for i in var:
    if i.endswith("fna"):
        names.append(i.split(".")[0])
names

['Bos_taurus',
 'Canis_lupus_familiaris',
 'Equus_caballus',
 'Felis_catus',
 'Monodelphis_domestica',
 'Mus_musculus',
 'Ornitohoryhynchus_anatinus',
 'Ovis_aries',
 'Rattus_norvegicus',
 'Sus_scrofa']

## Extract intergenic coordinates

In [5]:
def get_intergenic(df_sorted, file):
    end = 0
    for line in df_sorted.iterrows():
        ch = line[1].chr
        start_new = line[1].start
        end_new = line[1].end
              
        if start_new != end + 1:
            start_intergenic = end + 1
            end_intergenic = start_new - 1
            file.write(f"{ch}\tintergenic\t{start_intergenic}\t{end_intergenic}\n")
            end = end_new
        else:
            end = end_new

In [6]:
def open_process_and_write(name):
    with open(chr_folder + f"/intergenic/{name}_chr_only_intergenic_no_duplicates.txt", "w") as file_out:
        path = chr_folder + f"/{name}_with_introns_simplified_chr_only_no_duplicates.txt"
        df = pd.read_csv(path, sep="\t", header=None)
        df.columns = ["chr", "type", "start", "end"]
        chrs = df.chr.unique()
        for ch in chrs:
            df_chr = df[df["chr"] == ch]
            df_chr_sorted = df_chr.sort_values(by=["start"], axis=0)
            get_intergenic(df_chr_sorted, file_out)

In [7]:
for name in names:
    open_process_and_write(name)

## Extract intergenic sequences from .fna

Get chromosome names

In [22]:
def get_chromosome_names(name):
    df = pd.read_csv(chr_folder + f"/intergenic/{name}_chr_only_intergenic_no_duplicates.txt", sep="\t", header=None)
    df.columns = ["chr", "type", "start", "end"]
    chr_list = df.chr.unique()
    return chr_list

In [23]:
def load_chromosome_sequences(name):
    chr_list = get_chromosome_names(name)
    chromosome_list = {}
    with open(f"data/{name}.fna", "r") as fasta:
        sequence = ""
        chr_name_saved = ""
        write_sequence = False
        for line in fasta:
            line = line.strip()
            if line.startswith(">"):
                chr_name = line.split()[0].strip(">")
                if chr_name_saved == "": 
                    if chr_name in chr_list:
                        write_sequence = True
                        chr_name_saved = chr_name
                        sequence = ""
                # case: saving a new name:sequence pair to dictionary
                else:
                    chromosome_list[chr_name_saved] = sequence
                    if chr_name in chr_list:
                        chr_name_saved = chr_name
                        write_sequence = True
                        sequence = ""
                    else:
                        write_sequence = False
                        chr_name_saved = ""
            else:
                if write_sequence == True:
                    sequence += line
        return chromosome_list

In [33]:
def extract_for_all_names(name):
    chromosome_list = load_chromosome_sequences(name)
    # print(chromosome_list.keys())
    all_intergenic = 0
    problematic = 0
    with open(f"data/chromosomes/intergenic/{name}_chr_only_intergenic_no_duplicates.txt", "r") as extracted_gff,\
        open(f"data/ready_to_use/{name}_intergenic_no_duplicates.txt", "w") as file_out,\
        open(f"data/ready_to_use/{name}_intergenic_error_info_no_duplicates.txt", "w") as file_error:
        for line in extracted_gff:
            line = line.strip()

            line_split = line.split()
            chr_number = line_split[0]
            start = int(line_split[2])
            stop = int(line_split[3])
            
            print(chr_number, start, stop)

            # start - 1 kvuli indexaci od 1
            sequence = chromosome_list[chr_number][start-1:stop]
            all_intergenic += 1
            if "N" in sequence:
                problematic += 1
                file_error.write("POZOR, máme 'N' v sekvenci:\n" + line +" "+ sequence + "\n")
            file_out.write(line + "\t" + sequence + "\n")

        report = f"pocet intergenovych intronu obsahujicich N: {problematic} ({(problematic/all_intergenic) * 100} %)"
        print(report)
        file_error.write(report+"\n")

In [25]:
names

['Bos_taurus',
 'Canis_lupus_familiaris',
 'Equus_caballus',
 'Felis_catus',
 'Monodelphis_domestica',
 'Mus_musculus',
 'Ornitohoryhynchus_anatinus',
 'Ovis_aries',
 'Rattus_norvegicus',
 'Sus_scrofa']

In [32]:
for name in names:
    extract_for_all_names(name)

## Remove sequences with N (and create report)

In [38]:
def remove_N(name):
    deleted_lines = 0
    all_lines = 0
    with open(f"data/ready_to_use/{name}_intergenic_no_duplicates.txt", "r") as file_in,\
        open(f"data/ready_to_use/without_N/{name}_intergenic_no_duplicates_without_N.txt", "w") as file_out:
            for line in file_in:
                line = line.strip()
                line_list = line.split()
                all_lines += 1

                if len(line_list) < 5: # intron of length 1(?) => no sequence, no fifth entry
                    deleted_lines += 1
                else:
                    sequence = line_list[4]
                    if "N" not in sequence:
                        file_out.write(line + "\n")
                    else:
                        deleted_lines += 1
                        
    print(" ".join(name.split("_")))
    print(f"smazano {deleted_lines} sekvenci, celkem {deleted_lines/all_lines * 100} %")

In [39]:
for name in names:
    remove_N(name)

Bos taurus
smazano 104599 sekvenci, celkem 73.25116425645156 %
Canis lupus familiaris
smazano 133733 sekvenci, celkem 73.77436242573356 %
Equus caballus
smazano 96735 sekvenci, celkem 73.26521956465758 %
Felis catus
smazano 87324 sekvenci, celkem 68.87674214997278 %
Monodelphis domestica
smazano 88621 sekvenci, celkem 77.16104203670811 %
Mus musculus
smazano 174467 sekvenci, celkem 76.89564895455027 %
Ornitohoryhynchus anatinus
smazano 47229 sekvenci, celkem 59.36946110042614 %
Ovis aries
smazano 78576 sekvenci, celkem 68.23854310502045 %
Rattus norvegicus
smazano 145499 sekvenci, celkem 75.4968296302446 %
Sus scrofa
smazano 99407 sekvenci, celkem 75.65854066931021 %
