# Useful fasta files of all the detected TEs after filtering
1. Fasta file for each nematode species in `TE_sequences_by_nematode_species`
2. Fasta file for each TE superfamily in `TE_sequences_by_TE_superfamily`

In [43]:
import TE, pickle, os
from Bio import SeqIO

# Codes of the genomes that made it into the final data set; one fasta file
# is written per code further down.
codes_in_final_data = [
    'Rcul', 'Tspi', 'Ttri', 'Tmur', 'Ebre', 'Avit', 'Asuu',
    'Bmal', 'Dimm', 'Lsig', 'Lloa', 'Ovol', 'Smur', 'Wban',
    'Bxyl', 'Gpal', 'Gros', 'Mchi', 'Mhap', 'Mflo', 'Minc',
    'Pred', 'Ptri', 'Pcof', 'Rsim', 'Spap', 'Srat', 'Sven',
    'Hduj', 'Crem', 'Cele', 'C11sp', 'briC', 'C5sp', 'Cang',
    'Cjap', 'Cbre', 'Dviv', 'Hcon', 'Hbac', 'Otip', 'Ppac',
]

# Lookup from genome code to assembly file name (it is indexed by code and
# the result is appended to `assemblies_dir` below).
# NOTE(review): the directory passed here ('./Genomes/') differs from
# `assemblies_dir`; confirm that this is intended.
assembly_fname = TE.genomes_dict(
    './Genomes/',
    code_file='genome_assembly_files_v3.csv',
)

# Directory prefix that is prepended to every assembly file name.
assemblies_dir = '../../Genomes/'

Here I make a dictionary of decoded contig names. The contig names in the TEs dictionary are encoded because I had to shorten them for RepeatMasker.

In [None]:
# contig_ids[assembly file name][coded contig id] -> real contig id
contig_ids = {}

for code in assembly_fname:
    assembly_name = assembly_fname[code]
    decoded = contig_ids[assembly_name] = {}

    assembly_path = assemblies_dir + assembly_name
    log_file = assembly_path + '.log'
    coded_assembly = assembly_path + '.coded'

    # The log file is read back right after this call, so the call is
    # presumed to (re)write it -- TODO confirm against TE.code_sequence_ids.
    TE.code_sequence_ids(assembly_path,
                         log_file,
                         coded_assembly,
                         code)

    # Each useful log line is "<coded id><TAB><real id>". The separators must
    # be the real tab / newline characters: the previous '\\t' and '\\n'
    # literals were a backslash followed by a letter, so no line ever matched
    # and every per-assembly dictionary stayed empty.
    with open(log_file, 'r') as log_hndl:
        for line in log_hndl:
            if '\t' not in line:
                continue
            coded, real = line.rstrip().split('\t')
            decoded[coded] = real

Read the TEs dictionary

In [31]:
# Load the TEs dictionary from its pickle. The context manager closes the
# file even when unpickling fails.
# NOTE(review): unpickling can execute arbitrary code; only load a TEs.pkl
# that was produced by this project's own pipeline.
with open('TE_counts/TEs.pkl', 'rb') as hndl:
    TEs = pickle.load(hndl)

Make fasta directories

In [59]:
# Create the two output directories unless they are already present.
for fasta_dir in ('TE_sequences_by_nematode_species',
                  'TE_sequences_by_TE_superfamily'):
    if os.path.exists(fasta_dir):
        continue
    os.mkdir(fasta_dir)

This is what an entry looks like for each TE in the TEs dictionary. We only bother with `taken` TEs (the ones that passed the various filters).

In [32]:
# Display one record from the 'taken' group to show the fields of an entry.
TEs['Ovol']['taken']['element1']

{'contig': 'Ovol_10',
 'end': 107,
 'higher_tx_level': 'DNA/Helitron',
 'length': 94,
 'lower_tx_level': 'OOCH_RND-3_FAMILY-49',
 'ref': {'program': 'RMOCFA',
  'record': '###667\t12.8\t0.0\t0.0\tOvol_10\t14\t107\t94\tC\tOOCH_RND-3_FAMILY-49\tDNA/RC\t(1191)\t1123\t1030\t\t1\t0.041\n'},
 'start': 14}

Write fasta files:

In [60]:
# Write every 'taken' TE twice: once to the fasta file of its genome code and
# once to the fasta file of its TE superfamily.
#
# Superfamily files are shared between genomes, so their handles are opened
# lazily and kept for the whole run instead of being reopened per element.
# NOTE(review): they are opened in append mode (as before), so running this
# cell a second time duplicates the superfamily entries; delete the files in
# TE_sequences_by_TE_superfamily before a re-run.
superfamily_hndls = {}

try:
    for code in codes_in_final_data:

        assembly_name = assembly_fname[code]
        assembly_fpath = assemblies_dir + assembly_name
        # index the contigs of the assembly by their real sequence id
        contigs = SeqIO.to_dict(SeqIO.parse(assembly_fpath, 'fasta'))
        code_TEs = TEs[code]['taken']

        # open a fasta file for the species
        species_fpath = 'TE_sequences_by_nematode_species/{code}.fasta'.format(code=code)
        with open(species_fpath, 'wt') as hndl:

            # make a fasta formatted string for each entry in the TEs dictionary
            for el in code_TEs:
                te = code_TEs[el]
                # translate the shortened contig id back to the real one
                contig = contig_ids[assembly_name][te['contig']]
                start = int(te['start'])
                end = int(te['end'])
                taxonomy = te['higher_tx_level']
                if 'element' not in te['lower_tx_level']:
                    taxonomy += '/' + te['lower_tx_level']
                program = te['ref']['program']
                record = te['ref']['record']

                # The stored coordinates are 1-based and inclusive (entries
                # satisfy length == end - start + 1), whereas Python slices
                # are 0-based and end-exclusive, hence start - 1.
                seq = str(contigs[contig].seq[max(start - 1, 0):end])

                title = "{el} {taxonomy}|{assembly_name}|{contig}|{start}|{end}|{program}|{record}"
                # rstrip drops the trailing newline carried by the raw record
                fasta = title.format(el=el,
                                     taxonomy=taxonomy,
                                     assembly_name=assembly_name,
                                     contig=contig,
                                     start=start,
                                     end=end,
                                     program=program,
                                     record=record).rstrip()
                fasta += '\n{seq}\n'.format(seq=seq)

                # Write the fasta entry to the nematode species file
                hndl.write('>' + fasta)

                # also append the fasta entry to the TE superfamily file
                te_fname = te['higher_tx_level'].replace('/', '_')
                if te_fname not in superfamily_hndls:
                    superfamily_hndls[te_fname] = open(
                        'TE_sequences_by_TE_superfamily/{0}.fasta'.format(te_fname), 'a')
                superfamily_hndls[te_fname].write('>' + code + '|' + fasta)
finally:
    # close the superfamily files even if a genome failed half way through
    for tehndl in superfamily_hndls.values():
        tehndl.close()
        
        
        