# RepeatMasker search
**Objective**: Perform TE search using homology with reference sequences at the DNA level.  
**Inputs**:  

1. Genome assembly fasta files from `Genomes/`. The files are not included but indicated in the file `Genomes/genome_assembly_files_v3.csv`, as `++`.
2. A Nematoda TE library `Library/Nematoda.lib`

**Outputs**: RepeatMasker outputs in the `RepeatMasker` directory. `.out` files will be used downstream.  
  
**Strategy**:   

1. Code contig names because they are too long for RepeatMasker (`TE.code_sequence_ids()`) and put the codes in `*.coding_log` files.
2. Run RepeatMasker with `TE.run_repeat_masker`. The search is permissive but there will be subsequent result filtering (in another notebook):  `./RepeatMasker -cutoff 255 -norna -lib Library/Nematoda.lib -lcambig -no_id -frag 60000 -parallel 6 -excln -alignments <genome assembly>`.  
    + `cutoff 255` : permissive score cutoff - the hits are filtered later
    + `norna` : small RNA is ignored
    + `frag 60000` : allow large TEs
3. By default, the output files are written to the same directory as the genome assembly so the output is moved to the `RepeatMasker` directory.

In [3]:
import TE, os
from shutil import move

# Directory holding the genome assembly fasta files (machine specific).
genomes_directory = '/media/amir/DATA/work/Dropbox/Genomes/'

# the reference library file (the same for every genome)
lib = 'Library/Nematoda.lib'

# Genome code -> assembly file name, as listed in the assembly CSV.
# Looked up once instead of three times per genome; this assumes
# TE.genomes_dict is a plain lookup with no side effects - TODO confirm.
assembly_files = TE.genomes_dict('Genomes/', mode='+',
                                 code_file='genome_assembly_files_v3.csv')

# For every genome: shorten the contig names, run RepeatMasker on the
# coded assembly, then collect the results under RepeatMasker/<code>/.
for code in TE.genome_codes_list('Genomes/', mode='++',
                                 code_file='genome_assembly_files_v3.csv'):

    # repeat for some newer versions
    # for code in ['Hduj', 'Gros', 'Mchi', 'Rsim', 'Pcof', 'Mjav']:

    print(code)

    # the genome assembly file: bare name and full path
    just_assembly_file = assembly_files[code]
    full_path_to_adssembly = genomes_directory + just_assembly_file

    # make short contig names; the code <-> original name mapping goes to
    # the .coding_log file and the renamed sequences to the _coded.fasta file
    TE.code_sequence_ids(full_path_to_adssembly,
                         full_path_to_adssembly + '_coded.coding_log',
                         full_path_to_adssembly + '_coded.fasta',
                         code)

    # genome assembly file with coded contig names
    query = full_path_to_adssembly + '_coded.fasta'

    # run RepeatMasker
    TE.run_repeat_masker(query, species=False, lib=lib, parallel=6)

    # make output directory (makedirs also creates a missing 'RepeatMasker/'
    # parent, where a plain mkdir would raise)
    output_directory = 'RepeatMasker/' + code
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # move the output files from the genome assembly file directory to the
    # RepeatMasker output directory: the .out and .tbl RepeatMasker results
    # and the contig name coding log
    for suffix in ('_coded.fasta.out', '_coded.fasta.tbl', '_coded.coding_log'):
        old_outfile_name = full_path_to_adssembly + suffix
        new_outfile_name = output_directory + '/' + just_assembly_file + suffix
        move(old_outfile_name, new_outfile_name)

Hduj
/home/amir/homeWork/RM_package/RepeatMasker/RepeatMasker -cutoff 255 -norna -lib Library/Nematoda.lib -lcambig -no_id -frag 60000 -parallel 6 -excln -alignments /media/amir/DATA/work/Dropbox/Genomes/Hypsibius_dujardini_nHd.2.3.abv500.fna_coded.fasta
Gros
/home/amir/homeWork/RM_package/RepeatMasker/RepeatMasker -cutoff 255 -norna -lib Library/Nematoda.lib -lcambig -no_id -frag 60000 -parallel 6 -excln -alignments /media/amir/DATA/work/Dropbox/Genomes/nGr.v1.1.fa_coded.fasta
Mchi
/home/amir/homeWork/RM_package/RepeatMasker/RepeatMasker -cutoff 255 -norna -lib Library/Nematoda.lib -lcambig -no_id -frag 60000 -parallel 6 -excln -alignments /media/amir/DATA/work/Dropbox/Genomes/1_uw_chwd.change.a.fasta_coded.fasta
Rsim
/home/amir/homeWork/RM_package/RepeatMasker/RepeatMasker -cutoff 255 -norna -lib Library/Nematoda.lib -lcambig -no_id -frag 60000 -parallel 6 -excln -alignments /media/amir/DATA/work/Dropbox/Genomes/R_Similis_s_7_1_sequence_paired_assembly.fa_coded.fasta
Pcof
/home/amir/