# Setup

In [2]:
# Load required modules
import glob
import pandas as pd
from IPython.display import display, Image
from Bio import SeqIO

## Parse Snakemake inputs and outputs

In [3]:
# Revised haplotype FASTAs (positional Snakemake inputs 0-3).
# Variable names suggest Occ/Pall subgenome x haplotype 1/2 -- the order must match the rule's input list.
occ1_fasta = snakemake.input[0]
occ2_fasta = snakemake.input[1]
pall1_fasta = snakemake.input[2]
pall2_fasta = snakemake.input[3]

# TrR v5 to v6 chromosome mapping (read as a CSV further down)
chromosome_mapping_path = snakemake.input[4]

# CSV listing the chromosomes to reverse complement
chrs_toRevComp = snakemake.input[5]

# CSV with scaffold lengths by subgenome (SG) and haplotype (Hap)
scaffs_bySG_Hap = snakemake.input[6]

# Output paths: haploid reference FASTA, then one FASTA per haplotype
haploid_reference = snakemake.output[0]
hap1_out = snakemake.output[1]
hap2_out = snakemake.output[2]

## Functions

In [129]:
def get_chromosomal_scaffolds(hap_fasta, chromosomes_df, sg, hap):
    """Filter a haplotype FASTA down to the scaffolds to keep, renamed as chromosomes.

    Parameters
    ----------
    hap_fasta
        FASTA file (path or handle) passed to ``SeqIO.parse``.
    chromosomes_df : pandas.DataFrame
        Table with the columns 'subgenome', 'haplotype',
        'original_scaffold_name' and 'TrR_v6_chromosome_name'.
    sg
        Value matched against the 'subgenome' column.
    hap
        Value matched against the 'haplotype' column.

    Returns
    -------
    dict
        Maps the new chromosome name to the record's sequence. Scaffolds
        that are not listed for this subgenome/haplotype are skipped.
    """
    # Filter the dataframe that was passed in. The previous version read the
    # notebook-level `TrR_v6_chromosomes` and silently ignored this argument.
    scaffolds_to_keep_df = chromosomes_df[(chromosomes_df['subgenome'] == sg) &
                                          (chromosomes_df['haplotype'] == hap)]
    scaffolds_to_keep_dict = (
        scaffolds_to_keep_df
        .set_index('original_scaffold_name')['TrR_v6_chromosome_name']
        .to_dict()
    )

    chromosome_records = {}
    for record in SeqIO.parse(hap_fasta, 'fasta'):
        if record.id in scaffolds_to_keep_dict:
            new_chromosome_name = scaffolds_to_keep_dict[record.id]
            chromosome_records[new_chromosome_name] = record.seq
    return chromosome_records

def reverse_compliment(chromosomal_records, revComp_list):
    """Return a new dictionary with selected sequences reverse complemented.

    Parameters
    ----------
    chromosomal_records : dict
        Maps chromosome name to a sequence object that provides
        ``reverse_complement()``.
    revComp_list
        Container of chromosome names whose sequence must be flipped
        (used to match the orientation of the TrR_v5 assembly).

    Returns
    -------
    dict
        Same keys in the same order; values are either the reverse
        complement or the untouched original sequence.
    """
    return {
        name: (sequence.reverse_complement() if name in revComp_list else sequence)
        for name, sequence in chromosomal_records.items()
    }

def get_organellar_scaffolds(hap_fasta, organelle_name, new_name):
    """Retrieve a plastid or mitochondrial sequence from a haplotype FASTA.

    Parameters
    ----------
    hap_fasta
        FASTA file (path or handle) passed to ``SeqIO.parse``.
    organelle_name : str
        Exact record ID of the organellar scaffold.
    new_name : str
        Key under which the sequence is returned.

    Returns
    -------
    dict
        ``{new_name: sequence}`` for the matching record, or an empty
        dictionary when no record ID equals ``organelle_name``.
    """
    return {
        new_name: entry.seq
        for entry in SeqIO.parse(hap_fasta, 'fasta')
        if entry.id == organelle_name
    }

def get_rename_diploid_scaffs(hap_fasta, scaffs_bySG_Hap_df, sg, hap):
    """Get renamed chromosomes and unplaced scaffolds for a subgenome and haplotype.

    Parameters
    ----------
    hap_fasta
        FASTA file (path or handle) passed to ``SeqIO.parse``.
    scaffs_bySG_Hap_df : pandas.DataFrame
        Table with the columns 'sg', 'hap', 'scaff' and
        'TrR_v6_chromosome_name'.
    sg : str
        Value matched against the 'sg' column; its capitalized form
        prefixes the names of unplaced scaffolds.
    hap
        Value matched against the 'hap' column. 'One' is numbered 1;
        any other value is numbered 2.

    Returns
    -------
    tuple of (dict, dict)
        ``(chrom_records, unp_records)``, each mapping the new name to the
        sequence. Records are split on whether the new name starts with 'Chr'.
    """
    chroms = scaffs_bySG_Hap_df[(scaffs_bySG_Hap_df['sg'] == sg) &
                                (scaffs_bySG_Hap_df['hap'] == hap)]
    hap_num = 1 if hap == 'One' else 2

    # Build the scaffold -> chromosome lookup once instead of filtering the
    # dataframe for every FASTA record. `setdefault` keeps the first row per
    # scaffold, which is what `.values[0]` selected before.
    scaff_to_chrom = {}
    for scaff_name, chrom_name in zip(chroms['scaff'], chroms['TrR_v6_chromosome_name']):
        scaff_to_chrom.setdefault(scaff_name, chrom_name)

    allrecs = {}
    for record in SeqIO.parse(hap_fasta, 'fasta'):
        # Record IDs carry extra fields after a double underscore; the part
        # before it is what the 'scaff' column is compared against.
        scaff = record.id.split('__')[0]
        if scaff in scaff_to_chrom:
            new_name = f"{scaff_to_chrom[scaff]}_Hap{hap_num}"
        else:
            new_name = f"{sg.capitalize()}_Hap{hap_num}_{record.id}"
        allrecs[new_name] = record.seq

    # NOTE(review): a listed scaffold whose chromosome name does not start with
    # 'Chr' (e.g. missing after a left merge) ends up with the unplaced scaffolds.
    chrom_records = {k: v for k, v in allrecs.items() if k.startswith('Chr')}
    unp_records = {k: v for k, v in allrecs.items() if not k.startswith('Chr')}

    return (chrom_records, unp_records)


def split_contaminated_sequences(contamination_dict, record_dict):
    """Cut contaminated stretches out of scaffolds and return the clean fragments.

    Parameters
    ----------
    contamination_dict : dict
        Maps scaffold name to a list of ``[start, end]`` contaminated
        stretches, ordered along the sequence.
    record_dict : dict
        Maps scaffold name to a sliceable sequence. Every key of
        ``contamination_dict`` must be present (``KeyError`` otherwise).

    Returns
    -------
    dict
        Maps ``<prefix>__contSplit<i>_length<len>`` to the i-th retained
        fragment, where ``<prefix>`` is the scaffold name up to the first
        double underscore and ``i`` counts fragment positions from 1.

    Notes
    -----
    The retained pieces are ``seq[:start]``, ``seq[end:next_start]`` and
    ``seq[last_end:]``.
    NOTE(review): if the spans are 1-based inclusive coordinates, the base at
    ``start`` is kept at the end of the preceding fragment -- confirm the
    coordinate convention of the contamination report.

    A scaffold with an empty span list is returned whole as fragment 1
    (previously an ``IndexError``). Zero-length fragments (span at a sequence
    edge, or adjacent spans) are skipped so that no empty FASTA record is
    produced; the numbering of the remaining fragments is unaffected.
    """
    new_seq_dict = {}  # Dict to store new split sequences
    for cont_scaff, conts in contamination_dict.items():
        new_scaff_prefix = cont_scaff.split('__')[0]  # Contig name prefix
        sequence = record_dict[cont_scaff]

        # Each retained fragment runs from the end of one contaminated stretch
        # (or the sequence start) to the start of the next (or the sequence end).
        keep_starts = [0] + [span[1] for span in conts]
        keep_ends = [span[0] for span in conts] + [len(sequence)]

        for split_num, (start, end) in enumerate(zip(keep_starts, keep_ends), start=1):
            fragment = sequence[start:end]
            if len(fragment) == 0:
                continue
            new_seq_dict[f"{new_scaff_prefix}__contSplit{split_num}_length{len(fragment)}"] = fragment
    return new_seq_dict

# Haploid reference assembly

## Step 1: Get chromosomal scaffold sequences

In [5]:
# Read the comma-separated table of chromosomes to keep and their new names
TrR_v6_chromosomes = pd.read_csv(chromosome_mapping_path, sep=',')
TrR_v6_chromosomes.head()

In [6]:
# Pull the chromosomal scaffolds assigned to each subgenome/haplotype combination
occ1_chromosomal_records = get_chromosomal_scaffolds(occ1_fasta, TrR_v6_chromosomes, 'Occ', 'One')
occ2_chromosomal_records = get_chromosomal_scaffolds(occ2_fasta, TrR_v6_chromosomes, 'Occ', 'Two')
pall1_chromosomal_records = get_chromosomal_scaffolds(pall1_fasta, TrR_v6_chromosomes, 'Pall', 'One')
pall2_chromosomal_records = get_chromosomal_scaffolds(pall2_fasta, TrR_v6_chromosomes, 'Pall', 'Two')

# Combine into a single dictionary (on a duplicate name the later dictionary wins)
all_chromosomal_records = {
    **occ1_chromosomal_records,
    **occ2_chromosomal_records,
    **pall1_chromosomal_records,
    **pall2_chromosomal_records,
}
all_chromosomal_records

## Step 2: Reverse complement and reorder

In [7]:
# Chromosomes that need to be reverse complemented to match the orientation of the Griffiths et al. TrR_v5 genome.
# Determined by mapping scaffolds against the Griffiths genome with minimap2 and looking for a negative
# correlation in the alignment positions.
to_reverse_complement = (
    pd.read_csv(chrs_toRevComp, sep=',')
    ['TrR_v6_chromosome_name']
    .tolist()
)
all_chromosomal_records_revComp = reverse_compliment(all_chromosomal_records, to_reverse_complement)

In [8]:
# Reorder chromosomes to follow the row order of the CSV file, which matches the Griffiths TrR_v5 assembly.
# Names in the CSV that have no extracted record are skipped.
chromosome_order = TrR_v6_chromosomes['TrR_v6_chromosome_name'].tolist()
all_chromosomal_records_revComp_ordered = {}
for chrom in chromosome_order:
    if chrom in all_chromosomal_records_revComp:
        all_chromosomal_records_revComp_ordered[chrom] = all_chromosomal_records_revComp[chrom]
all_chromosomal_records_revComp_ordered

## Step 3: Add organellar sequences and write file

In [9]:
# Haplotypes and scaffold names for organelles taken from best BLAST hits of the entire plastid or COI (for mitochondria).
# Hits were identical between haplotypes for the plastid, but Pall 1 had a longer assembly for the mitochondria.
plastid_record = get_organellar_scaffolds(
    hap_fasta=occ1_fasta,
    organelle_name='Scaffold_27__1_contigs__length_126578',
    new_name='Plastid',
)
mitochondrial_record = get_organellar_scaffolds(
    hap_fasta=pall1_fasta,
    organelle_name='Scaffold_15__1_contigs__length_370591',
    new_name='Mitochondria',
)

# Append the organellar records after the ordered chromosomes
haploid_reference_genome = {
    **all_chromosomal_records_revComp_ordered,
    **plastid_record,
    **mitochondrial_record,
}
haploid_reference_genome

In [10]:
# Write the haploid reference as FASTA: one header line and one unwrapped sequence line per record
with open(haploid_reference, 'w') as fout:
    fout.writelines(
        f'>{chrom}\n{str(seq)}\n'
        for chrom, seq in haploid_reference_genome.items()
    )

# Diploid reference assembly

## Step 1: Get and rename chromosomes and unplaced scaffolds

In [12]:
# Dataframe with LGs and scaffolds for both subgenomes and haplotypes,
# annotated with the TrR_v6 chromosome name of each LG (left join, so unmatched LGs get a missing name).
# NOTE(review): assumes 'LG' is unique in TrR_v6_chromosomes; duplicates would multiply rows -- confirm.
scaffs_bySG_Hap_df = pd.read_csv(scaffs_bySG_Hap).merge(
    TrR_v6_chromosomes[['LG', 'TrR_v6_chromosome_name']],
    how='left',
    on='LG',
)
scaffs_bySG_Hap_df.head()

In [13]:
# Extract chromosomes and unplaced scaffolds (two dictionaries) for every subgenome/haplotype FASTA
occ1_chrom_recs, occ1_unp_recs = get_rename_diploid_scaffs(occ1_fasta, scaffs_bySG_Hap_df, sg='occ', hap='One')
occ2_chrom_recs, occ2_unp_recs = get_rename_diploid_scaffs(occ2_fasta, scaffs_bySG_Hap_df, sg='occ', hap='Two')
pall1_chrom_recs, pall1_unp_recs = get_rename_diploid_scaffs(pall1_fasta, scaffs_bySG_Hap_df, sg='pall', hap='One')
pall2_chrom_recs, pall2_unp_recs = get_rename_diploid_scaffs(pall2_fasta, scaffs_bySG_Hap_df, sg='pall', hap='Two')

## Step 2: Combine and reorder chromosomal scaffolds

### Haplotype 1

In [16]:
# Combine both subgenomes, then order by the chromosome order with the haplotype suffix appended
hap1_chroms = {**occ1_chrom_recs, **pall1_chrom_recs}
chromosome_order_hap1 = [f"{name}_Hap1" for name in chromosome_order]
hap1_chroms_ordered = {
    name: hap1_chroms[name]
    for name in chromosome_order_hap1
    if name in hap1_chroms
}
hap1_chroms_ordered

### Haplotype 2

In [17]:
# Combine both subgenomes, then order by the chromosome order with the haplotype suffix appended
hap2_chroms = {**occ2_chrom_recs, **pall2_chrom_recs}
chromosome_order_hap2 = [f"{name}_Hap2" for name in chromosome_order]
hap2_chroms_ordered = {
    name: hap2_chroms[name]
    for name in chromosome_order_hap2
    if name in hap2_chroms
}
hap2_chroms_ordered

In [18]:
# Haplotype 2 chromosomes to reverse complement (hard-coded list)
hap2_revComp = [
    'Chr08_Occ_Hap2',
    'Chr04_Occ_Hap2',
    'Chr06_Pall_Hap2',
    'Chr04_Pall_Hap2',
    'Chr03_Occ_Hap2',
    'Chr06_Occ_Hap2',
    'Chr07_Occ_Hap2',
    'Chr08_Pall_Hap2',
]
hap2_chroms_ordered_revComp = reverse_compliment(hap2_chroms_ordered, hap2_revComp)
hap2_chroms_ordered_revComp

## Step 3: Combine, remove contamination, and reorder unplaced scaffolds

### Haplotype 1

In [130]:
hap1_unp = occ1_unp_recs | pall1_unp_recs  # Combine

# Contaminated [start, end] stretches per scaffold, based on NCBI's internal analysis
hap1_cont_dict = {
    'Occ_Hap1_Scaffold_107__1_contigs__length_46767' : [[20586, 20847]],
    'Occ_Hap1_Scaffold_40__1_contigs__length_84372' : [[81007, 81768]],
    'Occ_Hap1_Scaffold_86__1_contigs__length_53138' : [[18033, 18066], [25511, 25664], [29634, 29670], [38491, 38530], [41489, 42604], [46126, 46162]],
    'Pall_Hap1_Scaffold_236__1_contigs__length_26579' : [[3484, 13515], [23428, 25038]],
    'Pall_Hap1_Scaffold_36__1_contigs__length_129292' : [[634, 676], [12890, 13489], [19477, 19544], [35165, 35196], [39989, 40023], [41868, 41981], [44213, 44247]],
    'Pall_Hap1_Scaffold_37__1_contigs__length_126073' : [[33411, 33448], [45162, 45196], [57377, 57410], [59451, 61291], [62297, 62337]]
}
# Raises KeyError if a listed scaffold is missing from hap1_unp
hap1_unp_contSplit = split_contaminated_sequences(hap1_cont_dict, hap1_unp)

# Remove the original contaminated scaffolds (rebinding to a filtered copy rather than popping in place)
hap1_unp = {name: seq for name, seq in hap1_unp.items() if name not in hap1_cont_dict}

# Add split scaffolds
hap1_unp_contRemoved = hap1_unp | hap1_unp_contSplit

# Order unplaced scaffolds from longest to shortest. Lengths are now the true sequence
# lengths (previously len - 1); the resulting order is unchanged because the sort is stable.
hap1_unp_lengths = {name: len(seq) for name, seq in hap1_unp_contRemoved.items()}
hap1_unp_lengths_ordered = dict(sorted(hap1_unp_lengths.items(), key=lambda item: item[1], reverse=True))
hap1_unp_ordered = {name: hap1_unp_contRemoved[name] for name in hap1_unp_lengths_ordered}

### Haplotype 2

In [134]:
hap2_unp = occ2_unp_recs | pall2_unp_recs  # Combine

# Contaminated [start, end] stretches per scaffold, based on NCBI's internal analysis
hap2_cont_dict = {
    'Occ_Hap2_Scaffold_113__1_contigs__length_46767' : [[20586, 20847]],
    'Occ_Hap2_Scaffold_15__1_contigs__length_264194' : [[130489, 132595], [134710, 134811]],
    'Occ_Hap2_Scaffold_16__1_contigs__length_261650' : [[211452, 213333], [213446, 218591], [218700, 218731], [221415, 222701]],
    'Occ_Hap2_Scaffold_30__1_contigs__length_122125' : [[88609, 88938]],
    'Occ_Hap2_Scaffold_32__1_contigs__length_119019' : [[64053, 64510], [80434, 80468]],
    'Occ_Hap2_Scaffold_49__1_contigs__length_84372' : [[81007, 81768]],
    'Occ_Hap2_Scaffold_60__1_contigs__length_73558' : [[36749, 36910], [37691, 37722], [44065, 44361], [53875, 53969]],
    'Occ_Hap2_Scaffold_95__1_contigs__length_53138' : [[18033, 18066], [25511, 25664], [29634, 29670], [38491, 38530], [41489, 42604], [46126, 46162]],
    'Pall_Hap2_Scaffold_35__1_contigs__length_126073' : [[33411, 33448], [45162, 45196], [57377, 57410], [59451, 61291], [62297, 62337]],
    'Pall_Hap2_Scaffold_53__1_contigs__length_91932' : [[55949, 56192], [65736, 65817]]
}
# Raises KeyError if a listed scaffold is missing from hap2_unp
hap2_unp_contSplit = split_contaminated_sequences(hap2_cont_dict, hap2_unp)

# Remove the original contaminated scaffolds (rebinding to a filtered copy rather than popping in place)
hap2_unp = {name: seq for name, seq in hap2_unp.items() if name not in hap2_cont_dict}

# Add split scaffolds
hap2_unp_contRemoved = hap2_unp | hap2_unp_contSplit

# Order unplaced scaffolds from longest to shortest. Lengths are now the true sequence
# lengths (previously len - 1); the resulting order is unchanged because the sort is stable.
hap2_unp_lengths = {name: len(seq) for name, seq in hap2_unp_contRemoved.items()}
hap2_unp_lengths_ordered = dict(sorted(hap2_unp_lengths.items(), key=lambda item: item[1], reverse=True))
hap2_unp_ordered = {name: hap2_unp_contRemoved[name] for name in hap2_unp_lengths_ordered}

## Step 4: Combine and write haplotype fastas

### Haplotype 1

In [29]:
# Ordered chromosomes first, then unplaced scaffolds; write one header and one unwrapped sequence line per record
hap1 = {**hap1_chroms_ordered, **hap1_unp_ordered}
with open(hap1_out, 'w') as fout:
    fout.writelines(f'>{chrom}\n{str(seq)}\n' for chrom, seq in hap1.items())

### Haplotype 2

In [36]:
# Ordered (reverse-complemented where listed) chromosomes first, then unplaced scaffolds;
# write one header and one unwrapped sequence line per record
hap2 = {**hap2_chroms_ordered_revComp, **hap2_unp_ordered}
with open(hap2_out, 'w') as fout:
    fout.writelines(f'>{chrom}\n{str(seq)}\n' for chrom, seq in hap2.items())