# Setup

In [2]:
# Load required modules
import glob
import pandas as pd
from IPython.display import display, Image
from Bio import SeqIO

## Parse Snakemake inputs and outputs

In [3]:
# Inputs 0-3: revised haplotype FASTAs (Occ / Pall subgenomes, haplotypes 1 and 2)
occ1_fasta, occ2_fasta = snakemake.input[0], snakemake.input[1]
pall1_fasta, pall2_fasta = snakemake.input[2], snakemake.input[3]

# Input 4: TrR v5 to v6 chromosome mapping
chromosome_mapping_path = snakemake.input[4]

# Input 5: table of chromosomes to reverse complement
chrs_toRevComp = snakemake.input[5]

# Input 6: PAF file for Occ1
occ1_paf = snakemake.input[6]

# Output 0: path the haploid reference FASTA is written to
haploid_reference = snakemake.output[0]

## Functions

In [4]:
def get_chromosomal_scaffolds(hap_fasta, chromosomes_df, sg, hap):
    """Collect the chromosomal scaffolds of one haplotype under their new names.

    Parameters
    ----------
    hap_fasta
        FASTA file handed to ``SeqIO.parse``.
    chromosomes_df : pandas.DataFrame
        Mapping table with the columns ``subgenome``, ``haplotype``,
        ``original_scaffold_name`` and ``TrR_v6_chromosome_name``.
    sg
        Value the ``subgenome`` column must equal for a row to be kept.
    hap
        Value the ``haplotype`` column must equal for a row to be kept.

    Returns
    -------
    dict
        ``{TrR_v6_chromosome_name: record.seq}`` for every FASTA record whose
        ID appears in ``original_scaffold_name`` of the selected rows. Records
        not listed in the table are skipped.
    """
    # Filter the table supplied by the caller; the function must not depend on
    # a notebook-level global of a particular name being defined.
    selected_rows = chromosomes_df[(chromosomes_df['subgenome'] == sg) &
                                   (chromosomes_df['haplotype'] == hap)]
    new_name_by_scaffold = (
        selected_rows
        .set_index('original_scaffold_name')['TrR_v6_chromosome_name']
        .to_dict()
    )

    chromosome_records = {}
    for record in SeqIO.parse(hap_fasta, 'fasta'):
        if record.id in new_name_by_scaffold:
            chromosome_records[new_name_by_scaffold[record.id]] = record.seq
    return chromosome_records

def reverse_compliment(chromosomal_records, chromosomes_to_flip=None):
    """Reverse complement selected sequences and return a new dictionary.

    Parameters
    ----------
    chromosomal_records : dict
        Maps chromosome name to a sequence object that provides a
        ``reverse_complement()`` method.
    chromosomes_to_flip : iterable, optional
        Names of the chromosomes to reverse complement. When omitted, the
        notebook-level ``to_reverse_complement`` list is used, which keeps
        existing single-argument calls working.

    Returns
    -------
    dict
        Same keys in the same order as ``chromosomal_records``; sequences of
        the listed chromosomes are reverse complemented, all others are passed
        through unchanged.
    """
    if chromosomes_to_flip is None:
        # Backward-compatible fallback to the notebook global.
        chromosomes_to_flip = to_reverse_complement

    # Build the lookup once so membership tests do not rescan a list per chromosome.
    flip_names = set(chromosomes_to_flip)

    return {
        chrom: (seq.reverse_complement() if chrom in flip_names else seq)
        for chrom, seq in chromosomal_records.items()
    }

def get_organellar_scaffolds(hap_fasta, organelle_name, new_name):
    """Fetch an organellar (plastid or mitochondrial) scaffold from a FASTA.

    Every record of ``hap_fasta`` is scanned. The result is ``{new_name: seq}``
    for the record whose ID equals ``organelle_name`` (the last one if several
    match), or an empty dict when no record matches.
    """
    return {
        new_name: entry.seq
        for entry in SeqIO.parse(hap_fasta, 'fasta')
        if entry.id == organelle_name
    }

# Haploid reference assembly

## Step 1: Get chromosomal scaffold sequences

In [5]:
# Read the comma-separated table of chromosomes to keep and their new names,
# then preview the first rows
TrR_v6_chromosomes = pd.read_csv(chromosome_mapping_path, sep=',')
TrR_v6_chromosomes.head()

In [6]:
# Pull the wanted scaffolds out of each haplotype FASTA
occ1_chromosomal_records = get_chromosomal_scaffolds(
    occ1_fasta, TrR_v6_chromosomes, sg='Occ', hap='One'
)
occ2_chromosomal_records = get_chromosomal_scaffolds(
    occ2_fasta, TrR_v6_chromosomes, sg='Occ', hap='Two'
)
pall1_chromosomal_records = get_chromosomal_scaffolds(
    pall1_fasta, TrR_v6_chromosomes, sg='Pall', hap='One'
)
pall2_chromosomal_records = get_chromosomal_scaffolds(
    pall2_fasta, TrR_v6_chromosomes, sg='Pall', hap='Two'
)

# Combine the four dictionaries into one (on a duplicate key the later dictionary wins)
all_chromosomal_records = {
    **occ1_chromosomal_records,
    **occ2_chromosomal_records,
    **pall1_chromosomal_records,
    **pall2_chromosomal_records,
}
all_chromosomal_records

## Step 2: Reverse complement and reorder

In [7]:
# Chromosomes whose orientation must be flipped to agree with the Griffiths et al. TrR_v5 genome.
# They were identified by mapping the scaffolds against the Griffiths genome with minimap2
# and looking for a negative correlation in the alignment positions.
to_reverse_complement = (
    pd.read_csv(chrs_toRevComp, sep=',')
    .loc[:, 'TrR_v6_chromosome_name']
    .tolist()
)
all_chromosomal_records_revComp = reverse_compliment(all_chromosomal_records)

In [8]:
# Put the chromosomes in the order given by the CSV file (which follows the Griffiths
# TrR_v5 assembly); names without a matching record are skipped
chromosome_order = TrR_v6_chromosomes['TrR_v6_chromosome_name'].tolist()
all_chromosomal_records_revComp_ordered = dict(
    (name, all_chromosomal_records_revComp[name])
    for name in chromosome_order
    if name in all_chromosomal_records_revComp
)
all_chromosomal_records_revComp_ordered

## Step 3: Add organellar sequences and write file

In [9]:
# Haplotypes and scaffold names for the organelles come from the best BLAST hits of the
# entire plastid or of COI (for the mitochondria).
# The plastid hits were identical between haplotypes, but Pall 1 had the longer
# mitochondrial assembly.
plastid_record = get_organellar_scaffolds(
    occ1_fasta, 'Scaffold_27__1_contigs__length_126578', 'Plastid'
)
mitochondrial_record = get_organellar_scaffolds(
    pall1_fasta, 'Scaffold_15__1_contigs__length_370591', 'Mitochondria'
)

# Append the organellar records after the chromosomes
haploid_reference_genome = {
    **all_chromosomal_records_revComp_ordered,
    **plastid_record,
    **mitochondrial_record,
}
haploid_reference_genome

In [10]:
# Write every sequence as a two-line FASTA entry: header line, then the full sequence
with open(haploid_reference, 'w') as fout:
    for chrom, seq in haploid_reference_genome.items():
        print(f'>{chrom}', seq, sep='\n', file=fout)