In [None]:
import itertools
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
import seaborn as sns
from Bio import AlignIO
from Bio import SeqIO
from Bio import SeqRecord
from Bio.Align import AlignInfo
from Bio.Seq import Seq
from matplotlib import pyplot as plt
from matplotlib.cm import viridis
from matplotlib.patches import FancyArrow, Rectangle

# Methods section 2.3 and 2.4

## derive representative Ty sequences from the CBS432 genome

In [None]:
# Export every Ty element annotated in the CBS432 assembly, grouped by family.
# The per-family FASTA files are meant to be aligned externally with MUSCLE.

# parse genome: contig id -> SeqRecord
with open('/home/mathieu/mhenault_landrylab/Sequences/ref_genomes/CBS432_pacbio/CBS432.genome.fa') as handle:
    genome_CBS432 = {seq.id: seq for seq in SeqIO.parse(handle, 'fasta')}

# parse annotations; comment='#' skips GFF header/directive lines, as the other
# cells reading this same file already do
annot_CBS432 = pd.read_csv('/home/mathieu/mhenault_landrylab/Sequences/ref_genomes/CBS432_pacbio/CBS432.all_feature.gff', sep='\t', comment='#', header=None)
families_CBS432 = ['TY1','TY3','TY5']
fam_seq_CBS432 = {}
for TE, df in annot_CBS432.groupby(2):
    if TE not in families_CBS432:
        continue
    fam_seq_CBS432[TE] = {}
    # idx numbers the elements of one family in annotation order; it is part of
    # the sequence ID
    ty_rows = df[[0,3,4,6]].itertuples(index=False, name=None)
    for idx, (chrom, start, end, strand) in enumerate(ty_rows):
        start, end = int(start), int(end)
        ID = f'{idx}-{chrom}-{TE}-{start}-{end}'
        # coordinates are used as 1-based, end-inclusive
        seq = genome_CBS432[chrom].seq[start-1:end]
        if strand == '-':
            # minus-strand elements are exported in their own orientation
            seq = seq.reverse_complement()
        fam_seq_CBS432[TE][ID] = SeqRecord.SeqRecord(seq=seq, id=ID, description='')
    #with open(f'/Users/mathieu/mhenault_landrylab/md_ty_expression/db/{TE}.CBS432.fasta', 'w') as handle:
    #    SeqIO.write(fam_seq_CBS432[TE].values(), handle, 'fasta')

In [None]:
# Load each MUSCLE alignment (family x region), derive its consensus and append
# the consensus to the alignment as an extra row. The augmented alignments are
# meant to be exported and blasted against the annotated sequences.
alignments_CBS432 = {}
for fam in families_CBS432:
    for t in ('internal', 'LTR'):
        ID = f'cons.{fam}.{t}'
        with open(f'/home/mathieu/mhenault_landrylab/md_ty_expression/db/{fam}.{t}.CBS432.muscle.fas') as handle:
            a = AlignIO.read(handle, 'fasta')
        # columns where no residue reaches the 0.5 threshold are written as N
        summary = AlignInfo.SummaryInfo(a)
        cons = summary.dumb_consensus(threshold=0.5, ambiguous='N')
        a.add_sequence(ID, str(cons))
        alignments_CBS432[ID] = a
        #with open(f'/home/mathieu/mhenault_landrylab/md_ty_expression/db/{fam}.{t}.CBS432.cons.fasta', 'w') as handle:
        #    AlignIO.write(a, handle, 'fasta')

In [None]:
# Read the blastn hits of each consensus against the annotated sequences and
# keep the top-ranked annotated element as the representative sequence.
consensus = []
for fam, t in itertools.product(families_CBS432, ['internal','LTR']):
    ID = f'cons.{fam}.{t}'
    blastn = pd.read_csv(f'/home/mathieu/mhenault_landrylab/md_ty_expression/db/{fam}.{t}.CBS432.cons.blastn.tab', header=None, sep='\t')

    # hits with the consensus as query (column 0), self-hit excluded (column 1),
    # ranked by decreasing column 11 (presumably the bit score -- confirm the
    # blastn output format)
    is_query = blastn[0] == ID
    not_self = blastn[1] != ID
    blastn = blastn.loc[is_query & not_self].sort_values(by=11, ascending=False)
    winner = blastn.iloc[0, 1]

    # reopen the alignment (it has no consensus row) and index it by sequence id
    with open(f'/home/mathieu/mhenault_landrylab/md_ty_expression/db/{fam}.{t}.CBS432.muscle.fas') as handle:
        a = {seq.id: seq for seq in AlignIO.read(handle, 'fasta')}

    # strip the alignment gaps from the winner, report its alphabet, store it
    seq = a[winner]
    seq.seq = seq.seq.ungap()
    print(winner, set(str(seq.seq)))
    consensus.append(seq)
#with open('/home/mathieu/mhenault_landrylab/md_ty_expression/db/CBS432.consensus.fasta', 'w') as handle:
#    SeqIO.write(consensus, handle, 'fasta')

In [None]:
# Build one LTR + internal reference sequence per Ty family, for the Spar and
# Scer genomes.

# representative sequences keyed by record id; for the second file the key is
# the part of the id that precedes '#'
with open('/home/mathieu/mhenault_landrylab/md_ty_expression/db/CBS432.consensus.fasta', 'r') as handle:
    cons_sp = {seq.id:seq for seq in SeqIO.parse(handle, 'fasta')}
with open('/home/mathieu/paradoxus_nanopore/paradoxus4/db/tyfamilies_saccharomyces_pone2012.txt', 'r') as handle:
    cons_sc = {seq.id.split('#')[0]:seq for seq in SeqIO.parse(handle, 'fasta')}
cons = dict(cons_sp)
cons.update(cons_sc)

# alias of the reference -> (id of the LTR sequence, id of the internal sequence)
ltr_internal_ids = {
    'Ty1_Sp': ('4-chrVIII-TY1-4262-10147-LTR2', '0-chrIV-TY1-1300771-1306656'),
    'Ty3_Sp': ('1-chrXI-TY3-81086-86445-LTR2', '1-chrXI-TY3-81086-86445'),
    'Ty5_Sp': ('1-chrIII-TY5-9167-14546-LTR1', '1-chrIII-TY5-9167-14546'),
    'Ty1_Sc': ('TY1-LTR', 'TY1-I'),
    'Ty2_Sc': ('TY2-LTR', 'TY2-I'),
    'Ty3_Sc': ('TY3-LTR', 'TY3-I'),
    'Ty4_Sc': ('TY4-LTR', 'TY4-I'),
    'Ty5_Sc': ('TY5-LTR', 'TY5-I'),
}

cons_ltr_internal = []
for alias, (ltr, internal) in ltr_internal_ids.items():
    # LTR followed by the internal sequence, upper-cased
    ltr_plus_internal = (cons[ltr].seq+cons[internal].seq).upper()
    cons_ltr_internal.append(SeqRecord.SeqRecord(seq=ltr_plus_internal, id=alias, description=''))

with open('/Users/mathieu/mhenault_landrylab/md_ty_expression/db/cons_ltr_internal.fasta', 'w') as handle:
    SeqIO.write(cons_ltr_internal, handle, 'fasta')

In [None]:
# Print, for every reference Ty: alias, LTR length, internal length, and their sum.
aliases = ['Ty1_Sp','Ty3_Sp','Ty5_Sp','Ty1_Sc','Ty2_Sc','Ty3_Sc','Ty4_Sc','Ty5_Sc']
id_pairs = [('4-chrVIII-TY1-4262-10147-LTR2', '0-chrIV-TY1-1300771-1306656'),
            ('1-chrXI-TY3-81086-86445-LTR2', '1-chrXI-TY3-81086-86445'),
            ('1-chrIII-TY5-9167-14546-LTR1', '1-chrIII-TY5-9167-14546'),
            ('TY1-LTR', 'TY1-I'),
            ('TY2-LTR', 'TY2-I'),
            ('TY3-LTR', 'TY3-I'),
            ('TY4-LTR', 'TY4-I'),
            ('TY5-LTR', 'TY5-I')]
for alias, (ltr, internal) in zip(aliases, id_pairs):
    ltr_len = len(cons[ltr].seq)
    internal_len = len(cons[internal].seq)
    print(alias, ltr_len, internal_len, ltr_len+internal_len)

In [None]:
# add nuclear genomes to Ty sequences, while masking the former for annotated Tys
def parse_mask(S, s, name_append):
    """Load a nuclear genome and hard-mask every annotated Ty feature with N.

    Parameters
    ----------
    S : str
        Strain name used in the reference directory (``{S}_pacbio``).
    s : str
        Strain name used as file prefix (``{s}.genome.fa`` and
        ``{s}.all_feature.gff``).
    name_append : str
        Suffix appended to each sequence id as ``{id}_{name_append}``.

    Returns
    -------
    list of SeqRecord
        One record per sequence of the genome FASTA, in file order, with the
        span of every feature whose type (GFF column 2) contains 'TY'
        replaced by N. Sequence lengths are unchanged.
    """
    ref_dir = f'/Users/mathieu/mhenault_landrylab/Sequences/ref_genomes/{S}_pacbio'
    annot = pd.read_csv(f'{ref_dir}/{s}.all_feature.gff', sep='\t', comment='#', header=None)
    # select the Ty features once instead of once per contig; astype(str) keeps
    # a missing feature type from raising
    ty_annot = annot.loc[annot[2].astype(str).str.contains('TY', regex=False)]

    genome_nuc = []
    with open(f'{ref_dir}/{s}.genome.fa') as handle:
        for seq in SeqIO.parse(handle, 'fasta'):
            # mask on a plain bytearray: Seq.tomutable()/MutableSeq.toseq() are
            # deprecated in recent Biopython releases
            masked = bytearray(str(seq.seq), 'ascii')

            # hard-mask any Ty annotation located on this sequence
            on_contig = ty_annot.loc[ty_annot[0] == seq.id, [3, 4]]
            for start, end in on_contig.itertuples(index=False, name=None):
                # coordinates are used as 1-based, end-inclusive
                lo, hi = int(start) - 1, int(end)
                # size the run of N from the slice itself so that an annotation
                # running past the contig end cannot lengthen the sequence
                masked[lo:hi] = b'N' * len(masked[lo:hi])

            genome_nuc.append(SeqRecord.SeqRecord(seq=Seq(masked.decode('ascii')), id=f'{seq.id}_{name_append}', description=''))
    return genome_nuc

# Ty-masked nuclear genomes of both strains
genome_nuc_CBS432 = parse_mask('CBS432','CBS432','Sp')
genome_nuc_S288c = parse_mask('S288C','S288c','Sc')
nuclear_records = genome_nuc_CBS432+genome_nuc_S288c

# Two references are written: the complete one (all Ty references), and one
# with only the Sc Tys (entries of cons_ltr_internal from index 3 onwards).
for out_path, ty_records in [
        ('/Users/mathieu/mhenault_landrylab/md_ty_expression/db/genome_Sp_Sc_Ty.fasta', cons_ltr_internal),
        ('/Users/mathieu/mhenault_landrylab/md_ty_expression/db/genome_Sc_Ty.fasta', cons_ltr_internal[3:])]:
    with open(out_path, 'w') as handle:
        SeqIO.write(nuclear_records+ty_records, handle, 'fasta')

In [None]:
# define parent of genes as their ID to be able to group by Parent
def get_parent(x):
    """Derive a parent key from a feature ID.

    Two ID layouts are recognised, anchored at the start of the string:

    * ``<prefix>_<2 digits>[G|T]<5 digits>`` -> ``<prefix>_<2 digits>.<5 digits>``
    * ``<prefix>_Ty<digit>[G|T]1``           -> ``<prefix>_Ty<digit>.1``

    so the gene (``G``) and transcript (``T``) IDs of one locus map to the
    same key. Anything following the matched part is ignored.

    Parameters
    ----------
    x : str
        Feature ID. Any non-string value (e.g. NaN for a feature without an
        ID attribute) is treated as unmatched.

    Returns
    -------
    str
        The parent key, or ``'none'`` when no pattern matches.
    """
    # raw strings: '\w' and '\d' are invalid escape sequences in normal literals
    patterns = [r'(\w+_\d{2})[GT](\d{5})', r'(\w+_Ty\d)[GT](1)']
    if not isinstance(x, str):
        return 'none'
    for p in patterns:
        m = re.match(p, x)
        if m:
            # the first matching pattern wins
            return f'{m.group(1)}.{m.group(2)}'
    return 'none'

In [None]:
# Coordinates of LTRs and CDS for working with depth: the first CSV column is
# the index and the 'Tsu4' row is discarded.
Coords = (
    pd.read_csv('/home/mathieu/mhenault_landrylab/md_ty_expression/db/ltr_cds_coords.csv', index_col=0)
    .drop(index='Tsu4')
)

# prepare annotation files for analysis by Plastid

In [None]:
# Build the annotation table for the analysis of periodicity.
# A pseudo-5' UTR of 50 bp is added upstream of the first mRNA and exon of each
# parent to allow five prime offsets.
Annot_plastid = []
tig_len = {}
for S, s, alias in [('CBS432','CBS432','Sp'), ('S288C','S288c','Sc')]:
    annot = pd.read_csv(f'/home/mathieu/mhenault_landrylab/Sequences/ref_genomes/{S}_pacbio/{s}.all_feature.gff', sep='\t', comment='#', header=None)
    Annot_plastid.append(annot)
    with open(f'/home/mathieu/mhenault_landrylab/Sequences/ref_genomes/{S}_pacbio/{s}.genome.fa') as handle:
        # contig id -> length, used below to cap extended coordinates at the contig end
        tig_len[s] = {tig.id:len(tig.seq) for tig in SeqIO.parse(handle, 'fasta')}
# add Ty annotations
Annot_plastid.append(pd.read_csv('/home/mathieu/mhenault_landrylab/md_ty_expression/db/annot_ty.gff', sep='\t', comment='#', header=None))
Annot_plastid = pd.concat(Annot_plastid).reset_index(drop=True)
# expand the attribute column (column 8: 'key=value' pairs separated by ';')
# into one column per key
Annot_plastid = pd.concat([Annot_plastid, Annot_plastid[8].apply(lambda x: pd.Series(dict([i.split('=') for i in x.split(';')])))], axis=1)
# parent key shared by the features of one locus ('none' when the ID matches no pattern)
Annot_plastid['parent'] = Annot_plastid['ID'].apply(lambda x: get_parent(x))

# one group per (parent, contig, column 1, strand); the column-1 value `s` is
# also the key into tig_len
# NOTE(review): for a strand other than '+'/'-' df_sort is not reassigned (stale
# from the previous group, or undefined for the very first group) -- confirm no
# such rows exist
for (p, tig, s, strand), df in Annot_plastid.groupby(['parent',0,1,6]):
    
    # sort subtables to get the first entries depending on orientation
    if strand == '-':
        df_sort = df.sort_values(by=4, ascending=False)
    if strand == '+':
        df_sort = df.sort_values(by=3, ascending=True)
        
    # iterate and find the first instance of each ['mRNA', 'exon']
    df_idx = []
    for f in ['mRNA','exon']:
        for i in df_sort.index:
            if df_sort.loc[i, 2] == f:
                df_idx.append(i)
                break
    
    # modify coords accordingly: extend the 5' side by 50, capped to the contig
    # length on the minus strand and to position 1 on the plus strand
    if strand == '-':
        tl = tig_len[s][tig]
        Annot_plastid.loc[df_idx, 4] = np.where(df.loc[df_idx, 4]+50<=tl, df.loc[df_idx, 4]+50, tl)
    elif strand == '+':
        Annot_plastid.loc[df_idx, 3] = np.where(df.loc[df_idx, 3]-50>=1, df.loc[df_idx, 3]-50, 1)
    # suffix the contig names with the species alias; rows whose column 1 is
    # neither 'CBS432' nor 'S288c' keep their contig name
    if s == 'CBS432':
        Annot_plastid.loc[df.index, 0] = df[0].apply(lambda x: f'{x}_Sp')
    if s == 'S288c':
        Annot_plastid.loc[df.index, 0] = df[0].apply(lambda x: f'{x}_Sc')

#Annot_plastid.loc[Annot_plastid['parent']!='none'].iloc[:, :9].to_csv('/Users/mathieu/mhenault_landrylab/md_ty_expression/plastid/annot_ty_plastid.gff', sep='\t', header=None, index=None)

In [None]:
# Annotation table for total mRNA quantification.
# Ty elements are only scored on their internal sequence, so their start
# coordinate is moved past the LTR further down.
gff_tables = [
    pd.read_csv(f'/home/mathieu/mhenault_landrylab/Sequences/ref_genomes/{S}_pacbio/{s}.all_feature.gff', sep='\t', comment='#', header=None)
    for S, s in [('CBS432','CBS432'), ('S288C','S288c')]
]
gff_tables.append(pd.read_csv('/home/mathieu/mhenault_landrylab/md_ty_expression/db/annot_ty.gff', sep='\t', comment='#', header=None))
Annot_mrna = pd.concat(gff_tables).reset_index(drop=True)

# one extra column per key of the attribute column (column 8)
attributes = Annot_mrna[8].apply(lambda x: pd.Series(dict([i.split('=') for i in x.split(';')])))
Annot_mrna = pd.concat([Annot_mrna, attributes], axis=1)

Annot_mrna['parent'] = Annot_mrna['ID'].apply(get_parent)

# suffix the contig names according to the value of column 1
for s, df in Annot_mrna.groupby(1):
    if s == 'CBS432':
        Annot_mrna.loc[df.index, 0] = df[0].map(lambda x: f'{x}_Sp')
    elif s == 'S288c':
        Annot_mrna.loc[df.index, 0] = df[0].map(lambda x: f'{x}_Sc')

parent_alias = {'CBS432':'Sp', 'S288c':'Sc'}
ty_tig_alias = {s: f'{s.split("_")[1][:-2]}_{parent_alias[s.split("_")[0]]}' for s in Coords.index}

# extract Ty elms and restrict them to the internal sequence: from the position
# after 'ltr_end' up to 'cds_end'
for p, df in Annot_mrna.groupby('parent'):
    if 'Ty' not in p:
        continue
    start, end = Coords.loc[p.replace('.','T'), ['ltr_end','cds_end']]
    Annot_mrna.loc[df.index, [3,4]] = (start+1, end)

#Annot_mrna.loc[Annot_mrna['parent']!='none'].iloc[:, :9].to_csv('/Users/mathieu/mhenault_landrylab/md_ty_expression/plastid/annot_ty_plastid_mrna.gff', sep='\t', header=None, index=None)

In [None]:
# Annotation table for total mRNA quantification without the Ty1 GAG sequence:
# a copy of the mRNA table in which Ty1 features start after 'gag_end'.
Annot_Ty1 = Annot_mrna.copy()

# extract Ty1 elms and change start coord
for p, df in Annot_Ty1.groupby('parent'):
    if 'Ty1' not in p:
        continue
    start, end = Coords.loc[p.replace('.','T'), ['gag_end','cds_end']]
    Annot_Ty1.loc[df.index, [3,4]] = (start+1, end)

#Annot_Ty1.loc[Annot_Ty1['parent']!='none'].iloc[:, :9].to_csv('/Users/mathieu/mhenault_landrylab/md_ty_expression/plastid/annot_ty_plastid_Ty1.gff', sep='\t', header=None, index=None)

In [None]:
# Annotation table for the quantification on the first 60 nts of each feature
# set: starts from a copy of the mRNA table, truncates the first
# gene/mRNA/exon/CDS row of each parent and keeps only those rows.
Annot_60nt = Annot_mrna.copy()
idx_keep = []
for (p, tig, s, strand), df in Annot_60nt.groupby(['parent',0,1,6]):
    
    # reference coordinates are taken from the first row of the group
    start, end = df.iloc[0, [3,4]]

    # sort subtables to get the first entries depending on orientation
    # NOTE(review): if these coordinates are end-inclusive (as the 1-based
    # slicing elsewhere in this notebook suggests), start..start+60 spans 61
    # positions -- confirm this is intended
    if strand == '-':
        df_sort = df.sort_values(by=4, ascending=False)
        start = end - 60
    if strand == '+':
        df_sort = df.sort_values(by=3, ascending=True)
        end = start + 60
        
    # iterate and find the first instance of each type
    df_idx = []
    for f in ['gene','mRNA','exon','CDS']:
        for i in df_sort.index:
            if df_sort.loc[i, 2] == f:
                df_idx.append(i)
                break
    # append indexes to keep for the final cleanup
    idx_keep.extend(df_idx)
    
    # truncate the 3' side of the selected rows: new end on the plus strand,
    # new start on the minus strand
    if strand == '+':
        Annot_60nt.loc[df_idx, 4] = end
    elif strand == '-':
        Annot_60nt.loc[df_idx, 3] = start

# final cleanup: drop every row that was not selected above
Annot_60nt = Annot_60nt.loc[idx_keep]

#Annot_60nt.loc[Annot_60nt['parent']!='none'].iloc[:, :9].to_csv('/Users/mathieu/mhenault_landrylab/md_ty_expression/plastid/annot_ty_plastid_60nt.gff', sep='\t', header=None, index=None)

## analyse the nucleotide identity between Sp and Sc Ty1 and Ty3

In [None]:
# export annotation sequences

# Parse each genome and its annotation once: neither depends on the Ty family,
# so there is no need to re-read them for every family.
references = {}
for S, s in zip(['S288C','CBS432'], ['S288c','CBS432']):
    with open(f'/home/mathieu/mhenault_landrylab/Sequences/ref_genomes/{S}_pacbio/{s}.genome.fa') as handle:
        genome = {seq.id: seq for seq in SeqIO.parse(handle, 'fasta')}
    # comment='#' skips GFF header/directive lines, as in the other cells
    # reading these files
    annot = pd.read_csv(f'/home/mathieu/mhenault_landrylab/Sequences/ref_genomes/{S}_pacbio/{s}.all_feature.gff', sep='\t', comment='#', header=None)
    references[s] = (genome, annot)

for TE in ['TY1','TY3']:

    # sequence ID -> SeqRecord for every annotated element of this family,
    # S288c first, then CBS432
    seqs = {}

    for s, (genome, annot) in references.items():
        ty_rows = annot.loc[annot[2]==TE, [0,3,4,6]].itertuples(index=False, name=None)
        for chrom, start, end, strand in ty_rows:
            start, end = int(start), int(end)
            # the strain name leads the ID
            ID = f'{s}-{chrom}-{TE}-{start}-{end}'
            # coordinates are used as 1-based, end-inclusive
            seq = genome[chrom].seq[start-1:end]
            if strand == '-':
                seq = seq.reverse_complement()
            seqs[ID] = SeqRecord.SeqRecord(seq=seq, id=ID, description='')

    #with open(f'/home/mathieu/mhenault_landrylab/md_ty_expression/db/{TE}.fl.fasta', 'w') as handle:
    #    SeqIO.write(seqs.values(), handle, 'fasta')

In [None]:
#define function for identity scoring of one position in a pair 
#of sequences from a MSA
def compare_position(x, gap=np.nan):
    """Score one alignment column of a pair of sequences.

    Parameters
    ----------
    x : sequence of two single characters
        The residues of both sequences at one alignment position.
    gap : float, optional
        Unused; kept so that existing calls keep working. Gapped positions
        are always reported as -1.

    Returns
    -------
    int
        -1 if at least one residue is a gap ('-'), 0 if the two residues
        differ, 1 if they are identical (comparison is case-sensitive).
    """
    # a single membership test covers both "one gap" and "two gaps"
    if '-' in x:
        return -1
    if x[0] != x[1]:
        return 0
    return 1

#define function to summarize identity along a pair of sequences 
# in a MSA
def compare_sequences(sub_a, window, sliding='half'):
    """Windowed fraction of identical positions between two aligned sequences.

    Parameters
    ----------
    sub_a : array-like of shape (2, alignment_length)
        One row per sequence, one single-character element per position.
    window : int
        Window size, in alignment columns.
    sliding : {'half', 'ovl', 'non-ovl'}
        'ovl': windows start at every position (step 1), full windows only.
        'half': windows start every ``window // 2`` positions; the last
        window(s) may be shorter than ``window``.
        'non-ovl': consecutive windows (step ``window``), full windows only.

    Returns
    -------
    numpy.ndarray
        Mean identity of each window, ignoring gapped positions. A window
        made only of gapped positions yields NaN (numpy emits a
        RuntimeWarning for it).

    Raises
    ------
    ValueError
        If ``sliding`` is not one of the supported modes.
    """
    cp = np.apply_along_axis(compare_position, 0, sub_a)
    # gapped positions become NaN so that nanmean leaves them out
    cp = np.where(cp!=-1, cp, np.nan)
    n_pos = cp.shape[0]

    if sliding == 'ovl':
        # +1 so that the last full window is scored too
        starts = range(n_pos-window+1)
    elif sliding == 'half':
        half_window = int(window/2)
        starts = range(0, n_pos-half_window, half_window)
    elif sliding == 'non-ovl':
        # +1 so that the last full window is scored too
        starts = range(0, n_pos-window+1, window)
    else:
        raise ValueError(f"sliding must be 'ovl', 'half' or 'non-ovl', got {sliding!r}")

    return np.array([np.nanmean(cp[i:i+window]) for i in starts])
        

In [None]:
# Import the alignment of the full-length elements of each Ty family and score
# the windowed nucleotide identity of every pair of sequences that come from
# two different strains.

SC = {}
window = 100
for TE in ['TY1','TY3']:

    with open(f'/home/mathieu/mhenault_landrylab/md_ty_expression/db/{TE}.fl.muscle.fasta') as handle:
        a = AlignIO.read(handle, 'fasta')

    sequence_comparisons = {}
    for s_pair in itertools.combinations(a, 2):
        # the part of the ID before the first '-' identifies the strain
        species_pair = [x.id.split('-')[0] for x in s_pair]
        if len(set(species_pair)) > 1:
            # explicit 2 x alignment_length array of single characters; this
            # does not rely on how numpy converts Seq objects
            sub_a = np.array([list(str(x.seq)) for x in s_pair])
            sequence_comparisons['.'.join([x.id for x in s_pair])] = compare_sequences(sub_a, window, sliding='non-ovl')

    # rows: sequence pairs; columns: windows, relabelled with their midpoint
    df = pd.DataFrame(sequence_comparisons).T
    df.columns = (df.columns+0.5)*window
    SC[TE] = df

In [None]:
# Alignment coordinates of the annotations; the first CSV column is the index.
coords_aln_path = '/home/mathieu/mhenault_landrylab/md_ty_expression/db/ltr_cds_aln_coords.csv'
coords_aln = pd.read_csv(coords_aln_path, index_col=0)

In [None]:
# Plot, one panel per Ty family, the windowed nucleotide identity between
# strains, with a cartoon of the element drawn below each panel.
# Rectangle and FancyArrow come from matplotlib.patches (imported at the top
# of the notebook).
fig, axes = plt.subplots(nrows=2, figsize=[6,5], 
                         gridspec_kw={'hspace':1,'left':0.11,'top':0.92,'bottom':0.2,'right':0.97})

for TE, alias, alpha, ax in zip(['TY1', 'TY3'], ['Ty1','Ty3'], [0.05,1], axes):
    dat = pd.melt(SC[TE])

    # one dot per (sequence pair, window), plus the per-window mean as a line
    ax.plot(dat['variable'], dat['value'], lw=0, marker='o', ms=1, alpha=alpha, c='k', label=TE)
    ax.plot(SC[TE].columns, SC[TE].apply(lambda x: np.nanmean(x), axis=0), c='k')

    ax.axhline(1, ls='--', lw=0.5, c='k')

    ax.set_title(alias)
    ax.set_yticks(np.arange(0.6,1.01,0.1))
    ax.set_yticklabels(np.arange(60,101,10))
    ax.margins(0.01)
    ax.set_ylim(0.6, 1.03)
    ax.set_ylabel('% nt identity')

    coords = coords_aln.loc[TE]

    # element cartoon, drawn below the y-range of the axes (hence clip_on=False):
    # five prime LTR, three prime LTR, then the internal sequence
    for x0, x1, shade in [(0, coords['ltr_end'], '0.3'),
                          (coords['ltr_start'], coords['end'], '0.3'),
                          (coords['ltr_end'], coords['ltr_start'], '0.6')]:
        ax.add_patch(Rectangle([x0, 0.32], x1-x0, 0.05, color=shade, zorder=0, clip_on=False))

    # labelled arrows for POL and GAG; both start at 'gag_start'
    for y, stop, orf_label in [(0.4, coords['pol_end'], '$POL$'),
                               (0.45, coords['gag_end'], '$GAG$')]:
        arrow = FancyArrow(coords['gag_start'], y, stop-coords['gag_start'], 0, clip_on=False, zorder=1, width=0.04, head_width=0.04, fc='w', ec='k', lw=0.5,
                           head_length=50, length_includes_head=True)
        ax.add_patch(arrow)
        ax.text(0.5*(stop+coords['gag_start']), y, orf_label, va='center', ha='center', color='k', zorder=2, size=6)

# panel letters
fig.text(0.02, 0.95, 'A', size=18)
fig.text(0.02, 0.47, 'B', size=18)

sns.despine()
plt.savefig('/home/mathieu/mhenault_landrylab/md_ty_expression/fig/fig_final/FigS2.jpg', dpi=300)
plt.show()
plt.close()