In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import cm
from matplotlib.lines import Line2D
from matplotlib.patches import Rectangle
from matplotlib.patches import FancyArrow
from matplotlib.colors import ListedColormap
import seaborn as sns
import re
from os import listdir
import itertools
from progressbar import ProgressBar
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
sns.set(style='ticks', font='Arial')

# Methods sections 2.6 and 2.7
## import annotations and sample descriptions

In [None]:
# import table describing library accessions
samples = pd.read_csv('/home/mathieu/mhenault_landrylab/md_ty_expression/hisat2/SraRunTable.txt', header=None)

# Column 22 is parsed as '<species> <RNA-seq|Ribo-seq> Rep <n>'.
# The experiment is matched with an alternation: the former character class
# '[Ribo|RNA]+' accepted any run of those letters (e.g. 'RRR-seq').
# Raw string so that '\d' reaches the regex engine instead of being an invalid string escape.
_description_re = re.compile(r'(.+) ((?:Ribo|RNA)-seq) (Rep \d)')


def _parse_description(text):
    """Split a library description into its species, experiment and replicate fields."""
    match = _description_re.match(text)
    if match is None:
        # fail with the offending value rather than an opaque AttributeError on None
        raise ValueError(f'unrecognized library description: {text!r}')
    return pd.Series(match.groups(), index=['species','experiment','rep'])


samples = pd.concat([samples, samples[22].apply(_parse_description)], axis=1)
# rows are looked up by the value of the first column in the rest of the notebook
samples.index = samples[0].values

# short aliases for experiment and species
experiment_alias_r = {'RNA-seq':'rna', 'Ribo-seq':'rpf'}
species_alias_r = dict(zip(['S. cerevisiae', 'S. paradoxus', 'S. cerevisiae / S. paradoxus F1 Hybrid'],
                           ['Scer', 'Spar', 'hyb']))
samples['Species'] = samples['species'].apply(lambda x: species_alias_r[x])
samples['Experiment'] = samples['experiment'].apply(lambda x: experiment_alias_r[x])
# 'Rep 1' -> 'rep1' (last character of the replicate field)
samples['Rep'] = samples['rep'].apply(lambda x: f'rep{x[-1]}')

# write the sample sheet (tab-separated, without the row index) into the deseq2 directory
samples[[0,'Species','Experiment','Rep']].to_csv('/home/mathieu/mhenault_landrylab/md_ty_expression/deseq2/samples.tab', sep='\t', index=False)

In [None]:
# import gff annotations fed to Plastid
Annot_plastid = pd.read_csv('/home/mathieu/mhenault_landrylab/md_ty_expression/plastid/annot_ty_plastid.gff', sep='\t', header=None)


def _parse_gff_attributes(field):
    """Turn a 'key=value;key=value' string into a Series indexed by key."""
    # split on the first '=' only, so a value that itself contains '=' stays whole;
    # skip empty items such as the one left by a trailing ';'
    pairs = (item.split('=', 1) for item in field.split(';') if item)
    return pd.Series(dict(pairs))


# column 8 is parsed as ';'-separated key=value pairs and expanded into one column per key
Annot_plastid = pd.concat([Annot_plastid, Annot_plastid[8].apply(_parse_gff_attributes)], axis=1)

# define parent of genes as their ID to be able to group by Parent
def get_parent(x):
    """Derive a parent identifier from an annotation ID.

    Two ID layouts are recognized, tried in this order:

    * ``<prefix>_<2 digits>`` + ``G`` or ``T`` + ``<5 digits>``
    * ``<prefix>_Ty<digit>`` + ``G`` or ``T`` + ``1``

    Parameters
    ----------
    x : str
        Annotation ID.

    Returns
    -------
    str
        ``'<group 1>.<group 2>'`` of the first pattern that matches at the
        start of ``x`` (the ``G``/``T`` letter is dropped), or ``'none'`` when
        neither pattern matches.
    """
    # raw strings: '\w' and '\d' are invalid escapes in ordinary string literals
    patterns = (r'(\w+_\d{2})[GT](\d{5})', r'(\w+_Ty\d)[GT](1)')
    for p in patterns:
        m = re.match(p, x)
        if m:
            return f'{m.group(1)}.{m.group(2)}'
    return 'none'

# label every annotation row with the parent derived from its ID
Annot_plastid['parent'] = Annot_plastid['ID'].map(get_parent)

In [None]:
# parse coordinates of LTRs and CDS per family
coords_path = '/home/mathieu/mhenault_landrylab/md_ty_expression/db/ltr_cds_coords.csv'
# the first CSV column becomes the row index of the table
Coords = pd.read_csv(coords_path, index_col=0)

In [None]:
# define aliases for subgenomes and Ty pseudochromosomes
parent_alias = dict(CBS432='Sp', S288c='Sc')


def _ty_tig_alias(tig):
    """Build '<second field minus its last two characters>_<subgenome alias>' from a '_'-separated name."""
    fields = tig.split('_')
    family = fields[1][:-2]
    return f'{family}_{parent_alias[fields[0]]}'


# every row label of Coords gets an alias, except 'Tsu4' which is skipped
ty_tig_alias = {}
for tig in Coords.index:
    if tig == 'Tsu4':
        continue
    ty_tig_alias[tig] = _ty_tig_alias(tig)

## import results

In [None]:
# parse plastid count vectors; depth of coverage per position
# File names are matched as '<SRR accession>_<S288c|CBS432 feature ID>.<digit>'.
# Raw string so that '\d' and '\.' are regex escapes rather than invalid string escapes;
# '[GT]' replaces '[G|T]', which also accepted a literal '|'.
count_file_re = re.compile(r'(SRR\d{6})_((?:S288c|CBS432)_.+[GT]\d+)\.(\d)')

idx = 0
# Counts[sample][feature ID] -> Series of counts read from one file
Counts = {}
# NOTE(review): max_value is a hard-coded expected number of files
# (presumably 11109 features x 12 samples) -- confirm it matches the data on disk
with ProgressBar(max_value=11109*12) as bar:
    for s in samples.index:
        # define dict to store coverage vectors
        counts = {}
        # get dir
        Dir = f'/home/mathieu/mhenault_landrylab/md_ty_expression/plastid/{s}/'
        for f in listdir(Dir):
            m = count_file_re.match(f)
            if m:
                gene = m.group(2)
                # read_csv(squeeze=True) was removed in pandas 2.0; squeezing the
                # single-column frame afterwards yields the same Series
                counts[gene] = pd.read_csv(f'{Dir}{f}', header=None).squeeze('columns')
                idx += 1
                bar.update(idx)
        Counts[s] = counts

In [None]:
# import count data from plastid; read counts per gene
# one table per (metric, sample) pair, metrics in the outer position so the
# concatenation order is metric-major
count_tables = []
for m, s in itertools.product(['mrna','60nt','Ty1'], samples.index):
    ct = pd.read_csv(f'/home/mathieu/mhenault_landrylab/md_ty_expression/plastid/{s}/{s}.{m}.counts',
                     sep='\t', header=0, index_col=None, comment='#')
    count_tables.append(ct.assign(sample=s, metric=m))
CT = pd.concat(count_tables).reset_index(drop=True)

# gene name = region name up to the first '.'
CT['gene'] = CT['region_name'].map(lambda name: name.split('.')[0])
# copy the sample-level annotations onto every count row
sample_of_row = CT['sample']
for f in ('species','experiment','rep'):
    CT[f] = samples.loc[sample_of_row, f].values

# counts per unit of length, then scaled so that each sample sums to 1e6
# NOTE(review): the scaling groups by sample only, so the three metrics of a
# sample are pooled in the denominator -- confirm this is intended
CT['RPK'] = CT['counts'].div(CT['length'])
for _, sample_df in CT.groupby('sample'):
    CT.loc[sample_df.index, 'TPM'] = 1e6*sample_df['RPK']/sample_df['RPK'].sum()
CT['log_TPM'] = np.log2(CT['TPM'].add(1))

# produce output tables of raw for DEseq
#for m, df in CT.groupby('metric'):
#    dat = df.pivot_table(index='region_name', columns='sample', values='counts', aggfunc=lambda x: int(x))
#    dat.to_csv(f'/home/mathieu/mhenault_landrylab/md_ty_expression/deseq2/{m}.counts.tab',sep='\t')

In [None]:
# reformat counts into table with TPM for each replicate
# wide table: one row per species / replicate / metric / region, one log2-TPM column per experiment
tpm_wide = CT.pivot_table(index=['species', 'rep', 'metric', 'region_name'], columns='experiment', values='log_TPM')
TPM = tpm_wide.reset_index()
TPM['region_name'] = TPM['region_name'].map(lambda name: name.split('.')[0])

# log2 TPM of the 'mrna' metric only, addressable as tpm.loc[(gene, sample)]
mrna_rows = CT.loc[CT['metric']=='mrna']
gene_sample_index = pd.MultiIndex.from_frame(mrna_rows[['gene','sample']])
tpm = pd.Series(mrna_rows['log_TPM'].values, index=gene_sample_index)

In [None]:
# import results from DESeq

DESEQ = []
path = '/home/mathieu/mhenault_landrylab/md_ty_expression/deseq2/'
# file names encode subgenome, metric, lfc type and predictor;
# raw string so that '\.' and '\w' are regex escapes rather than invalid string escapes
deseq_file_re = re.compile(r'results\.(S288c|CBS432)\.(\w+)\.([a-z_]+)\.(.+)\.csv')
# listdir order is arbitrary; sort so the row order of DESEQ is reproducible
for f in sorted(listdir(path)):
    m = deseq_file_re.match(f)
    if m:
        subg, metric, lfc, pred = m.groups()

        deseq = pd.read_csv(f'{path}{f}', index_col=0)
        # keep the row labels as a column and tag every row with the file-name fields
        deseq['gene'] = deseq.index
        deseq['subgenome'] = subg
        deseq['metric'] = metric
        deseq['lfc'] = lfc
        deseq['pred'] = pred

        DESEQ.append(deseq)
if not DESEQ:
    # pd.concat([]) would fail with a message that does not name the directory
    raise FileNotFoundError(f'no DESeq result table found in {path}')
DESEQ = pd.concat(DESEQ).reset_index(drop=True)
# -log10 adjusted p-value
DESEQ['log_padj'] = -np.log10(DESEQ['padj'])
# gene name = row label up to the first '.'
DESEQ['gene'] = DESEQ['gene'].apply(lambda x: x.split('.')[0])

In [None]:
# import read counts corrected by DESeq normalization
def _load_norm_counts(subg):
    """Read the normalized mRNA count table of one subgenome and return it in long format."""
    wide = pd.read_csv(f'/home/mathieu/mhenault_landrylab/md_ty_expression/deseq2/norm_counts.{subg}.mrna.csv', index_col=0)
    # one row per (row label, sample) pair
    long = wide.reset_index().melt(id_vars='index', var_name='sample', value_name='count_norm')
    long['gene'] = long['index'].map(lambda name: name.split('.')[0])
    # log10 of the normalized count plus one
    long['log_count_norm'] = np.log10(long['count_norm']+1)
    long['subgenome'] = subg
    return long


counts_norm = pd.concat([_load_norm_counts(subg) for subg in ['S288c','CBS432']])

# copy sample-level annotations onto every row (new column name, column of samples)
for new_col, sample_col in (('species','species'), ('exp','experiment'), ('Rep','Rep')):
    counts_norm[new_col] = samples.loc[counts_norm['sample'], sample_col].values

In [None]:
# import data from DESeq of the RNA-seq only; split deseq analysis

DESEQ_RNA_SPLIT = []
path = '/home/mathieu/mhenault_landrylab/md_ty_expression/deseq2/'
# file names encode subgenome, lfc type and predictor;
# raw string so that '\.' and '\w' are regex escapes rather than invalid string escapes
rnaseq_file_re = re.compile(r'results\.(\w+)\.([a-z_]+)\.(.+)\.rnaseq\.csv')
# listdir order is arbitrary; sort so the row order of DESEQ_RNA_SPLIT is reproducible
for f in sorted(listdir(path)):
    m = rnaseq_file_re.match(f)
    if m:
        subgenome, lfc, pred = m.groups()

        deseq = pd.read_csv(f'{path}{f}', index_col=0)
        # keep the row labels as a column and tag every row with the file-name fields
        deseq['gene'] = deseq.index
        deseq['subgenome'] = subgenome
        deseq['lfc'] = lfc
        deseq['pred'] = pred

        DESEQ_RNA_SPLIT.append(deseq)
if not DESEQ_RNA_SPLIT:
    # pd.concat([]) would fail with a message that does not name the directory
    raise FileNotFoundError(f'no RNA-seq DESeq result table found in {path}')
DESEQ_RNA_SPLIT = pd.concat(DESEQ_RNA_SPLIT).reset_index(drop=True)
# -log10 adjusted p-value
DESEQ_RNA_SPLIT['log_padj'] = -np.log10(DESEQ_RNA_SPLIT['padj'])
# gene name = row label up to the first '.'
DESEQ_RNA_SPLIT['gene'] = DESEQ_RNA_SPLIT['gene'].apply(lambda x: x.split('.')[0])

# plots

In [None]:
# plotting dicts
# --- species ---
species_encoding = dict(Spar=2, Scer=0, hyb=1)
species_order = ['S. cerevisiae', 'S. cerevisiae / S. paradoxus F1 Hybrid', 'S. paradoxus']
species_alias = {sp: alias for sp, alias in zip(species_order, ('Sc', 'Sc x Sp', 'Sp'))}
species_color = {sp: color for sp, color in zip(species_order, ('#0925CF', '#07CB18', '#F7A220'))}

Parent_alias = dict(CBS432='par', S288c='cer')

# --- experiments ---
exp_encoding = dict(rna=0, rpf=1)
exp_alias = {'RNA-seq':'mRNA', 'Ribo-seq':'RPF'}
exp_cmap = ListedColormap(['skyblue', 'steelblue'])
exp_markers = {'RNA-seq':'+', 'Ribo-seq':'x'}

# --- replicates ---
rep_encoding = dict(rep1=0, rep2=1)
rep_cmap = ListedColormap(['0.2', '0.8'])

# --- TE families ---
te_order = ['S288c_Ty1T1', 'S288c_Ty2T1', 'S288c_Ty3T1', 'S288c_Ty4T1',
            'CBS432_Ty1T1', 'CBS432_Ty3T1', 'CBS432_Ty5T1']
# same list without CBS432_Ty1T1
Te_order = [te for te in te_order if te != 'CBS432_Ty1T1']


def _te_alias(te):
    """'<strain>_<family>T1' -> '<family>_<subgenome alias>'."""
    strain, tail = te.split('_')[0], te.split('_')[1]
    return f'{tail[:-2]}_{Parent_alias[strain]}'


te_alias = {te: _te_alias(te) for te in te_order}
# one tab10 color per family, in te_order
te_color = {te: cm.tab10(rank) for rank, te in enumerate(te_order)}
te_marker = dict(zip(te_order, ('s', 'o', '^', 'D', 's', '^', 'v')))
# filled green markers for the S288c families, fully transparent fill for the CBS432 ones
te_mfc = {te: ('#07CB18' if te.startswith('S288c') else (0,0,0,0)) for te in te_order}

# --- count metrics ---
metric_order = ['60nt', 'mrna', 'Ty1']
metric_alias = {'60nt': '60 nt', 'mrna': 'full CDS', 'Ty1': 'full CDS/Ty1-$POL$'}

In [None]:
# plot heatmaps of coverage depth z-score per bin, along reference sequences. Fig S1
# order the samples by experiment, then species rank (species_encoding), then replicate
samples_ranked = samples.assign(se=samples['Species'].map(lambda sp: species_encoding[sp]))
S = samples_ranked.sort_values(by=['Experiment','se','Rep']).index
# row labels of the heatmaps: '<experiment alias> <species alias> <replicate>'
S_labels = samples.loc[S].apply(
    lambda row: f'{exp_alias[row["experiment"]]} {species_alias[row["species"]]} {row["Rep"]}',
    axis=1)
# bin width, in positions of the count vectors
wdw = 75

# define function to standardize slice of counts
def norm_slice(x):
    """Return the z-scores of ``x``; all-NaN (same index) when its standard deviation is zero."""
    spread = x.std()
    if spread == 0:
        # constant slice: a z-score is undefined
        return pd.Series(np.nan, index=x.index)
    centered = x - x.mean()
    return centered / spread
    
fig = plt.figure(figsize=[7.8,9])
gs = plt.GridSpec(nrows=3, ncols=3, hspace=0.6, wspace=0.8, left=0.12, right=0.96, top=0.96, bottom=0.08)

# number of heatmap rows (one per sample); replaces the literal 12 for the
# y ticks and the placement of the TPM text column
n_samples = len(S)

# one panel per TE family, filling the 3x3 grid row by row
for te, (i,j) in zip(te_order, itertools.product(range(3), range(3))):

    # stack the count vectors of every sample and assign each position to a wdw-wide bin
    Dat = []
    for s in S:
        dat = pd.DataFrame(Counts[s][te])
        dat.columns = ['count']
        dat['sample'] = s
        dat['pos'] = range(dat.shape[0])
        dat['bin'] = pd.cut(dat['pos'], bins=pd.interval_range(start=0, end=dat.shape[0]+wdw, freq=wdw, closed='left'))
        Dat.append(dat)
    Dat = pd.concat(Dat)
    # mean count per sample and bin (rows ordered as S), then z-score within each sample
    dat_norm = Dat.groupby(['sample','bin']).mean().reset_index().pivot_table(index='sample', columns='bin', values='count').loc[S]
    dat_norm = dat_norm.apply(norm_slice, axis=1)

    ax = fig.add_subplot(gs[i,j])

    HM = ax.imshow(dat_norm, cmap='viridis', interpolation='none', aspect='auto', vmin=-1.4, vmax=3.5)
    ax.set_yticks(range(n_samples))
    ax.set_yticklabels(S_labels, size=6)

    # add TPM: one value per heatmap row, written to the right of the axes
    ax.text(1.02, (n_samples+0.25)/n_samples, 'log2\nTPM', va='bottom', size=6, transform=ax.transAxes)
    for pos, s in enumerate(S):
        lc = tpm.loc[(te,s)]
        ax.text(1.02, (n_samples-0.5-pos)/n_samples, f'{lc:.2f}', va='center', size=6, transform=ax.transAxes)

    # plot TE symbol below heatmap

    base_below = dat_norm.shape[0]-1
    unit_below = 0.05*dat_norm.shape[0]

    # linear map from sequence coordinates to heatmap x coordinates:
    # mrna_start -> left edge of the image, cds_end -> right edge
    xrange = ax.axis()[:2]
    coords = Coords.loc[te]
    slope = (xrange[1]-xrange[0])/(coords['cds_end']-coords.loc['mrna_start'])
    intercept = xrange[1]-(coords['cds_end']*slope)
    get_data_coord = np.vectorize(lambda x: x*slope+intercept)
    # first six columns of Coords, converted and then addressed by position below
    coords = get_data_coord(coords[:6])

    # add patch for five prime LTR
    ltr = Rectangle([-0.5, base_below+7*unit_below], coords[0], unit_below, color='0.3', zorder=0, clip_on=False)
    ax.add_patch(ltr)
    # add patch for internal sequence
    internal = Rectangle([coords[0], base_below+7*unit_below], coords[1]-coords[0], unit_below, color='0.6', zorder=0, clip_on=False)
    ax.add_patch(internal)
    # add line for POL
    # NOTE(review): the POL/GAG labels are placed at half the arrow length measured
    # from x=0, not from the arrow start coords[4] -- confirm this offset is intended
    pol = FancyArrow(coords[4], base_below+6*unit_below, coords[1]-coords[4], 0, clip_on=False, zorder=1, width=unit_below, head_width=unit_below, fc='w', ec='k', lw=0.5,
                     length_includes_head=True)
    ax.add_patch(pol)
    ax.text(0.5*(coords[1]-coords[4]), base_below+6*unit_below+0.05, '$POL$', va='center', ha='center', color='k', zorder=2, size=6)

    if 'Ty5' not in te:
        # add line for GAG (skipped for Ty5)
        gag = FancyArrow(coords[4], base_below+4.5*unit_below, coords[2]-coords[4], 0, clip_on=False, zorder=1, width=unit_below, head_width=unit_below, fc='w', ec='k', lw=0.5,
                         length_includes_head=True)
        ax.add_patch(gag)
        ax.text(0.5*(coords[2]-coords[4]), base_below+4.5*unit_below+0.05, '$GAG$', va='center', ha='center', color='k', zorder=2, size=6)

    ax.set_title(te_alias[te], size=10)
    # x ticks every 1500 positions; tick locations are in bins, hence the division by the
    # bin width (previously a literal 75 that had to be kept in sync with wdw by hand).
    # These calls fully define ticks and labels, so the earlier relabelling of the
    # automatic ticks (which needed a canvas draw and integer-parsable labels) is dropped.
    ax.set_xticks(np.arange(0,4501,1500)/wdw)
    ax.set_xticklabels(np.arange(0,4501,1500), size=8)

# shared colorbar, drawn from the last heatmap (all panels use the same vmin/vmax)
cbar_ax = fig.add_axes([0.55, 0.2, 0.25, 0.02])
cbar = plt.colorbar(HM, cax=cbar_ax, orientation='horizontal', label='coverage depth\nz-score')
cbar.outline.set_visible(False)

plt.savefig('/home/mathieu/mhenault_landrylab/md_ty_expression/fig/fig_final/FigS1.jpg', dpi=300)
plt.show()
plt.close()

In [None]:
# run PCA on count data
# species whose libraries are used for each subgenome: the parent species plus the hybrid
hybrid_species = 'S. cerevisiae / S. paradoxus F1 Hybrid'
subg_samples = {
    'S288c': ['S. cerevisiae', hybrid_species],
    'CBS432': ['S. paradoxus', hybrid_species],
}
# scaler and PCA objects reused (re-fitted) for every panel
pca = PCA()
sc = StandardScaler()

In [None]:
# plot PCA results - Fig S3
# NOTE(review): this cell saves to 'pca_rna_rpf.jpg' while the mRNA volcano-plot cell
# (commented 'Fig S2') saves to 'FigS3.jpg' -- confirm the figure numbering
fig = plt.figure(figsize=[7,5.5])
gs = plt.GridSpec(ncols=3, nrows=2, wspace=0.7, hspace=0.8, bottom=0.10, left=0.1, top=0.81, right=0.95)

# one panel per (subgenome, metric): rows are subgenomes, columns are metrics
for (subg_ax, subg), (metric_ax, m) in itertools.product(enumerate(['S288c', 'CBS432']), enumerate(metric_order)):

    sub_samples = subg_samples[subg]

    # regions of this subgenome, for this metric, in the libraries of the relevant species
    dat = CT.loc[(CT['region_name'].apply(lambda x: subg in x))
                 & (CT['metric']==m)
                 & (CT['species'].isin(sub_samples))]

    # regions x samples table of log2 TPM (identity aggfunc: one value per cell is expected)
    dat = dat.pivot_table(index='region_name', columns='sample', values='log_TPM', aggfunc=lambda x: x)

    # readable sample labels, joined with single spaces (the former backslash-continued
    # f-string embedded the source indentation in every label)
    annot_columns = [' '.join([species_alias[samples.loc[i,"species"]],
                               exp_alias[samples.loc[i,"experiment"]],
                               samples.loc[i,"rep"]]) for i in dat.columns]
    # samples as rows, each region standardized across samples
    datn = pd.DataFrame(sc.fit_transform(dat.T.values), index=annot_columns)

    pca_fit = pca.fit(datn)
    # principal-component scores, indexed by sample
    res = pd.DataFrame(pca_fit.transform(datn), index=dat.columns)


    ax = fig.add_subplot(gs[subg_ax, metric_ax])

    # plot paired samples: a line joins the libraries of the same species and replicate
    for (s, rep), df in samples.groupby(['species','rep']):
        if np.all(df.index.isin(res.index)):
            ax.plot(res.loc[df.index, 0], res.loc[df.index, 1], color=species_color[s])
    # plot experiment symbols
    for (s, exp), df in samples.groupby(['species','experiment']):
        if np.all(df.index.isin(res.index)):
            ax.scatter(res.loc[df.index, 0], res.loc[df.index, 1], color=species_color[s], marker=exp_markers[exp], s=48)

    ax.set_title(f'{species_alias[subg_samples[subg][0]]} subgenome\n{metric_alias[m]}', size=14)
    ax.set_xlabel(f'PC1 ({(pca_fit.explained_variance_ratio_[0]*100):.1f} %)')
    ax.set_ylabel(f'PC2 ({(pca_fit.explained_variance_ratio_[1]*100):.1f} %)')
    ax.margins(0.1)

# invisible axes above the panels that only carry the legend
ax = fig.add_axes([0.2,0.93,0.6,0.04])
ax.axis('off')
legend_elms = [Line2D([0],[0],color=species_color[s], label=species_alias[s]) for s in species_alias] +\
[Line2D([0],[0], color='w', marker=exp_markers[exp], mec='k', ms=8, label=exp_alias[exp]) for exp in exp_alias]
ax.legend(handles=legend_elms, ncol=5, loc=8, frameon=False)

sns.despine()
plt.savefig('/home/mathieu/mhenault_landrylab/md_ty_expression/fig/pca_rna_rpf.jpg', dpi=300)
plt.show()
plt.close()

In [None]:
# scatterplot of log TPM for mRNA-seq against RPF - Fig 5A
fig = plt.figure(figsize=[7.8,3.5])
gs = plt.GridSpec(ncols=3, nrows=2, height_ratios=[2,5], hspace=0.2, wspace=0.4, left=0.1, right=0.95, top=0.97, bottom=0.15)
transparent = (0,0,0,0)

# one panel per metric, on the bottom row of the grid
for i, m in enumerate(metric_order):

    metric_tpm = TPM.loc[TPM['metric']==m]
    ax = fig.add_subplot(gs[1,i])
    # background: every region in light grey
    ax.plot(metric_tpm['RNA-seq'], metric_tpm['Ribo-seq'], color='0.9', marker='o', ms=5, lw=0, zorder=0)

    # TE families on top: S288c families filled with the species color, CBS432 families hollow
    te_tpm = metric_tpm.loc[metric_tpm['region_name'].isin(Te_order)]
    for (te, s), te_points in te_tpm.groupby(['region_name','species']):
        if 'S288c' in te:
            mfc = species_color[s]
        if 'CBS432' in te:
            mfc = transparent
        ax.scatter(te_points['RNA-seq'], te_points['Ribo-seq'], color=mfc, edgecolors=species_color[s], marker=te_marker[te], s=48, zorder=2)

    ax.set_title(metric_alias[m], size=14)

    ax.set_xlabel('log2 TPM mRNA')
    if i == 0:
        ax.set_ylabel('log2 TPM RPF')
    #plot diagonal (reviewer comment)
    # from the origin up to the smaller of the two upper axis limits
    x_lo, x_hi, y_lo, y_hi = ax.axis()
    max_diag = min(x_hi, y_hi)
    ax.plot([0, max_diag], [0, max_diag], c='k', lw=0.5, zorder=1)

# top row: invisible axes holding the two legends
ax = fig.add_subplot(gs[0, :])
ax.axis('off')
# family legend: black fill for the first four families of te_order, transparent for the others
te_fill = dict(zip(te_order, ['k']*4 + [(1,1,1,0)]*3))
legend_elms = [Line2D([0], [0], c='w', marker=te_marker[te], mec='k', mfc=te_fill[te], ms=8, label=te_alias[te])
               for te in Te_order]
L1 = plt.legend(handles=legend_elms, ncol=6, loc=9, bbox_to_anchor=(0.5, 1), frameon=False, columnspacing=0.5)
# species legend; creating it replaces L1 on the axes, so L1 is re-added afterwards
legend_elms = [Line2D([0], [0], c='w', marker='s', mfc=species_color[s], ms=12, label=species_alias[s])
               for s in species_order]
L2 = plt.legend(handles=legend_elms, ncol=3, loc=8, bbox_to_anchor=(0.5, 0), frameon=False)
ax.add_artist(L1)

#fig.text(0.02, 0.78, 'A', size=24, fontweight='bold')

sns.despine()
plt.savefig('/home/mathieu/mhenault_landrylab/md_ty_expression/fig/fig_final/Fig5A.jpg', dpi=300)
plt.show()
plt.close()

In [None]:
# stripcharts of deseq-normalized read counts per family - Fig 5B
fig = plt.figure(figsize=[7.8,4])
gs = plt.GridSpec(ncols=6, nrows=1, wspace=0.7, left=0.07, right=0.85, top=0.88, bottom=0.17)

# panel column of each family
te_ax = dict(zip(Te_order, range(6)))
# x position of each species: the parent species at 0, the hybrid at 1
sp_offset = dict(zip(species_order, [0,1,0]))
# replicates are shifted left/right of the species position
rep_offset = {'rep1':-0.2, 'rep2':0.2}
exp_plus = {'RNA-seq':'+', 'Ribo-seq':'x'}

for te, df in counts_norm.loc[counts_norm['gene'].isin(Te_order)].groupby('gene'):
    ax = fig.add_subplot(gs[te_ax[te]])

    # x tick label for each species position, collected while plotting
    xlabel = {}
    # one marker per species / experiment / replicate
    for (s, exp, rep), df1 in df.groupby(['species','exp','Rep']):
        c = species_color[s]
        co = sp_offset[s]
        em = exp_plus[exp]
        ro = rep_offset[rep]
        xlabel[co] = species_alias[s]


        ax.scatter(co+ro, df1['log_count_norm'].iloc[0], color=c, marker=em, s=48)

    # vertical line joining the two experiments of the same species and replicate
    for (s, rep), df1 in df.groupby(['species','Rep']):
        c = species_color[s]
        co = sp_offset[s]
        ro = rep_offset[rep]

        ax.plot(np.repeat(co+ro, 2), df1['log_count_norm'], lw=0.5, c=c)

    if te_ax[te] == 0:
        # log_count_norm is computed as log10(count_norm + 1); the label previously said log2
        ax.set_ylabel('log10 normalized read counts')

    ax.set_xticks(range(2))
    ax.set_xticklabels([xlabel[0], xlabel[1]], rotation=90)
    ax.margins(0.2)
    ax.set_ylim(-0.2, 4.6)
    ax.text(0.5, 1.08, te_alias[te], ha='center', size=14, transform=ax.transAxes)

# zero-width invisible axes at the right edge, used only to anchor the legend
ax = fig.add_axes([0.85,0,0,1])
ax.axis('off')
legend_elms = [Line2D([0],[0], color='w', marker=exp_markers[exp], mec='k', ms=8, label=exp_alias[exp]) for exp in exp_alias]
ax.legend(handles=legend_elms, loc=6, bbox_to_anchor=(1,0.5), frameon=False)

#fig.text(0.02, 0.92, 'B', size=24, fontweight='bold')

sns.despine()
plt.savefig('/home/mathieu/mhenault_landrylab/md_ty_expression/fig/fig_final/Fig5B.jpg', dpi=300)
plt.show()
plt.close()

In [None]:
# volcano plots for the two contrasts of TE efficiency from deseq models - Fig 6

fig = plt.figure(figsize=[7.8,7])
gs = plt.GridSpec(ncols=3, nrows=3, height_ratios=[0,1,1], hspace=1, wspace=0.5, left=0.1, right=0.97, top=0.98, bottom=0.1)

# x label of each row of panels
xlabel_by_subg = {'S288c': 'log2 fold change\n(RPF/mRNA : hyb/cer)',
                  'CBS432': 'log2 fold change\n(RPF/mRNA : hyb/par)'}

# columns are metrics, rows (below the legend row) are subgenomes
for (i, m), (j, subg) in itertools.product(enumerate(metric_order), enumerate(['S288c', 'CBS432'])):

    ax = fig.add_subplot(gs[j+1, i])
    # shrunken log fold changes of the experiment x species interaction term
    selected = ((DESEQ['metric']==m) &
                (DESEQ['subgenome']==subg) &
                (DESEQ['lfc']=='shrink_lfc') &
                (DESEQ['pred']=='Experimentrpf.Specieshyb'))
    df = DESEQ.loc[selected]
    is_te = df['gene'].isin(Te_order)

    # background: all other genes in light grey
    background = df.loc[~is_te]
    ax.plot(background['log2FoldChange'], background['log_padj'], color='0.9', marker='o', ms=5, lw=0, zorder=0)

    # TE families on top
    for te, te_df in df.loc[is_te].groupby('gene'):
        ax.scatter(te_df['log2FoldChange'], te_df['log_padj'],
                   marker=te_marker[te], color=te_mfc[te], edgecolor='#07CB18',
                   s=48, zorder=1)

    # significance thresholds and the zero fold-change line
    for p in (0.05, 0.01):
        ax.axhline(-np.log10(p), ls='--', lw=1, color='0.5', zorder=0)
    ax.axvline(0, ls='--', lw=1, color='0.5', zorder=0)

    ax.set_title(f'{species_alias[subg_samples[subg][0]]} subgenome\n{metric_alias[m]}', size=14)
    ax.set_ylim(-0.2, 3.4)
    ax.set_xlim(-2,2)
    ax.set_ylabel('-log10 adjusted p-value')
    ax.set_xlabel(xlabel_by_subg[subg])

    #add inset: the same data over a wider range
    ax_ins = ax.inset_axes((0.7,0.75,0.25,0.25))
    ax_ins.plot(df['log2FoldChange'], df['log_padj'], color='k', marker='o', ms=1, lw=0)
    # correct xtick labels
    ax_ins.set_xlim(-3.5, 3.5)
    ax_ins.set_ylim(-0.5, 8)
    ax_ins.set_xticks([-3, 3])
    ax_ins.set_yticks([0, 5])
    ax_ins.set(facecolor=(1,1,1,0))
    sns.despine(ax=ax_ins)

    #ax.text(1.1, 0.5, pred_alias[pred], ha='left', va='center', size=14, rotation=90, transform=ax.transAxes)
    ax.margins(0.1)


# top row: invisible axes holding the family legend
ax = fig.add_subplot(gs[0, :])
ax.axis('off')
legend_families = ['S288c_Ty1T1', 'S288c_Ty2T1', 'CBS432_Ty3T1', 'CBS432_Ty5T1']
legend_elms = [Line2D([0], [0], c='w', marker=te_marker[te], mfc=te_mfc[te], mec='#07CB18', ms=8, label=te_alias[te])
               for te in legend_families]
L1 = plt.legend(handles=legend_elms, ncol=4, loc=9, bbox_to_anchor=(0.5, 1), frameon=False, columnspacing=1)

# panel letters for the two rows
fig.text(0.02, 0.83, 'A', size=24, fontweight='bold')
fig.text(0.02, 0.39, 'B', size=24, fontweight='bold')

sns.despine()
plt.savefig('/home/mathieu/mhenault_landrylab/md_ty_expression/fig/fig_final/Fig6.jpg', dpi=300)
plt.show()
plt.close()

In [None]:
# volcano plots for the two contrasts of mRNA expression from deseq - Fig S2

fig = plt.figure(figsize=[7,4])
gs = plt.GridSpec(ncols=2, nrows=2, height_ratios=[0,1], hspace=0.7, wspace=0.5, left=0.1, right=0.96, top=0.98, bottom=0.18)

# x label of each panel
xlabel_by_subg = {'S288c': 'log2 fold change\n(hyb/cer)',
                  'CBS432': 'log2 fold change\n(hyb/par)'}

# one panel per subgenome, on the bottom row of the grid
for i, subg in enumerate(['S288c','CBS432']):

    ax = fig.add_subplot(gs[1, i])
    # shrunken log fold changes for this subgenome, S288c_Ty5T1 excluded
    selected = ((DESEQ_RNA_SPLIT['subgenome']==subg)
                & (DESEQ_RNA_SPLIT['lfc']=='shrink_lfc')
                & (DESEQ_RNA_SPLIT['gene']!='S288c_Ty5T1'))
    df = DESEQ_RNA_SPLIT.loc[selected]
    is_te = df['gene'].isin(Te_order)

    # background: all other genes in light grey
    background = df.loc[~is_te]
    ax.plot(background['log2FoldChange'], background['log_padj'], color='0.9', marker='o', ms=5, lw=0, zorder=0)

    # TE families on top
    for te, te_df in df.loc[is_te].groupby('gene'):
        ax.scatter(te_df['log2FoldChange'], te_df['log_padj'],
                   marker=te_marker[te], color=te_mfc[te], edgecolor='#07CB18',
                   s=48, zorder=1)

    # significance thresholds and the zero fold-change line
    for p in (0.05, 0.01):
        ax.axhline(-np.log10(p), ls='--', lw=1, color='0.5', zorder=0)
    ax.axvline(0, ls='--', lw=1, color='0.5', zorder=0)

    #ax.set_title(metric_alias[m], size=14)
    ax.set_ylim(-0.2, 3.4)
    ax.set_xlim(-2,2)
    ax.set_ylabel('-log10 adjusted p-value')
    ax.set_title(f'{species_alias[subg_samples[subg][0]]} subgenome', size=14)
    ax.set_xlabel(xlabel_by_subg[subg])

    #add inset: the same data over a wider range
    ax_ins = ax.inset_axes((0.7,0.75,0.25,0.25))
    ax_ins.plot(df['log2FoldChange'], df['log_padj'], color='k', marker='o', ms=1, lw=0)
    # correct xtick labels
    ax_ins.set_xlim(-3.5, 3.5)
    ax_ins.set_ylim(-2, 40)
    ax_ins.set_xticks([-4, 4])
    ax_ins.set_yticks([0, 35])
    ax_ins.set(facecolor=(1,1,1,0))
    sns.despine(ax=ax_ins)

# top row: invisible axes holding the family legend
ax = fig.add_subplot(gs[0, :])
ax.axis('off')
legend_elms = [Line2D([0], [0], c='w', marker=te_marker[te], mfc=te_mfc[te], mec='#07CB18', ms=8, label=te_alias[te])
               for te in Te_order]
L1 = plt.legend(handles=legend_elms, ncol=3, loc=9, bbox_to_anchor=(0.5, 1), frameon=False, columnspacing=1)

sns.despine()
plt.savefig('/home/mathieu/mhenault_landrylab/md_ty_expression/fig/fig_final/FigS3.jpg', dpi=300)
plt.show()
plt.close()