In [None]:
import os
from itertools import combinations
#import pickle
import numpy as np
import pandas as pd
import scipy
from ete3 import Tree
from intervaltree import IntervalTree, Interval
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# suppress pandas warning
pd.options.mode.chained_assignment = None

In [None]:
# Plot template
import plotly.io as pio

# Register a "custom" layout template holding the shared figure defaults
# (font, mirrored axes, axis ranges that include zero, default figure size).
pio.templates["custom"] = go.layout.Template(
    layout=go.Layout(
        font_size=20,
        font_family='Arial',
        xaxis_mirror=True, yaxis_mirror=True,
        xaxis_rangemode='tozero', yaxis_rangemode='tozero',
        height=500, width=800
    )
)

# Make "simple_white" combined with the custom template the default for all figures,
# and keep its colorway so per-phylum colours can be assigned from it later.
pio.templates.default = "simple_white+custom"
template_colors = pio.templates[pio.templates.default]['layout']['colorway']

In [None]:
figures_dir = '../fig'

## Load and prepare data

### Data paths

In [None]:
euk_tree_nwk = "../data/euk_species_tree.nwk"
euk_tree_phyla10_nwk = "../data/euk_species_tree_phyla10.nwk"
euk_tree_phyla10_anc_intron_ratio_nwk = '../data/euk_species_tree_phyla10_intron_ratio_anc.nwk'
euk_tree_phyla10_anc_intron_length_nwk = '../data/euk_species_tree_phyla10_intron_length_anc.nwk'
euk_tree_phyla10_anc_n_introns_nwk = '../data/euk_species_tree_phyla10_n_introns_anc.nwk'
snakemake_result_dir = "RESULT_deposit"
gene_structure_stats_tsv = os.path.join(snakemake_result_dir, 'all_species/intron_lengths.stats')
per_species_dir = os.path.join(snakemake_result_dir, "per_species")
all_species_dir = os.path.join(snakemake_result_dir, "all_species")

### Gene structure stats

In [None]:
gene_structure_stats_df = pd.read_csv(gene_structure_stats_tsv, sep='\t', index_col=0)
gene_structure_stats_df = gene_structure_stats_df.query('Dataset == "all"')

In [None]:
def binom_species(s):
    """Reduce a species identifier to its ``genus_species`` form.

    Only the first two underscore-separated tokens of ``s`` are kept; any
    further tokens are dropped.  A ``ValueError`` is raised when ``s`` has
    fewer than two tokens.
    """
    genus, species, *_rest = s.split('_')
    return '_'.join((genus, species))

gene_structure_stats_df['species'] = gene_structure_stats_df.index.to_series().apply(binom_species)

In [None]:
gene_structure_stats_df['full_name'] = gene_structure_stats_df.index
gene_structure_stats_df.index = gene_structure_stats_df['species']

In [None]:
# Add a log10-transformed copy of each size-like feature, stored as 'log_<feature>'
for feat in ['Genome_size', 'Mean', 'Mean_total_exon_length_per_transcript',
             'Mean_total_intron_length_per_transcript', 'Mean_intron_ratio']:
    gene_structure_stats_df[f'log_{feat}'] = np.log10(gene_structure_stats_df[feat])

In [None]:
phyla_counts = pd.DataFrame(gene_structure_stats_df['phylum'].value_counts())
phyla_counts.head(10)

In [None]:
# phyla with >= 10 species
# The counts column is selected by position: its label is 'count' in pandas >= 2.0
# but the original column name in earlier releases, so a query on 'count' is
# tied to one pandas version.
phyla10 = phyla_counts.loc[phyla_counts.iloc[:, 0] >= 10].index.to_series()

In [None]:
gene_structure_stats_phyla10_df = gene_structure_stats_df.loc[gene_structure_stats_df['phylum'].isin(phyla10)]

In [None]:
phy_order = ['Chordata', 'Arthropoda', 'Mollusca', 'Cnidaria', 'Nematoda', 'Ascomycota', 'Streptophyta']
phy_colors = dict(zip(phy_order, template_colors))

In [None]:
# sort by phylum order
# `phylum` becomes a categorical whose categories follow `phy_order`, so sorting
# on it arranges the rows in that order; a new frame is bound instead of mutating in place.
gene_structure_stats_phyla10_df = gene_structure_stats_phyla10_df.assign(
    phylum=pd.Categorical(gene_structure_stats_phyla10_df['phylum'], phy_order)
).sort_values(by='phylum')

In [None]:
# Create Supp table S1
# Source column -> header used in the exported table; the dict order is the column order.
table_s1_headers = {
    'species': 'species',
    'full_name': 'species name in ENSEMBL',
    'group': 'Taxonomic group',
    'phylum': 'Phylum',
    'class': 'Class',
    'Min': 'Min intron length',
    'Max': 'Max intron length',
    'Mean': 'Mean intron length',
    'STD': 'Intron length SD',
    'Q50': 'Median intron length',
    'Transcripts_count': 'Number of transcripts',
    'Transcripts_containing_introns': 'Number of transcripts containing introns',
    'Mean_per_transcript': 'Mean number of introns per transcript',
    'Mean_intron_ratio': 'Mean intron ratio',
    'Mean_total_intron_length_per_transcript': 'Mean total intron length per transcript',
    'Mean_total_exon_length_per_transcript': 'Mean total exon length per transcript',
    'Genome_size': 'Genome size',
}
table_s1 = gene_structure_stats_df[list(table_s1_headers)].rename(columns=table_s1_headers)
table_s1.to_csv('../data/Table_S1.tsv', sep='\t', index=False)

### Phylogenetic tree

In [None]:
euk_tree = Tree(euk_tree_nwk)
euk_tree_phyla10 = Tree(euk_tree_phyla10_nwk)
euk_tree_phyla10_anc_intron_ratio = Tree(euk_tree_phyla10_anc_intron_ratio_nwk, format=1)
euk_tree_phyla10_anc_intron_length = Tree(euk_tree_phyla10_anc_intron_length_nwk, format=1)
euk_tree_phyla10_anc_n_introns = Tree(euk_tree_phyla10_anc_n_introns_nwk, format=1)

In [None]:
# Names obtained by iterating the phyla10 tree, in iteration order,
# plus a name -> position lookup used to order matrices like the tree.
euk_tree_phyla10_species = [leaf.name for leaf in euk_tree_phyla10]
species_order_dict = {name: position for position, name in enumerate(euk_tree_phyla10_species)}

In [None]:
euk_tree_phyla10_counts = gene_structure_stats_phyla10_df.loc[euk_tree_phyla10_species,'phylum'].astype(str).value_counts(sort=False)

### KS distance matrices

In [None]:
def read_ks_distance_matrix(ks_distance_matrix_tsv):
    """Load a pairwise KS-distance matrix and align it with the phyla10 tree.

    Row and column labels are shortened with ``binom_species``; the matrix is
    then restricted to the species in ``euk_tree_phyla10_species`` and both
    axes are ordered by ``species_order_dict``.

    NOTE(review): the ``.loc`` selection raises ``KeyError`` when a tree
    species is absent from the matrix -- confirm every tree species is
    expected to be present in the TSV.
    """
    matrix = pd.read_csv(ks_distance_matrix_tsv, sep='\t', index_col=0)

    # full species names -> binomial names, on both axes
    matrix.index = matrix.index.to_series().apply(binom_species)
    matrix.columns = matrix.columns.to_series().apply(binom_species)

    # keep only the species that appear in the phylogeny
    matrix = matrix.loc[euk_tree_phyla10_species, euk_tree_phyla10_species]

    # order rows, then columns, by position on the phylogeny
    matrix = matrix.sort_index(key=lambda labels: labels.map(species_order_dict))
    ordered_columns = sorted(matrix.columns, key=lambda label: species_order_dict[label])
    return matrix[ordered_columns]

In [None]:
intron_ratio_ks_distance_matrix_tsv = os.path.join(all_species_dir, 'intron_ratios_KS_dist.tsv')
intron_ratio_ks_distance_matrix = read_ks_distance_matrix(intron_ratio_ks_distance_matrix_tsv)
intron_len_ks_distance_matrix_tsv = os.path.join(all_species_dir, 'intron_lengths_KS_dist.tsv')
intron_len_ks_distance_matrix = read_ks_distance_matrix(intron_len_ks_distance_matrix_tsv)
intron_count_ks_distance_matrix_tsv = os.path.join(all_species_dir, 'intron_counts_KS_dist.tsv')
intron_count_ks_distance_matrix = read_ks_distance_matrix(intron_count_ks_distance_matrix_tsv)

## Representative species
Gene structure feature distributions for species representing each phylum

In [None]:
# list of representative species, one from each phylum
represent_spec = [
    'mus_musculus', 'drosophila_melanogaster', 'stylophora_pistillata',
    'solanum_lycopersicum', 'biomphalaria_glabrata', 'caenorhabditis_elegans',
    'tuber_melanosporum',
]
# sort by phylum order
represent_spec.sort(key=lambda sp: phy_order.index(gene_structure_stats_df.loc[sp, 'phylum']))
# matching full names (the original index labels stored in 'full_name'), same order
represent_spec_full = [gene_structure_stats_df.loc[sp, 'full_name'] for sp in represent_spec]

In [None]:
def attributes_to_dict(attr):
  """Parse a ``;``-separated ``key=value`` attribute string into a dict.

  Each field is split on its first ``=`` only, so a value that itself contains
  ``=`` is kept whole.  Empty fields (for example after a trailing ``;``) are
  skipped instead of raising ``IndexError``, surrounding whitespace of a field
  is ignored, and a field without ``=`` maps to an empty string.
  """
  attr_dict = {}
  for field in attr.split(';'):
    field = field.strip()
    if not field:
      continue
    key, _, value = field.partition('=')
    attr_dict[key] = value
  return attr_dict

def attributes_to_column(row, attr):
  """Return the value of attribute ``attr`` parsed from ``row['attributes']``, or ``None`` if it is absent."""
  return attributes_to_dict(row['attributes']).get(attr)

def gff_to_df(gff, type=None):
  """Read a GFF3 file into a DataFrame and add ``ID``, ``Parent`` and ``length`` columns.

  Parameters
  ----------
  gff : path or file-like
      Tab-separated GFF3 input; lines starting with ``#`` are ignored.
  type : str, optional
      When given, only rows whose ``type`` column equals this value are kept.

  Returns
  -------
  pandas.DataFrame
      The nine standard GFF columns plus ``ID`` and ``Parent`` (taken from the
      attributes column, ``None`` where missing) and ``length``
      (``end - start + 1``).
  """
  headers = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
  gff_df = pd.read_csv(gff, sep='\t', names=headers, comment='#')
  if type:
    # explicit copy: the columns added below must not target a slice of the unfiltered frame
    gff_df = gff_df.query('type == @type').copy()
  # parse every attributes string once and reuse the result for both extracted columns
  attr_dicts = gff_df['attributes'].map(attributes_to_dict)
  gff_df['ID'] = attr_dicts.map(lambda attrs: attrs.get('ID'))
  gff_df['Parent'] = attr_dicts.map(lambda attrs: attrs.get('Parent'))
  gff_df['length'] = gff_df['end'] - gff_df['start'] + 1
  return gff_df

def get_stats(gff_df):
  """Compute per-intron and per-transcript intron statistics from a feature table.

  Parameters
  ----------
  gff_df : pandas.DataFrame
      Feature table with the columns ``type``, ``ID``, ``Parent`` and
      ``length``.  Rows of type ``"intron"`` are attributed to the transcript
      named in ``Parent``; rows of type ``"mRNA"`` define the transcripts and
      their lengths.

  Returns
  -------
  tuple of three pandas.Series
      * log10 of every intron length,
      * number of introns per transcript (0 for mRNAs without intron rows),
      * intron ratio per transcript: total intron length divided by the
        transcript length that remains after subtracting the introns.
  """
  introns_df = gff_df.query('type == "intron"')
  trans_df = gff_df.query('type == "mRNA"')

  # intron lengths
  intron_lengths = introns_df['length']

  # intron counts; mRNAs without any intron row are appended with a count of 0.
  # They are sorted (by their string form) so the output order is reproducible,
  # which iterating a set does not guarantee.
  introns_per_trans = introns_df.groupby('Parent')['type'].count()
  single_exon_trans = sorted(set(trans_df['ID']) - set(introns_per_trans.index), key=str)
  introns_per_trans = pd.concat([introns_per_trans, pd.Series(0, index=single_exon_trans)])

  # total intron length per transcript (0 for intron-less transcripts)
  intron_len = pd.concat([
    introns_df.groupby('Parent')['length'].sum(),
    pd.Series(0, index=single_exon_trans),
  ]).rename('length')
  # transcript lengths keyed by transcript ID, built without touching the input frame
  trans_len = trans_df.set_index('ID')['length'].rename('transcript_length')

  # intron ratio = intron length / (transcript length - intron length)
  per_trans = pd.concat([intron_len, trans_len], axis=1)
  per_trans['exons_length'] = per_trans['transcript_length'] - per_trans['length']
  per_trans['intron_ratio'] = per_trans['length']/per_trans['exons_length']

  return np.log10(intron_lengths), introns_per_trans, per_trans['intron_ratio']

In [None]:
# For every representative species, read its intron-annotated GFF3 and keep the
# tuple returned by get_stats (log10 intron lengths, introns per transcript,
# intron ratios), keyed by the binomial species name.
stats = {}
for sp_full, sp in zip(represent_spec_full[::-1], represent_spec[::-1]):
    gff = os.path.join(per_species_dir, sp_full, 'annotation.canon.introns.gff3')
    gff_df = gff_to_df(gff)
    stats[sp] = get_stats(gff_df)

In [None]:
# generate intron ratio distributions for representative species
# Panels: a) intron ratio, b) log10 intron length, c) introns per transcript;
# each panel holds one horizontal half-violin per representative species.
fig = make_subplots(rows=1, cols=3,
                    shared_yaxes=True,
                    horizontal_spacing=0.02,
                    subplot_titles=['<b>a</b>','<b>b</b>','<b>c</b>'])
for sp in represent_spec[::-1]:
    intron_lengths, introns_per_trans, intron_ratios = stats[sp]

    phy = gene_structure_stats_df.loc[sp]['phylum']
    sp_name = sp.replace('_',' ')
    sp_name = f"<i>{sp_name[0].upper()}{sp_name[1:]}</i> ({phy})"
    # colour by phylum through the shared mapping instead of a position in the
    # colorway, so the colours stay right if the species list changes
    sp_color = phy_colors[phy]
    # intron ratios and intron counts above 50 are left out of the violins
    fig.add_trace(go.Violin(x=intron_ratios.loc[intron_ratios <= 50], name=sp_name, line_color=sp_color), row=1, col=1)
    fig.add_trace(go.Violin(x=intron_lengths, name=sp_name, line_color=sp_color), row=1, col=2)
    fig.add_trace(go.Violin(x=introns_per_trans.loc[introns_per_trans <= 50], name=sp_name, line_color=sp_color), row=1, col=3)

fig.update_traces(orientation='h', side='positive', width=3, points=False)
fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=False, height=600, width=1000,
                  showlegend=False,
                )

fig['layout']['xaxis']['title'] = 'Intron ratio'
fig['layout']['xaxis2']['title'] = 'Intron length (log<sub>10</sub> bp)'
fig['layout']['xaxis3']['title'] = 'Number of introns'
fig.update_yaxes(tickfont={'size': 14}, ticks='')
# `title_font` replaces the deprecated `titlefont` axis property
fig.update_xaxes(tickfont={'size': 14}, title_font={'size': 16})
fig.update_annotations(font_size=18, y=1.05)   # subplot titles

fig.show()

In [None]:
# Write the current figure to figures_dir as fig_1.jpg and fig_1.pdf
fig_name = 'fig_1'
for ext in ('.jpg', '.pdf'):
    fig_path = os.path.join(figures_dir, fig_name + ext)
    fig.write_image(fig_path, scale=5)

## Gene structure feature means
Per-species means of various features for whole genomes.

In [None]:
# 2x6 subplot grid of double-width panels: three in the top row (columns 1, 3, 5)
# and two in the bottom row (columns 2, 4), titled a-e.
fig = make_subplots(rows=2, cols=6,
                    shared_yaxes=True,
                   horizontal_spacing=0.02, vertical_spacing=0.25,
                    specs=[
        [{'colspan': 2}, None, {'colspan': 2}, None, {'colspan': 2},None],
        [None, {'colspan': 2}, None, {'colspan': 2}, None, None]
          ],
                    subplot_titles=['<b>a</b>','<b>b</b>','<b>c</b>','<b>d</b>','<b>e</b>']
                   )
                   


def strip_trace(data, color, name):
    """Build a strip-plot trace: a Box showing all points (size 3, in ``color``) with a transparent outline and fill."""
    transparent = 'rgba(0,0,0,0)'
    return go.Box(
        x=data,
        name=name,
        boxpoints='all',
        pointpos=0,
        marker={'color': color, 'size': 3},
        line={'color': transparent},
        fillcolor=transparent,
    )

features = ['log_Mean_intron_ratio', 'log_Mean_total_exon_length_per_transcript', 'log_Mean_total_intron_length_per_transcript',
           'log_Mean', 'Mean_per_transcript']

# (row, col) anchor of the panel that receives each feature, in the same order as `features`
panel_positions = [(1, 1), (1, 3), (1, 5), (2, 2), (2, 4)]
for feat, (r, c) in zip(features, panel_positions):
    # one strip of per-species means per phylum, in reversed phylum order
    for phyl in phy_order[::-1]:
        phyl_data = gene_structure_stats_phyla10_df.query('phylum == @phyl')
        fig.add_trace(strip_trace(phyl_data[feat], phy_colors[phyl], phyl), row=r, col=c)

fig.update_layout(height=600, width=1000, showlegend=False)
fig['layout']['xaxis']['title'] = 'Intron ratio (log<sub>10</sub>)'
fig['layout']['xaxis2']['title'] = 'Total exon length (log<sub>10</sub> bp)'
fig['layout']['xaxis3']['title'] = 'Total intron length (log<sub>10</sub> bp)'
fig['layout']['xaxis4']['title'] = 'Intron length (log<sub>10</sub> bp)'
fig['layout']['xaxis5']['title'] = 'Number of introns'
fig.update_yaxes(tickfont={'size': 14}, ticks='')
# `title_font` replaces the deprecated `titlefont` axis property
fig.update_xaxes(tickfont={'size': 14}, title_font={'size': 14})
fig.update_annotations(font_size=18)   # subplot titles
fig.for_each_annotation(lambda a: a.update(y=a.y+0.03))

fig.show()

In [None]:
# Write the current figure to figures_dir as fig_2.jpg and fig_2.pdf
fig_name = 'fig_2'
for ext in ('.jpg', '.pdf'):
    fig_path = os.path.join(figures_dir, fig_name + ext)
    fig.write_image(fig_path, scale=5)

## Phylogenetic conservation
Based on pairwise similarity between gene structure feature distributions, using the KS statistic as a measure.

In [None]:
def plot_ks_heatmap_with_tree_dist(ks_distance_matrix, title=''):
    """Draw KS distances and tree distances as the two triangles of one heatmap.

    For every species pair one cell keeps the KS distance from
    ``ks_distance_matrix`` and the mirrored cell holds half the
    ``euk_tree_phyla10`` distance between the two species; the diagonal is
    left blank.  Coloured bars along both axes and dummy legend entries mark
    the phyla.  ``title`` becomes the figure title.  Returns the plotly figure.
    """
    ks_part = ks_distance_matrix.copy()
    tree_part = ks_distance_matrix.copy()

    # split the cells of each pair: KS distance stays at [first, second],
    # the tree distance goes to the mirrored cell [second, first]
    for first, second in combinations(ks_part.index, 2):
        ks_part.loc[second, first] = np.nan
        tree_part.loc[first, second] = np.nan
        tree_part.loc[second, first] = euk_tree_phyla10.get_distance(second, first)/2
    # blank the diagonal in both layers
    for sp in ks_part.index:
        ks_part.loc[sp, sp] = np.nan
        tree_part.loc[sp, sp] = np.nan

    # create figure: KS layer first, tree-distance layer added as a second heatmap
    fig = px.imshow(ks_part, color_continuous_scale='RdBu_r', height=600, width=800)
    fig.add_heatmap(z=tree_part, colorscale='Greys', reversescale=True, zmin=0, zmax=1800)
    no_ticks = dict(showticklabels=False, tickmode="array", tickvals=[], title_text='', visible=True)
    fig.update_xaxes(**no_ticks)
    fig.update_yaxes(**no_ticks)
    fig.update_layout(coloraxis_colorbar_x=1, coloraxis_colorbar_title='KS<br>distance', legend_x=1.3)
    fig.data[1].colorbar = dict(title='Divergence<br>time (Myr)', orientation='h', y=-0.15, x=0.45, len=1.15)

    # add phyla color bars in figure margins, each as long as its species count
    offset = 0
    for phy, n_species in euk_tree_phyla10_counts.items():
        bar_style = dict(line_color=phy_colors[phy], opacity=1, line_width=7)
        fig.add_shape(type="line", x0=-15, y0=offset, x1=-15, y1=offset+n_species, **bar_style)
        fig.add_shape(type="line", x0=offset, y0=-15, x1=offset+n_species, y1=-15, **bar_style)
        offset += n_species

    # add phyla legend (empty traces that only contribute legend entries)
    for phy in phy_order:
        fig.add_scatter(x=[np.nan], y=[np.nan], mode='lines', name=phy, line_width=5, line_color=phy_colors[phy])

    fig.update_layout(title=title)
    return fig

In [None]:
fig = plot_ks_heatmap_with_tree_dist(intron_ratio_ks_distance_matrix)
fig.show()

In [None]:
# Write the current figure to figures_dir as fig_4a.jpg and fig_4a.pdf
fig_name = 'fig_4a'
for ext in ('.jpg', '.pdf'):
    fig_path = os.path.join(figures_dir, fig_name + ext)
    fig.write_image(fig_path, scale=5)

In [None]:
fig = plot_ks_heatmap_with_tree_dist(intron_len_ks_distance_matrix)
fig.show()

In [None]:
# Write the current figure to figures_dir as fig_S3a.jpg and fig_S3a.pdf
fig_name = 'fig_S3a'
for ext in ('.jpg', '.pdf'):
    fig_path = os.path.join(figures_dir, fig_name + ext)
    fig.write_image(fig_path, scale=5)

In [None]:
fig = plot_ks_heatmap_with_tree_dist(intron_count_ks_distance_matrix)
fig.show()

In [None]:
# Write the current figure to figures_dir as fig_S4a.jpg and fig_S4a.pdf
fig_name = 'fig_S4a'
for ext in ('.jpg', '.pdf'):
    fig_path = os.path.join(figures_dir, fig_name + ext)
    fig.write_image(fig_path, scale=5)

## Evolutionary trajectories

In [None]:
def lin_eq(x1,y1,x2,y2):
    """Return the function of the straight line through (x1, y1) and (x2, y2).

    The returned callable maps an x value to the y value on that line.  The
    two points must differ in x, otherwise the slope computation divides by zero.
    """
    slope = (y2-y1)/(x2-x1)
    intercept = y1 - slope*x1

    def line(x):
        return slope*x+intercept

    return line

def tree_to_intervals(tree, column, groups={}):
    """Turn the branches of ``tree`` into intervals that carry a linear value model.

    Every non-root node contributes the interval from its parent's distance
    to ``tree`` up to its own distance to ``tree`` (zero-length branches are
    skipped).  The interval's data is a dict with

    * ``'model'``: the ``lin_eq`` line from the parent's value to the node's
      value.  Values of internal nodes are read from the node name; values of
      leaves are read from ``column`` of ``gene_structure_stats_phyla10_df``.
    * ``'group'``: the first key of ``groups`` whose members' common ancestor
      is the parent node or satisfies ``parent in mrca``; otherwise a label
      built from the two interval bounds.

    ``groups`` maps a group name to the list of names handed to
    ``tree.get_common_ancestor``.  Returns the populated ``IntervalTree``.
    """
    # find MRCAs of groups
    group_mrcas = {gr: tree.get_common_ancestor(members) for gr, members in groups.items()}

    branches_iv = IntervalTree()
    for node in tree.traverse():
        if node.is_root():
            continue
        parent = node.up
        parent_height = parent.get_distance(tree)
        node_height = node.get_distance(tree)
        # zero-length branch: there is no interval to interpolate over
        if node_height == parent_height:
            continue

        # value at the node end of the branch
        if node.is_leaf():
            sp = node.name
            node_val = gene_structure_stats_phyla10_df.query('species == @sp')[column].iloc[0]
        else:
            node_val = float(node.name)
        parent_val = float(parent.name)

        # assign branch to group; branches outside every group get a per-branch label
        branch_group = next(
            (gr for gr, mrca in group_mrcas.items() if parent == mrca or parent in mrca),
            f'{parent_height}-{node_height}',
        )

        branches_iv[parent_height:node_height] = {
            'group': branch_group,
            'model': lin_eq(parent_height,parent_val,node_height,node_val),
        }
    return branches_iv

In [None]:
def median_at_point(branches_iv, h):
    """Return, per branch group, the median model value over the branches found at ``h``.

    ``branches_iv[h]`` supplies the branches; each branch's ``data['model']``
    is evaluated at ``h`` and the results are pooled by ``data['group']``.
    The result is a dict ``{group: median}``, empty when no branch is found.
    """
    values_by_group = {}
    for branch in branches_iv[h]:
        values_by_group.setdefault(branch.data['group'], []).append(branch.data['model'](h))
    return {group: pd.Series(vals).median() for group, vals in values_by_group.items()}

In [None]:
# Species of each phylum, keeping only the names for which `name in euk_tree_phyla10` holds
phyla10_species = {
    phyl: [s for s in members if s in euk_tree_phyla10]
    for phyl, members in gene_structure_stats_phyla10_df.groupby(by='phylum', observed=True)['species'].apply(list).items()
}

In [None]:
tree_iv_intron_ratio = tree_to_intervals(euk_tree_phyla10_anc_intron_ratio, 'log_Mean_intron_ratio', phyla10_species)

In [None]:
# Sample the per-group median of the intron-ratio models at heights 0, 5, ..., 1595.
# Rows are collected as plain records, so a height at which no branch exists
# contributes no rows instead of failing on the column assignment.
res = pd.DataFrame(
    [
        {'group': group, 'mean_at_height': value, 'height': h}
        for h in np.arange(0,1600,5)
        for group, value in median_at_point(tree_iv_intron_ratio, h).items()
    ],
    columns=['group', 'mean_at_height', 'height'],
)

In [None]:
fig = px.line(res, x='height', y='mean_at_height', color='group')
# traces named after a phylum take that phylum's colour; every other trace is
# drawn in black and kept out of the legend
for trace in fig.data:
    is_phylum = trace.name in phy_colors
    trace.line['color'] = phy_colors[trace.name] if is_phylum else 'black'
    if not is_phylum:
        trace.showlegend = False
fig.update_xaxes(title='Time since LECA (Myr)')
fig.update_yaxes(title='Intron ratio (log<sub>10</sub>)')
fig.update_layout(legend_title_text='', plot_bgcolor = "rgba(0,0,0,0)", paper_bgcolor = "rgba(0,0,0,0)")

fig.show()

In [None]:
# Write the current figure to figures_dir as fig_3b.jpg and fig_3b.pdf
fig_name = 'fig_3b'
for ext in ('.jpg', '.pdf'):
    fig_path = os.path.join(figures_dir, fig_name + ext)
    fig.write_image(fig_path, scale=5)

In [None]:
tree_iv_intron_length = tree_to_intervals(euk_tree_phyla10_anc_intron_length, 'log_Mean', phyla10_species)

# Sample the per-group median of the intron-length models at heights 0, 5, ..., 1595.
# Rows are collected as plain records, so a height at which no branch exists
# contributes no rows instead of failing on the column assignment.
res = pd.DataFrame(
    [
        {'group': group, 'mean_at_height': value, 'height': h}
        for h in np.arange(0,1600,5)
        for group, value in median_at_point(tree_iv_intron_length, h).items()
    ],
    columns=['group', 'mean_at_height', 'height'],
)

fig = px.line(res, x='height', y='mean_at_height', color='group')
# traces named after a phylum take that phylum's colour; every other trace is
# drawn in black and kept out of the legend
for trace in fig.data:
    if trace.name in phy_colors:
        trace.line['color'] = phy_colors[trace.name]
    else:
        trace.line['color'] = 'black'
        trace.showlegend = False
fig.update_xaxes(title='Time since LECA (Myr)')
fig.update_yaxes(title='Intron length (log<sub>10</sub> bp)')
fig.update_layout(legend_title_text='', plot_bgcolor = "rgba(0,0,0,0)", paper_bgcolor = "rgba(0,0,0,0)")

fig.show()

In [None]:
# Write the current figure to figures_dir as fig_S1b.jpg and fig_S1b.pdf
fig_name = 'fig_S1b'
for ext in ('.jpg', '.pdf'):
    fig_path = os.path.join(figures_dir, fig_name + ext)
    fig.write_image(fig_path, scale=5)

In [None]:
tree_iv_n_introns = tree_to_intervals(euk_tree_phyla10_anc_n_introns, 'Mean_per_transcript', phyla10_species)

# Sample the per-group median of the intron-count models at heights 0, 5, ..., 1595.
# Rows are collected as plain records, so a height at which no branch exists
# contributes no rows instead of failing on the column assignment.
res = pd.DataFrame(
    [
        {'group': group, 'mean_at_height': value, 'height': h}
        for h in np.arange(0,1600,5)
        for group, value in median_at_point(tree_iv_n_introns, h).items()
    ],
    columns=['group', 'mean_at_height', 'height'],
)

fig = px.line(res, x='height', y='mean_at_height', color='group')
# traces named after a phylum take that phylum's colour; every other trace is
# drawn in black and kept out of the legend
for trace in fig.data:
    if trace.name in phy_colors:
        trace.line['color'] = phy_colors[trace.name]
    else:
        trace.line['color'] = 'black'
        trace.showlegend = False
fig.update_xaxes(title='Time since LECA (Myr)')
fig.update_yaxes(title='Number of introns')
fig.update_layout(legend_title_text='', plot_bgcolor = "rgba(0,0,0,0)", paper_bgcolor = "rgba(0,0,0,0)")

fig.show()

In [None]:
# Write the current figure to figures_dir as fig_S2b.jpg and fig_S2b.pdf
fig_name = 'fig_S2b'
for ext in ('.jpg', '.pdf'):
    fig_path = os.path.join(figures_dir, fig_name + ext)
    fig.write_image(fig_path, scale=5)

## Intron length and Genome size

In [None]:
pgls_tsv = 'genome_size_intron_length_PGLS.tsv'
pgls_df = pd.read_csv(pgls_tsv, sep='\t', index_col=0)

In [None]:
pgls_df

In [None]:
def abline(intercept, slope, data, color):
    """Return a line trace of ``y = intercept + slope*x`` drawn across the range of ``data``.

    The line runs from ``data.min() - 0.5`` to ``data.max() + 0.5``, is drawn
    in ``color`` with width 3, and is excluded from the legend.
    """
    x = [data.min()-0.5, data.max()+0.5]
    y = [intercept + p*slope for p in x]
    return go.Scatter(
        x=x,
        y=y,
        mode='lines',
        line_color=color,
        line_width=3,
        showlegend=False,
    )

In [None]:
pgls_intron_len_df = pgls_df.iloc[:,0:5]
pgls_intron_len_df

In [None]:
fig = px.scatter(gene_structure_stats_phyla10_df, x='log_Genome_size', y='log_Mean', color='phylum', color_discrete_map=phy_colors,
                height=500, width=800)
fig.update_xaxes(range=(6.5,10.5), title='Genome size (log<sub>10</sub> bp)')
fig.update_yaxes(title='Mean intron length (log<sub>10</sub> bp)')

# add PGLS model lines
for phylum, pgls_row in pgls_df.iterrows():
    # a line is drawn only when the slope p-value is at most 0.05
    if pgls_row['p slope log_Mean'] > 0.05:
        continue
    # an intercept whose p-value exceeds 0.05 is drawn as 0
    intercept = 0 if pgls_row['p intercept log_Mean'] > 0.05 else pgls_row['intercept log_Mean']
    phyl_data = gene_structure_stats_phyla10_df.query('phylum == @phylum')['log_Genome_size']
    fig.add_trace(abline(intercept, pgls_row['slope log_Mean'], phyl_data, phy_colors[phylum]))

fig.show()

In [None]:
# Write the current figure to figures_dir as fig_6.jpg and fig_6.pdf
fig_name = 'fig_6'
for ext in ('.jpg', '.pdf'):
    fig_path = os.path.join(figures_dir, fig_name + ext)
    fig.write_image(fig_path, scale=5)

In [None]:
pgls_intron_ratio_df = pgls_df.iloc[:,10:]
pgls_intron_ratio_df

In [None]:
fig = px.scatter(gene_structure_stats_phyla10_df, x='log_Genome_size', y='log_Mean_intron_ratio', color='phylum', color_discrete_map=phy_colors,
                height=500, width=800)
fig.update_xaxes(range=(6.5,10.5), title='log<sub>10</sub> Genome size (bp)')
fig.update_yaxes(title='log<sub>10</sub> Mean intron ratio')

# add PGLS model lines
for phylum, pgls_row in pgls_df.iterrows():
    # a line is drawn only when the slope p-value is at most 0.05
    if pgls_row['p slope Mean_intron_ratio_log'] > 0.05:
        continue
    # an intercept whose p-value exceeds 0.05 is drawn as 0
    intercept = 0 if pgls_row['p intercept Mean_intron_ratio_log'] > 0.05 else pgls_row['intercept Mean_intron_ratio_log']
    phyl_data = gene_structure_stats_phyla10_df.query('phylum == @phylum')['log_Genome_size']
    fig.add_trace(abline(intercept, pgls_row['slope Mean_intron_ratio_log'], phyl_data, phy_colors[phylum]))

fig.show()

In [None]:
# Write the current figure to figures_dir as fig_S5.jpg and fig_S5.pdf
fig_name = 'fig_S5'
for ext in ('.jpg', '.pdf'):
    fig_path = os.path.join(figures_dir, fig_name + ext)
    fig.write_image(fig_path, scale=5)

## BUSCO analysis
Analyzing orthology groups based on BUSCOs

In [None]:
classes = ['Mammalia', 'Aves', 'Lepidosauria', 'Actinopteri', 'Insecta', 'Arachnida', 'Sordariomycetes', 'Magnoliopsida']

### Load evolutionary change data

In [None]:
def load_change_tsv(class_name):
    """Load ``<class_name>_change.tsv`` from ``all_species_dir`` and add derived columns.

    Rows with missing values are dropped.  The added columns are the sequence
    and gene-structure change totals divided by ``Total_tree_length``
    (``*_norm``), a 10-bin equal-width binning of the normalised sequence
    change, and the ranks of both normalised values.
    """
    raw_df = pd.read_csv(
        os.path.join(all_species_dir, f'{class_name}_change.tsv'),
        sep='\t', index_col=0,
    ).dropna()
    return raw_df.assign(
        Total_sequence_change_norm=lambda d: d['Total_sequence_change']/d['Total_tree_length'],
        Total_gene_structure_change_norm=lambda d: d['Total_gene_structure_change']/d['Total_tree_length'],
        Total_sequence_change_norm_bin=lambda d: pd.cut(d['Total_sequence_change_norm'], bins=10, labels=False),
        Total_sequence_change_norm_rank=lambda d: d['Total_sequence_change_norm'].rank(),
        Total_gene_structure_change_norm_rank=lambda d: d['Total_gene_structure_change_norm'].rank(),
    )

In [None]:
class_change_data = {class_name: load_change_tsv(class_name) for class_name in classes}

### Correlation between sequence and gene structure evolution
Per class

In [None]:
def mad(s):
    """Median absolute deviation of ``s``, computed after dropping missing values."""
    observed = s.dropna().values
    return scipy.stats.median_abs_deviation(observed)

def mad_outliers(s, x=3):
    """Flag the values of ``s`` lying more than ``x`` MADs away from its median.

    Returns a boolean Series aligned with ``s`` in which ``True`` marks an outlier.
    NOTE(review): when the MAD is 0 both bounds equal the median, so every
    value different from the median is flagged -- confirm this is intended.
    """
    center = s.median()
    half_width = x*mad(s)
    return s.lt(center - half_width) | s.gt(center + half_width)

def filter_mad_outliers(group_df, column, x=10):
    """Return the rows of ``group_df`` whose ``column`` value is not a MAD outlier.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Frame to filter (for example one group of a ``groupby``).
    column : str
        Column whose values are tested with ``mad_outliers``.
    x : float, default 10
        Outlier threshold in MADs, forwarded to ``mad_outliers``.
    """
    return group_df.loc[~mad_outliers(group_df[column], x=x)]

In [None]:
colors = px.colors.qualitative.Plotly
fig = go.Figure()
for class_idx, (class_name, class_change_df) in enumerate(class_change_data.items()):
    # divide sequence conservation into 10 bins, numbered 1-10
    class_change_df['Total_sequence_change_norm_rank_bin'] = pd.cut(
        class_change_df['Total_sequence_change_norm_rank'], bins=10, labels=False
    ) + 1
    # remove MAD outliers per bin, then drop the bin level that the groupby adds to the index
    class_change_df = (
        class_change_df
        .groupby('Total_sequence_change_norm_rank_bin')
        .apply(filter_mad_outliers, column = 'Total_gene_structure_change_norm')
        .droplevel(0)
    )
    # calculate mean and STE per bin
    bin_means = (
        class_change_df[['Total_gene_structure_change_norm', 'Total_sequence_change_norm_rank_bin']]
        .groupby('Total_sequence_change_norm_rank_bin')
        .agg(
            mean=('Total_gene_structure_change_norm', 'mean'),
            stderr=('Total_gene_structure_change_norm', 'sem'),
        )
    )
    bin_means['bin'] = bin_means.index

    fig.add_scatter(
        x=bin_means['bin'], y=bin_means['mean'],
        error_y={'type':'data', 'array':bin_means['stderr'], 'visible':True},
        name=class_name, marker_color=colors[class_idx],
    )
fig.update_xaxes(title="Protein Sequence Evolutionary Rate", range=(0.5,10.5), tickvals=list(range(1,11)))
fig.update_yaxes(title="Gene structure<br>Evolutionary Rate")
fig.update_layout(height=500)
fig.show()

In [None]:
# Write the current figure to figures_dir as fig_5a.jpg and fig_5a.pdf
fig_name = 'fig_5a'
for ext in ('.jpg', '.pdf'):
    fig_path = os.path.join(figures_dir, fig_name + ext)
    fig.write_image(fig_path, scale=5)

### Correlation of conservation ranks between classes
For all class pairs, compute the correlations of sequence and gene structure conservation ranks

In [None]:
class_pair_corr_df = pd.DataFrame(np.nan, index=classes, columns=classes)

In [None]:
class_pairs = list(combinations(classes,2))
# For every class pair, rank the entries shared by both classes and store the
# squared Pearson correlation of the two rank vectors (rounded to 2 decimals):
# gene structure at [class1, class2], sequence at [class2, class1].
for class1, class2 in class_pairs:
    for score_column, row_label, col_label in (
        ('Total_gene_structure_change_norm', class1, class2),   # gene structure
        ('Total_sequence_change_norm', class2, class1),         # sequence
    ):
        score_df = pd.concat(
            [class_change_data[class1][score_column], class_change_data[class2][score_column]],
            axis=1, join='inner',
        )
        rank_df = score_df.rank()
        x = rank_df.iloc[:,0]
        y = rank_df.iloc[:,1]
        r2 = round(scipy.stats.pearsonr(x,y).statistic**2,2)
        class_pair_corr_df.loc[row_label, col_label] = r2

In [None]:
# Under diagonal - sequence; above diagonal - gene structure

fig = px.imshow(class_pair_corr_df, text_auto=True, height=600, width=600)

# Outline the two triangles and the diagonal. The red path encloses the cells
# below the diagonal, the orange path the cells above it, and the black path
# runs along the diagonal itself. The fixed coordinates (-0.7 to 7.7) span
# cell positions 0..7, i.e. they match a matrix of 8 classes.
fig.update_layout(shapes=[
    {'type': 'path', 'path': 'M -0.7 -0.7 L 7.7 7.7 L -0.7 7.7 Z', 'fillcolor': 'rgba(0,0,0,0)', 'line_color': 'red', 'line_width': 3, 'opacity': 0.6},
    {'type': 'path', 'path': 'M -0.7 -0.7 L 7.7 -0.7 L 7.7 7.7 Z', 'fillcolor': 'rgba(0,0,0,0)', 'line_color': 'orange', 'line_width': 3, 'opacity': 0.6},
    {'type': 'path', 'path': 'M -0.7 -0.7 L 7.7 7.7 L', 'line_color': 'black', 'line_width': 3, 'opacity': 1}
])
fig.update_layout(coloraxis_colorbar=dict(title="R<sup>2</sup>"))
fig.show()

In [None]:
# Write the current figure to figures_dir as fig_5b.jpg and fig_5b.pdf
fig_name = 'fig_5b'
for ext in ('.jpg', '.pdf'):
    fig_path = os.path.join(figures_dir, fig_name + ext)
    fig.write_image(fig_path, scale=5)

In [None]:
# Stack the per-class change tables into one frame, tagging every row with its class.
# Note: the 'Class' column is written into the frames held in class_change_data
# themselves, so those frames carry it after this cell has run.
table_s6_list = []
for class_name, class_data in class_change_data.items():
    class_data['Class'] = class_name
    table_s6_list.append(class_data)
table_s6 = pd.concat(table_s6_list)

In [None]:
table_s6 = table_s6[['Class', 'Total_tree_length', 'Total_sequence_change',
       'Total_gene_structure_change', 'Total_sequence_change_norm',
       'Total_gene_structure_change_norm']]
table_s6.index.name = 'BUSCO ID'

In [None]:
table_s6.to_csv('../data/BUSCO_change_stats.tsv', sep='\t')