In [1]:
# Import packages 
import pandas as pd 
import json 
from scipy import stats
import matplotlib as mpl

# 1 Import RNA and Riboseq data 

This notebook converts the RNA-seq and Ribo-seq data (from Sjannie Lefevre) into a format that can be used to plot the four conditions and the two methods together in one plot per gene (`gene_id`). The plots show normalized data (values were normalized to the normoxia group). The conversion was performed with pandas; the plots were created with seaborn. 


In [2]:
# IMPORT DATAFRAMES
# TMM data: both matrices are tab-separated and carry their column names in the first row.
rna = pd.read_csv(
    '/home/maggy/WholeGenomeBisulphiteSequencing/Riboseq/1_TMM_input_data/rna_genes_featurecounts_edgeR_TMM.matrix',
    sep='\t',
    header=0,
)
ribo = pd.read_csv(
    '/home/maggy/WholeGenomeBisulphiteSequencing/Riboseq/1_TMM_input_data/rpf_genes_featurecounts_edgeR_TMM.matrix',
    sep='\t',
    header=0,
)

# 2 Concat the RNA and the Ribo dataframe 

In [3]:
# Merge the RNA and the Ribo matrix into a single dataframe.
# Step 1: keep only the part of every column name that follows 'seq_', so that
#         both dataframes end up with identical column names.
for frame in (rna, ribo):
    frame.columns = [label.split('seq_')[-1] for label in frame.columns]
# Step 2: tag every row with the method it came from, so the origin of each
#         measurement stays identifiable after the merge.
rna, ribo = rna.assign(kind='rna'), ribo.assign(kind='ribo')
# Step 3: stack both dataframes (columns are sorted alphabetically by sort=True).
df = pd.concat([ribo, rna], sort=True)

# Preview the first three rows that contain no NaN at all.
# NOTE: the result is only displayed, not assigned -- `df` itself keeps its NaN rows.
df.dropna().head(3)


Unnamed: 0,24hR50,24hR51,24hR53,24hR56,24hR59,24hR64,3hR37,3hR38,3hR40,3hR42,...,N24,N25,N26,N27,N28,N29,gene_id,kind,product,symbol
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.096377,ccar_ua01-g10,ribo,Complement C1q-like protein 4 (C1q and tumor n...,C1QL4_MOUSE
3,4.317758,2.850874,5.467047,3.691205,2.243332,4.232491,1.427969,3.719796,1.891722,2.032647,...,0.844708,1.242982,1.660518,1.931696,1.620286,1.734783,ccar_ua01-g1000,ribo,Apoptosis regulator BAX,BAX_BOVIN
4,72.207608,140.548092,101.88587,107.967751,103.69178,130.822441,98.351358,63.236536,88.354567,75.462014,...,183.905104,156.71927,114.575752,77.267851,110.873841,103.605108,ccar_ua01-g1001,ribo,Small ribosomal subunit protein uS17 (40S ribo...,RS11_RAT


# 3 Import the DMR geneid and filter the big df 

In [4]:
# Load the DMR gene ids (JSON object: name -> collection of gene ids)
with open('/home/maggy/WholeGenomeBisulphiteSequencing/GO_analysis/1_prepare_data_for_GO_analysis/dmr_unique_geneids.json') as handle:
    dmr_unique_geneids = json.load(handle)

# Build one sub-dataframe per entry of the DMR dict, keeping only the rows of
# `df` whose gene_id is listed for that entry.
tmm = {
    name: df.loc[df['gene_id'].isin(geneids)]
    for name, geneids in dmr_unique_geneids.items()
}


In [5]:
# define a function that transforms the sample names to coherent condition names
def sample_to_condition(sample: str) -> str:
    """Map an individual sample name onto its condition name.

    The condition is determined by the prefix of the sample name.

    Args:
        sample: Sample name, e.g. ``'N24'`` or ``'24hR50'``.

    Returns:
        One of ``'normoxia'``, ``'anoxia'``, ``'3h_reoxygenation'`` or
        ``'24h_reoxygenation'``.

    Raises:
        ValueError: If the sample name starts with none of the known prefixes.
    """
    # (prefix, condition) pairs, checked in this order
    prefix_to_condition = (
        ('N', 'normoxia'),
        ('A', 'anoxia'),
        ('3hR', '3h_reoxygenation'),
        ('24hR', '24h_reoxygenation'),
    )
    for prefix, condition in prefix_to_condition:
        if sample.startswith(prefix):
            return condition
    raise ValueError(f"Unknown sample name {sample}")
    

# Reshape every DMR sub-dataframe from wide to long format:
#   1. drop the annotation columns that are not needed any more
#   2. melt, so that each row holds one measurement of one sample
#      (columns: gene_id, kind, sample, measurement)
#   3. derive the 'condition' column from the sample name
# The loop variables are deliberately not called `df`: reusing that name would
# overwrite the combined RNA/Ribo dataframe `df` and break re-running the
# filtering cell above.
for name, wide_df in tmm.items():
    long_df = (
        wide_df
        .drop(columns=['product', 'symbol'])
        .melt(id_vars=['gene_id', 'kind'], value_name='measurement', var_name='sample')
    )
    long_df['condition'] = long_df['sample'].apply(sample_to_condition)
    # replacing the value of an existing key is safe while iterating over the dict
    tmm[name] = long_df

In [9]:
# Display the long-format table stored under the 'NvsA' key
tmm['NvsA']

Unnamed: 0,gene_id,kind,sample,measurement,condition
0,ccar_ua02-g2350,ribo,24hR50,3.766555,24h_reoxygenation
1,ccar_ua05-g5816,ribo,24hR50,48.781476,24h_reoxygenation
2,ccar_ua10-g10025,ribo,24hR50,4.225891,24h_reoxygenation
3,ccar_ua15-g14205,ribo,24hR50,0.367469,24h_reoxygenation
4,ccar_ua15-g14343,ribo,24hR50,9.370453,24h_reoxygenation
...,...,...,...,...,...
667,ccar_ub11-g34027,rna,N29,4.170469,normoxia
668,ccar_ub12-g34297,rna,N29,7.745157,normoxia
669,ccar_ub15-g36855,rna,N29,18.111753,normoxia
670,ccar_ub22-g43219,rna,N29,20.733191,normoxia


# 4 Normalization of data 

In [13]:
# Normalize every measurement to the normoxia mean of the same gene and method.
# For each (gene_id, kind) pair two columns are added in place:
#   norm_average  = mean of the normoxia measurements of that pair
#   rel_deviation = measurement / norm_average
# Pairs without any normoxia measurement get NaN in both columns.
for long_df in tmm.values():
    # Blank out (NaN) everything that is not normoxia, so that the group-wise
    # mean below only sees the normoxia measurements.
    normoxia_values = long_df['measurement'].where(long_df['condition'] == 'normoxia')
    # transform('mean') broadcasts the mean of each (gene_id, kind) group back
    # onto all rows of that group, whatever their condition.
    long_df['norm_average'] = (
        normoxia_values
        .groupby([long_df['gene_id'], long_df['kind']])
        .transform('mean')
    )
    # Determination of relative deviation
    long_df['rel_deviation'] = long_df['measurement'] / long_df['norm_average']

# 5 PLOT 

In [14]:
import os

import matplotlib.pyplot as plt
import seaborn as sns

# set sns color palette to colorblind
sns.set_palette('colorblind')

# RELATIVE FOLD CHANGE PLOT
# One figure per gene: bars summarize `rel_deviation` per method and condition
# (seaborn's default bar estimator), the overlaid swarm shows the single samples.

kind_order = ['rna', 'ribo']
condition_order = ['normoxia', 'anoxia', '3h_reoxygenation', '24h_reoxygenation']

for name, long_df in tmm.items():
    # make sure the target folder exists, otherwise savefig raises FileNotFoundError
    out_dir = f"2_plots/{name}"
    os.makedirs(out_dir, exist_ok=True)
    # Group by the plain column name (not a one-element list): the group key is
    # then the gene id string itself in every pandas version, instead of a
    # 1-tuple that would end up verbatim in the plot title.
    for gene_id, data in long_df.groupby('gene_id'):
        g = sns.catplot(
            data=data, x='kind', y='rel_deviation', order=kind_order,
            hue='condition', hue_order=condition_order,
            kind='bar', legend=False,
        )
        sns.swarmplot(
            data=data, x='kind', y='rel_deviation', order=kind_order,
            hue='condition', hue_order=condition_order,
            dodge=True, legend=False, linewidth=0.7, edgecolor="k",
            ax=g.ax, alpha=0.7,
        )
        ticks = g.ax.get_xticks()
        plt.xticks(ticks, ["RNA", "RFP"])
        plt.legend()
        plt.ylabel("Normalized Expression")
        plt.xlabel("Method")
        sns.move_legend(g.ax, bbox_to_anchor=(1, 1), loc="upper left", title="Conditions:")
        plt.title(gene_id, fontdict={'weight': 'bold'})
        plt.tight_layout()
        plt.savefig(f"{out_dir}/{gene_id}_TMM_crucian_carp_data_normalized.svg")
        plt.savefig(f"{out_dir}/{gene_id}_TMM_crucian_carp_data_normalized.png")
        # close the figure so that the loop does not accumulate open figures
        plt.close()



  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  boot_dist.append(f(*sample, **func_kwargs))
  return _nanquantile_unchecked(
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout

In [15]:
import os

import matplotlib.pyplot as plt
import seaborn as sns

# set sns color palette to colorblind
sns.set_palette('colorblind')

# ABSOLUTE FOLD CHANGE PLOT
# One figure per gene: bars summarize the `measurement` values per method and
# condition (seaborn's default bar estimator), the overlaid swarm shows the
# single samples.

kind_order = ['rna', 'ribo']
condition_order = ['normoxia', 'anoxia', '3h_reoxygenation', '24h_reoxygenation']

for name, long_df in tmm.items():
    # make sure the target folder exists, otherwise savefig raises FileNotFoundError
    out_dir = f"2_plots/{name}"
    os.makedirs(out_dir, exist_ok=True)
    # Group by the plain column name (not a one-element list): the group key is
    # then the gene id string itself in every pandas version, instead of a
    # 1-tuple that would end up verbatim in the plot title.
    for gene_id, data in long_df.groupby('gene_id'):
        g = sns.catplot(
            data=data, x='kind', y='measurement', order=kind_order,
            hue='condition', hue_order=condition_order,
            kind='bar', legend=False,
        )
        sns.swarmplot(
            data=data, x='kind', y='measurement', order=kind_order,
            hue='condition', hue_order=condition_order,
            dodge=True, legend=False, linewidth=0.7, edgecolor="k",
            ax=g.ax, alpha=0.7,
        )
        ticks = g.ax.get_xticks()
        plt.xticks(ticks, ["RNA", "RFP"])
        plt.legend()
        plt.ylabel("Normalized Expression")
        plt.xlabel("Method")
        sns.move_legend(g.ax, bbox_to_anchor=(1, 1), loc="upper left", title="Conditions:")
        plt.title(gene_id, fontdict={'weight': 'bold'})
        plt.tight_layout()
        plt.savefig(f"{out_dir}/{gene_id}_TMM_crucian_carp_data_absolute.svg")
        plt.savefig(f"{out_dir}/{gene_id}_TMM_crucian_carp_data_absolute.png")
        # close the figure so that the loop does not accumulate open figures
        plt.close()

  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout()
  self._fi