# Calculate RXE for the public dataset, females only, based on Apua's script
This notebook has been adapted from Apua's script.

In [None]:
import numpy as np
import pandas as pd
from os import environ
from plotnine import *
from pyhere import here
import re, session_info
import statsmodels.api as sm
from functools import lru_cache
from scipy.stats import variation
from warnings import filterwarnings
from scipy.stats import mannwhitneyu
from statsmodels.formula.api import ols
from matplotlib.cbook import mplDeprecation

In [None]:
# Silence noisy warnings so the notebook output stays readable:
# matplotlib deprecation notices, plus UserWarning and DeprecationWarning
# raised from modules whose name matches 'plotnine.*'.
filterwarnings("ignore",category=mplDeprecation)
filterwarnings('ignore', category=UserWarning, module='plotnine.*')
filterwarnings('ignore', category=DeprecationWarning, module='plotnine.*')

In [None]:
# Set NUMEXPR_MAX_THREADS to '4' for this process.
# NOTE(review): presumably this caps the numexpr thread pool used by pandas;
# confirm it takes effect given that pandas is imported before this cell runs.
environ['NUMEXPR_MAX_THREADS'] = '4'

## Functions

In [None]:
@lru_cache()
def get_pheno():
    """
    Read the phenotype CSV, using its first column as the index.

    The result is memoized, so the file is parsed only once per session.
    """
    pheno_path = here('input/phenotypes/_m/phenotypes.csv')
    pheno = pd.read_csv(pheno_path, index_col=0)
    return pheno


@lru_cache()
def get_logTPM(tissue):
    """
    Read the gene-level ``gene.log2tpm.csv`` table for one tissue.

    ``tissue`` is the directory name under the TPM input folder. The first
    CSV column becomes the index, and results are memoized per tissue.
    """
    return pd.read_csv(
        here(f'input/counts/text_files_counts/tpm/_m/{tissue}/gene.log2tpm.csv'),
        index_col=0,
    )


@lru_cache()
def get_annotation():
    """
    Read the tab-separated gene annotation table (memoized).

    The file is taken from the caudate input directory.
    """
    annotation_path = here("input/counts/text_files_counts/_m",
                           "caudate/gene_annotation.txt")
    annotation = pd.read_csv(annotation_path, sep="\t")
    return annotation

In [None]:
def save_ggplot(p, fn):
    """
    Write plot ``p`` to ``fn`` + extension as PDF, PNG and SVG.

    Each file is saved with width=10 and height=7 via ``p.save``.
    """
    extensions = ('.pdf', '.png', '.svg')
    for suffix in extensions:
        target = fn + suffix
        p.save(target, width=10, height=7)
        

def rge(df, chrom, sample_cols=None):
    """
    Relative expression of one chromosome versus the other autosomes.

    For every sample column, return the mean over genes located on ``chrom``
    minus the mean over autosomal genes that are not on ``chrom``.

    Parameters
    ----------
    df : pandas.DataFrame
        Gene-by-sample table carrying the ``seqnames`` and ``chrom_types``
        columns produced by ``annotate_chrom``.
    chrom : str
        Chromosome name as it appears in ``seqnames`` (e.g. ``"chrX"``).
    sample_cols : list of str, optional
        Sample columns to average. Defaults to the notebook-level ``samples``
        list, which must already be defined when the function is called.

    Returns
    -------
    pandas.Series
        One value per sample column.
    """
    if sample_cols is None:
        sample_cols = samples  # notebook-level list of sample column names
    on_chrom = df['seqnames'] == chrom
    # Column and label names follow annotate_chrom ('seqnames', 'chrom_types',
    # 'Autosome'); the previous 'seqname' / 'chrom_type' / 'autosome' spelling
    # does not exist in the frames built in this notebook.
    other_autosomes = (df['chrom_types'] == 'Autosome') & ~on_chrom
    return df.loc[on_chrom, sample_cols].mean() \
        - df.loc[other_autosomes, sample_cols].mean()


def annotate_chrom(df):
    """
    Add a ``chrom_types`` column classifying each row by its ``seqnames``.

    Labels are assigned in order, later assignments overriding earlier ones:
    "Other" (default), "Allosome" (chrX / chrY), "Autosome" (name contains
    ``chr`` followed by digits), "Mitochondria" (chrM) and finally "X" (chrX),
    so only chrY keeps the "Allosome" label. Rows with a missing ``seqnames``
    value stay "Other".

    The frame is modified in place and also returned.
    """
    seqnames = df["seqnames"]
    df.loc[:, "chrom_types"] = "Other"
    df.loc[seqnames.isin(["chrX", "chrY"]), "chrom_types"] = "Allosome"
    # Raw string avoids the invalid-escape warning; na=False keeps the mask
    # boolean when a sequence name is missing.
    df.loc[seqnames.str.contains(r"chr\d+", na=False), "chrom_types"] = "Autosome"
    df.loc[seqnames == "chrM", "chrom_types"] = "Mitochondria"
    df.loc[seqnames == "chrX", "chrom_types"] = "X"
    return df

## Load and merge data

In [None]:
# Combine the per-tissue expression tables column-wise (aligned on the index).
# A single concat over a list avoids re-copying the growing frame on every
# iteration and avoids concatenating with an empty placeholder DataFrame.
tissues = ["caudate", "dlpfc", "hippocampus"]
log2tpm = pd.concat([get_logTPM(tissue) for tissue in tissues], axis=1)
print(log2tpm.shape)
log2tpm.iloc[0:2, 0:5]

## Annotate TPM

In [None]:
# Preview the first two rows of the gene annotation table.
get_annotation().head(2)

In [None]:
# Attach gene name and chromosome to every expression row (the expression
# index is matched against the annotation's "name" column), then classify
# each gene's chromosome.
gene_cols = ["name", "gene_name", "seqnames"]
df0 = annotate_chrom(
    log2tpm.merge(get_annotation()[gene_cols], left_index=True, right_on="name")
)
print(df0.shape)
df0.groupby('chrom_types').size()

## Filtering genes

In [None]:
# Sample columns are those whose name starts with "R" followed by digits.
# The raw string keeps "\d" a regex token rather than an invalid string escape.
samples = [x for x in log2tpm.columns if re.match(r'R\d+', x)]
# Keep genes on chrX or an autosome whose expression summed over all samples
# is at least 0.2 per sample on average.
df = df0[(df0[samples].sum(axis=1) >= 0.2 * len(samples)) &
         (df0['chrom_types'].isin(["X", "Autosome"]))].copy()
df.shape

## Generate RXE

In [None]:
# Mean expression per chromosome class for every numeric (sample) column,
# flipped so that rows are samples and columns are chromosome classes.
chrom_means = df.groupby('chrom_types').mean(numeric_only=True)
df1 = chrom_means.T
df1.head(2)

In [None]:
# RXE = mean X expression minus mean autosome expression, per sample;
# the sample ID is also kept as an ordinary column.
df1 = df1.assign(sample=df1.index,
                 RXE=df1['X'] - df1['Autosome'])
df1.head(2)

## Annotate sample IDs

In [None]:
# Join sample-level metrics with phenotype data (sample ID == RNum) and
# summarise RXE by brain region and sex.
pheno_cols = ['RNum', 'Sex', 'Dx', 'Region']
df2 = df1.merge(get_pheno()[pheno_cols], left_index=True, right_on="RNum")
df2[['Region', 'Sex', 'RXE']].groupby(['Region', 'Sex']).describe()

## Metrics summary

### Variation of RXE

In [None]:
# Coefficient of variation of RXE for each region / sex subgroup.
for tissue in ['Caudate', 'DLPFC', 'HIPPO']:
    in_tissue = df2['Region'] == tissue
    for sex in ['F', 'M']:
        rxe = df2.loc[in_tissue & (df2['Sex'] == sex), 'RXE']
        var = variation(rxe)
        print(f"There is {var:.3} variation for {sex} in {tissue}.")

In [None]:
## Separate out tissues

# One independent copy of the sample table per brain region.
caudate, dlpfc, hippo = (
    df2[df2['Region'] == region].copy()
    for region in ('Caudate', 'DLPFC', 'HIPPO')
)

### Mann-Whitney U (female vs. male)

In [None]:
# Compare RXE between female and male samples within each region.
df_config = {'caudate': caudate, 'DLPFC': dlpfc, 'hippocampus': hippo}
for tissue, tissue_df in df_config.items():
    rxe_by_sex = {sex: tissue_df.loc[tissue_df['Sex'] == sex, 'RXE']
                  for sex in ('F', 'M')}
    stat, pval = mannwhitneyu(rxe_by_sex['F'], rxe_by_sex['M'])
    print(f"Mann-WhitneyU for female vs male (RXE) for {tissue}: {pval:.4}")

### Mann-Whitney U (CTL vs. SZ)

In [None]:
# Compare RXE between control and schizophrenia samples within each region.
df_config = {'caudate': caudate, 'DLPFC': dlpfc, 'hippocampus': hippo}
for tissue, tissue_df in df_config.items():
    controls = tissue_df.loc[tissue_df['Dx'] == 'Control', 'RXE']
    cases = tissue_df.loc[tissue_df['Dx'] == 'SCZD', 'RXE']
    stat, pval = mannwhitneyu(controls, cases)
    print(f"Mann-WhitneyU for ctl vs sz (RXE) for {tissue}: {pval:.4}")

### Subset by sex: diagnosis status

In [None]:
# Control vs schizophrenia comparison of RXE, run separately for each sex
# within each region.
df_config = {'caudate': caudate, 'DLPFC': dlpfc, 'hippocampus': hippo}
for tissue, tissue_df in df_config.items():
    female = tissue_df[tissue_df['Sex'] == 'F'].copy()
    male = tissue_df[tissue_df['Sex'] == 'M'].copy()
    stat_f, pval_f = mannwhitneyu(female.loc[female['Dx'] == 'Control', 'RXE'],
                                  female.loc[female['Dx'] == 'SCZD', 'RXE'])
    stat_m, pval_m = mannwhitneyu(male.loc[male['Dx'] == 'Control', 'RXE'],
                                  male.loc[male['Dx'] == 'SCZD', 'RXE'])
    print(f"Mann-WhitneyU of female, ctl vs sz (RXE) for {tissue}: {pval_f:.4}")
    print(f"Mann-WhitneyU of male, ctl vs sz (RXE) for {tissue}: {pval_m:.4}")

### X chromosome expression differences

In [None]:
# Control vs schizophrenia comparison of mean X-chromosome expression (column
# "X"), run separately for each sex within each region.
# The printed label now names X expression; it previously said "(RXE)",
# which mislabelled these p-values.
df_config = {'caudate': caudate, 'DLPFC': dlpfc, 'hippocampus': hippo}
for tissue in ['caudate', 'DLPFC', "hippocampus"]:
    female = df_config[tissue][(df_config[tissue]['Sex']=='F')].copy()
    male = df_config[tissue][(df_config[tissue]['Sex']=='M')].copy()
    stat_f, pval_f = mannwhitneyu(female[(female['Dx']=='Control')].X,
                                  female[(female['Dx']=='SCZD')].X)
    stat_m, pval_m = mannwhitneyu(male[(male['Dx']=='Control')].X,
                                  male[(male['Dx']=='SCZD')].X)
    print(f"Mann-WhitneyU of female, ctl vs sz (X expression) for {tissue}: {pval_f:.4}")
    print(f"Mann-WhitneyU of male, ctl vs sz (X expression) for {tissue}: {pval_m:.4}")

### Autosome expression

In [None]:
# Control vs schizophrenia comparison of mean autosome expression (column
# "Autosome"), run separately for each sex within each region.
# The printed label now names autosome expression; it previously said
# "(RXE)", which mislabelled these p-values.
df_config = {'caudate': caudate,
             'DLPFC': dlpfc,
             'hippocampus': hippo}
for tissue in ['caudate', 'DLPFC', "hippocampus"]:
    female = df_config[tissue][(df_config[tissue]['Sex']=='F')].copy()
    male = df_config[tissue][(df_config[tissue]['Sex']=='M')].copy()
    stat_f, pval_f = mannwhitneyu(female[(female['Dx']=='Control')].Autosome,
                                  female[(female['Dx']=='SCZD')].Autosome)
    stat_m, pval_m = mannwhitneyu(male[(male['Dx']=='Control')].Autosome,
                                  male[(male['Dx']=='SCZD')].Autosome)
    print(f"Mann-WhitneyU of female, ctl vs sz (autosome expression) for {tissue}: {pval_f:.4}")
    print(f"Mann-WhitneyU of male, ctl vs sz (autosome expression) for {tissue}: {pval_m:.4}")

### Interaction model

In [None]:
# Convert Sex and Dx to categoricals with readable labels.
sex_labels = {"F": "Female", "M": "Male"}
dx_labels = {"SCZD": "SZ", "Control": "CTL"}
df2['Sex'] = df2['Sex'].astype("category").cat.rename_categories(sex_labels)
df2['Dx'] = df2['Dx'].astype("category").cat.rename_categories(dx_labels)
df2.head(2)

In [None]:
# Integer-coded copy of the design variables, indexed by RNum.
df3 = df2.reset_index().loc[:, ['RNum', 'RXE', 'Sex', 'Dx', 'Region']].set_index("RNum")
df3['Sex'] = df3['Sex'].cat.codes
df3['Dx'] = df3['Dx'].cat.codes
df3['Region'] = df3['Region'].astype("category").cat.codes
df3.head(2)

In [None]:
anova_df = df2[['RXE', 'Sex', 'Dx', 'Region']]

# Type-II ANOVA on RXE with all main effects and every two-way interaction.
formula = (
    'RXE ~ C(Sex) + C(Dx) + C(Region) '
    '+ C(Sex):C(Dx) + C(Sex):C(Region) + C(Dx):C(Region)'
)
lm = ols(formula, anova_df).fit()
table = sm.stats.anova_lm(lm, typ=2)
print(table)

#### By tissue interaction

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# Assumes parametric

# Region label -> per-region sample table; built once, not on every iteration.
df_config = {'caudate': caudate, 'DLPFC': dlpfc, 'hippocampus': hippo}
for tissue in ['caudate', 'DLPFC', "hippocampus"]:
    # Explicit copy: a column is added below and must not touch the
    # per-region table.
    anova_df = df_config[tissue].loc[:, ['RXE', 'Sex', 'Dx']].copy()
    # Type-II ANOVA with both main effects and their interaction.
    formula = 'RXE ~ C(Sex) + C(Dx) + C(Sex):C(Dx)'
    lm = ols(formula, anova_df).fit()
    table = sm.stats.anova_lm(lm, typ=2)
    print(tissue)
    print(table)
    print("\n")
    # One group label per Sex / Dx combination for the post-hoc comparison.
    anova_df['combination'] = anova_df.Sex.astype("str") + " / " + anova_df.Dx.astype("str")
    # perform multiple pairwise comparison (Tukey HSD)
    m_comp = pairwise_tukeyhsd(endog=anova_df['RXE'], groups=anova_df['combination'], alpha=0.05)
    # coerce the tukeyhsd table to a DataFrame; summary() is the public
    # accessor for the results table (first row is the header).
    header, *rows = m_comp.summary().data
    tukey_data = pd.DataFrame(data=rows, columns=header)
    significant = tukey_data[tukey_data['p-adj'] < 0.05]
    if significant.empty:
        print("There is no significant interactions!")
    else:
        print(significant)
    print("\n")

## Plot RXE

In [None]:
# Box plot of mean X-chromosome expression per region, coloured by diagnosis
# and split into one panel per sex.
p = (
    ggplot(df2, aes(x='Region', y='X', fill='Dx'))
    + geom_boxplot()
    + facet_grid("~Sex")
    + ylab("X Chromosome Expression")
    + theme_matplotlib()
)
p

In [None]:
# Box plot of mean autosome expression per region, coloured by diagnosis
# and split into one panel per sex.
p = (
    ggplot(df2, aes(x='Region', y='Autosome', fill='Dx'))
    + geom_boxplot()
    + facet_grid("~Sex")
    + ylab("Autosome Expression")
    + theme_matplotlib()
)
p

In [None]:
# Box plot of RXE per region, coloured by diagnosis and split into one
# panel per sex.
p = (
    ggplot(df2, aes(x='Region', y='RXE', fill='Dx'))
    + geom_boxplot()
    + facet_grid("~Sex")
    + ylab("Relative X Expression")
    + theme_matplotlib()
)
p

In [None]:
# Per-sample RXE scatter: colour = diagnosis, marker shape = region,
# one row of panels per sex. X tick labels are hidden.
text_theme = theme(
    axis_text_x=element_blank(),
    axis_text=element_text(size=13),
    legend_title=element_text(size=15, face="bold"),
    legend_text=element_text(size=13),
    axis_title=element_text(size=16, face="bold"),
    strip_text=element_text(size=14, face="bold"),
)
p = (
    ggplot(df2, aes(x='sample', y='RXE', fill='Dx', shape='Region'))
    + geom_point()
    + ylim([-0.25, 0.5])
    + xlab("Samples")
    + ylab("Relative X Expression")
    + facet_grid("Sex~.")
    + theme_matplotlib()
    + text_theme
)
p

In [None]:
# Write the per-sample table to CSV and save the most recent plot `p`
# (PDF, PNG and SVG via save_ggplot) in the current working directory.
df2.to_csv('RXE_public.csv')
save_ggplot(p, "RXE_public_bySex")

## Session information

In [None]:
# Report session details via session_info, for reproducibility.
session_info.show()