In [4]:
import pybedtools

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns
import seaborn.objects as so

from scipy.stats import pearsonr, spearmanr
import glob




# 1. Analysis of homotypic TFBSs
    e.g. ESR1
    - From the TFBS BED file, get GeneID, TSS_dist and homotypic_count for each ESR1 site in the region
    - Split the data into 4 bins, so that ESR1 is (0-50, 50-100, 100-150, 150-200) bp away from the TSS
    - Generate a table per bin with the number of TFBSs in col_1 and the gene expression in col_2. Every tissue can be a separate row.

In [None]:
# BED file opened through pybedtools (presumably promoter regions annotated with TFBSs, judging by the file name).
data = pybedtools.BedTool("/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/Promotor_with_TFBS/New_TFBS_BED/Prom_with_TFBSs.bed")
# GTEx expression table, comma-separated.
gtex_df = pd.read_csv("/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/GTEx_GenExpr_ucsc.csv", sep=",")
# Show the column names of the expression table.
gtex_df.columns

## Using single_tfbs_to_csv.py

The script generates a separate CSV file for every TF, containing all the relevant information.

python single_tfbs_to_csv.py -f /sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/Promotor_with_TFBS/New_TFBS_BED/Prom_with_TFBSs.bed -out /sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/Single_TFBS/Protein_Region_single_tfbs -geneType "protein_coding"

This will lead to a folder with all important csv files.

In [None]:
# Folder holding one CSV per transcription factor.
csv_folder = "/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/Single_TFBS/Protein_Region_single_TFBS_with_GTEx"
# Transcription factor used as the worked example in the cells below.
tf_name = "ESR1"
# Path of that TF's CSV inside csv_folder.
csv_file = f"{csv_folder}/{tf_name}.csv"

In [None]:
def get_df_for_single_tfbs(csv_file):
    """Read one per-TF CSV file into a DataFrame with named columns.

    The file is read with ``names=``, i.e. every line of the file is treated
    as data and the column names below are applied in order.

    Parameters
    ----------
    csv_file : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        9 annotation columns followed by one column per tissue.
    """
    # Annotation columns describing the binding site.
    annotation_names = [
        "chr",
        "geneID",
        "tf",
        "close_tss",
        "dist_tss",
        "strand_orientation",
        "homotypic_count",
        "all_tfbs_count",
        "all_tfbs_unique_count",
    ]
    # One expression column per tissue.
    tissue_names = [
        'Adipose - Subcutaneous',
        'Adipose - Visceral (Omentum)',
        'Adrenal Gland',
        'Artery - Aorta',
        'Artery - Coronary',
        'Artery - Tibial',
        'Bladder',
        'Brain - Amygdala',
        'Brain - Anterior cingulate cortex (BA24)',
        'Brain - Caudate (basal ganglia)',
        'Brain - Cerebellar Hemisphere',
        'Brain - Cerebellum',
        'Brain - Cortex',
        'Brain - Frontal Cortex (BA9)',
        'Brain - Hippocampus',
        'Brain - Hypothalamus',
        'Brain - Nucleus accumbens (basal ganglia)',
        'Brain - Putamen (basal ganglia)',
        'Brain - Spinal cord (cervical c-1)',
        'Brain - Substantia nigra',
        'Breast - Mammary Tissue',
        'Cells - Cultured fibroblasts',
        'Cells - EBV-transformed lymphocytes',
        'Cervix - Ectocervix',
        'Cervix - Endocervix',
        'Colon - Sigmoid',
        'Colon - Transverse',
        'Esophagus - Gastroesophageal Junction',
        'Esophagus - Mucosa',
        'Esophagus - Muscularis',
        'Fallopian Tube',
        'Heart - Atrial Appendage',
        'Heart - Left Ventricle',
        'Kidney - Cortex',
        'Kidney - Medulla',
        'Liver',
        'Lung',
        'Minor Salivary Gland',
        'Muscle - Skeletal',
        'Nerve - Tibial',
        'Ovary',
        'Pancreas',
        'Pituitary',
        'Prostate',
        'Skin - Not Sun Exposed (Suprapubic)',
        'Skin - Sun Exposed (Lower leg)',
        'Small Intestine - Terminal Ileum',
        'Spleen',
        'Stomach',
        'Testis',
        'Thyroid',
        'Uterus',
        'Vagina',
        'Whole Blood',
    ]
    return pd.read_csv(csv_file, names=[*annotation_names, *tissue_names])

In [None]:
# Read the CSV at csv_file into a DataFrame and display it.
tfbs = get_df_for_single_tfbs(csv_file)
tfbs

In [None]:
# Columns from position 9 onward are the per-tissue expression values
# (the first 9 columns are the binding-site annotation).
geneexpr = tfbs.iloc[:,9:]
# Show the slice that was just computed (the cell previously re-displayed `tfbs`).
geneexpr.head()

In order to treat every tissue as an independent data point, each gene-expression value has to get its own row.

In [None]:
def flatten_tissues_expand_df(tfbs_df, n_meta_cols=9):
    """Reshape a wide per-TF table into one row per (binding site, tissue).

    The first ``n_meta_cols`` columns are repeated once for every remaining
    column, and the values of the remaining columns are written row by row
    into a single new column ``All_tissues``.

    Parameters
    ----------
    tfbs_df : pandas.DataFrame
        Wide table: annotation columns first, then one column per tissue.
    n_meta_cols : int, default 9
        Number of leading annotation columns to repeat.

    Returns
    -------
    pandas.DataFrame
        ``len(tfbs_df) * n_tissues`` rows with a fresh 0..n-1 index; the
        annotation columns keep their original dtypes.
    """
    meta_part = tfbs_df.iloc[:, :n_meta_cols]
    tissue_part = tfbs_df.iloc[:, n_meta_cols:]
    n_tissues = tissue_part.shape[1]

    # Repeat rows by position. Calling np.repeat on the DataFrame itself is not a
    # supported pandas operation: depending on the numpy/pandas versions it either
    # fails or degrades every column to object dtype.
    row_positions = np.repeat(np.arange(len(meta_part)), n_tissues)
    repeat_df = meta_part.iloc[row_positions].reset_index(drop=True)

    # Row-major flattening lines up with the repeated rows:
    # row 0 / tissue 0, row 0 / tissue 1, ..., row 1 / tissue 0, ...
    repeat_df["All_tissues"] = tissue_part.to_numpy().flatten()

    return repeat_df

#tfbs_exp_df = flatten_tissues_expand_df(tfbs)

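Now the correlation between homotypic count and gene expression can be measured for each TFBS. Note that the code below uses the Pearson correlation (`scipy.stats.pearsonr`); `spearmanr` is imported as an alternative.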

In [None]:
def homotyic_pearson(tfbs_df):
    """Pearson correlation between expression and homotypic count.

    The table is first expanded with ``flatten_tissues_expand_df`` so that
    every tissue value is its own observation.

    Returns
    -------
    tuple
        ``(r, p_value)`` as returned by ``scipy.stats.pearsonr``.
    """
    long_df = flatten_tissues_expand_df(tfbs_df)

    counts = long_df["homotypic_count"].to_numpy()
    expression = long_df["All_tissues"].to_numpy()

    corr, p = pearsonr(expression, counts)
    return corr, p
#r,p_value = homotyic_pearson(tfbs)


In [None]:
# Compute the correlation for the example TF in this cell, so the cell does not
# depend on variables left over from an earlier kernel session.
r, p_value = homotyic_pearson(tfbs)
r, p_value


### Plot Genexpr for homotypic count

In [None]:
def plot_homotypic_count_to_expr(tfbs_df, output):
    """Box-plot log2 expression grouped by homotypic count and save the figure.

    The table is expanded with ``flatten_tissues_expand_df`` so every tissue
    value is one observation. Zero and non-finite expression values are
    dropped before taking log2. Groups with no remaining values are skipped,
    because they cannot be drawn.

    Parameters
    ----------
    tfbs_df : pandas.DataFrame
        Wide per-TF table (annotation columns followed by tissue columns).
    output : str
        Path the figure is written to with ``fig.savefig``.

    Raises
    ------
    ValueError
        If no group has any positive expression value.
    """
    # First the DataFrame is expanded to include every tissue as its own row.
    expand_df = flatten_tissues_expand_df(tfbs_df)

    homotypic_count = expand_df.homotypic_count.to_numpy()
    geneexpr = expand_df.All_tissues.to_numpy(dtype=float)

    # Split the expression values by homotypic count.
    counts = []
    boxes = []
    for count in np.unique(homotypic_count):
        values = geneexpr[homotypic_count == count]
        # log2 is undefined for 0 and meaningless for NaN/inf.
        values = values[np.isfinite(values) & (values > 0)]
        if values.size == 0:
            continue
        counts.append(count)
        boxes.append(np.log2(values))

    if not boxes:
        raise ValueError("No positive expression values available to plot.")

    sample_sizes = [len(box) for box in boxes]
    lowest = min(box.min() for box in boxes)
    highest = max(box.max() for box in boxes)

    fig, ax1 = plt.subplots()
    ax1.boxplot(boxes, patch_artist=True)
    ax1.set_xlabel("homotypic count")
    ax1.set_ylabel("log2(Expr)")
    ax1.set_xticklabels(counts)
    ax1.set_xlim(0, len(counts) + 0.5)
    # Leave head-room for the sample-size annotations.
    ax1.set_ylim(lowest - 4, highest + 4)

    # Annotate each box with its sample size; boxes sit at x = 1, 2, ...
    for xpos, (box, size) in enumerate(zip(boxes, sample_sizes), start=1):
        ax1.text(xpos, box.max() + 1, f"n={size}", ha='center', va='bottom', color='black', fontsize="x-small")

    fig.savefig(output)
    plt.show()

    return

In [None]:
# Destination file for the box plot of the example TF loaded into `tfbs`.
output = "/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/Single_TFBS/Boxplot/ESR1_GeneExpr_by_count.png"
plot_homotypic_count_to_expr(tfbs, output)

# Calculate the Pearson (Spearman) coefficient for all TFBSs

In [None]:
# Folder with one CSV per TF (same value as the csv_folder assigned earlier in the notebook).
csv_folder = "/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/Single_TFBS/Protein_Region_single_TFBS_with_GTEx"


Calculate the Pearson coefficient for every TF and plot it.

In [None]:
def get_pearson_for_single_tf(csv_file):
    """Compute the expression/homotypic-count correlation for one TF file.

    Parameters
    ----------
    csv_file : str
        Path of the per-TF CSV file.

    Returns
    -------
    tuple
        ``(tf_name, r, p_value)`` where ``tf_name`` is the first value found
        in the ``tf`` column.
    """
    tfbs_df = get_df_for_single_tfbs(csv_file)
    tf_name = tfbs_df.tf.unique()[0]
    # homotyic_pearson expands the table itself, so the wide table is passed
    # directly; flattening here as well only repeated the same work.
    r, p_value = homotyic_pearson(tfbs_df)
    return tf_name, r, p_value
    

In [None]:
def get_pearson_for_all_tf(csv_folder):
    """Run ``get_pearson_for_single_tf`` on every ``*.csv`` file in a folder.

    Returns
    -------
    dict
        Maps TF name to the tuple ``(r, p_value)``.
    """
    per_file_results = map(get_pearson_for_single_tf, glob.glob(f"{csv_folder}/*.csv"))
    return {name: (corr, p) for name, corr, p in per_file_results}
        
    

In [None]:
# TF name -> (r, p_value) for every CSV in csv_folder.
tf_dict = get_pearson_for_all_tf(csv_folder)
tf_dict

In [None]:
# Unpack the {tf: (r, p_value)} dictionary into parallel lists.
tf_names = list(tf_dict.keys())
r_values = [stats[0] for stats in tf_dict.values()]
p_values_log = [-np.log10(stats[1]) for stats in tf_dict.values()]

# Build the table, order it by -log10(p) (largest first) and drop TFs without a defined r.
df = (
    pd.DataFrame({"TFs": tf_names, "r": r_values, "p_values": p_values_log})
    .sort_values(by="p_values", ascending=False)
)
df = df[df.r.notnull()]
df


In [None]:
# Spearman variant only: replace infinite -log10(p) values (p == 0) with a finite cap.
# max_value = np.nanmax(df.p_values[df.p_values != np.inf])
# df.p_values.replace([np.inf], max_value+30, inplace=True)
# df

# Restrict the table to the rows that will be plotted.

# Keep the first 70 rows, i.e. the 70 largest -log10(p) values (df was sorted descending when it was built).
df = df.iloc[:70] 

# Alternative filter: keep only TFs with p < 0.05.
#df = df[df.p_values > -np.log10(0.05)]




In [None]:
x = df.TFs.to_numpy()
p_value = df.p_values.to_numpy()
r_value = df.r.to_numpy()

min_r = min(r_value)
max_r = max(r_value)

# One shared normalisation for the bars AND the colorbar, so that a colour on a
# bar can be read off the colorbar. (Dividing by r.max() maps negative r outside
# 0..1, and hand-written tick labels do not match the colours actually used.)
norm = mpl.colors.Normalize(vmin=min_r, vmax=max_r)
cmap = plt.cm.viridis
colors = cmap(norm(r_value))

fig, ax = plt.subplots(figsize=(6,15))

ax.barh(x, p_value, color=colors, align='center')
# Largest -log10(p) at the top.
ax.invert_yaxis()

# Colorbar driven by the same norm/cmap as the bars; its ticks are real r values.
cbar = fig.colorbar(plt.cm.ScalarMappable(norm=norm, cmap=cmap), ax=ax, orientation="horizontal", location="top")
cbar.set_label('Pearson Correlation')

ax.set_xlabel("-log10(p-value)")
ax.set_ylabel("TFBSs")
ax.grid()
# NOTE(review): the file name says "Spearman" but the values plotted here come from pearsonr — confirm which is intended.
fig.savefig("/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/Single_TFBS/Boxplot/Spearman_best_70_TFBS.png")



In [None]:
# Display the table of TFs that was plotted above.
df

## Extract gene expression

In [None]:
def get_GeneExpr_for_geneIds(gtex_df, geneIds, fill_value=0.0):
    """Collect the expression rows of several genes into one 2-D array.

    Parameters
    ----------
    gtex_df : pandas.DataFrame
        Expression table with a ``Name`` column; the first column of each row
        is dropped, so ``Name`` is expected to be the first column.
    geneIds : iterable
        Gene identifiers to look up in ``gtex_df.Name``.
    fill_value : float, default 0.0
        Value used for genes that are not in the table. Pass ``np.nan`` to
        mark them as missing, as ``get_GeneExpr_for_geneId`` does.

    Returns
    -------
    numpy.ndarray
        One row per requested gene, in request order, with
        ``len(gtex_df.columns) - 1`` columns. Shape ``(0, n)`` for no ids.
    """
    n_values = len(gtex_df.columns) - 1
    rows = []
    for gene_id in geneIds:
        matches = gtex_df[gtex_df.Name == gene_id]
        if len(matches) > 0:
            # First matching row, without its leading column.
            rows.append(matches.to_numpy()[0][1:])
        else:
            print(f"For {gene_id} was no Expression Data found.")
            rows.append(np.full(n_values, fill_value))

    if not rows:
        return np.empty((0, n_values))
    # Stack once at the end instead of re-allocating the array on every iteration.
    return np.vstack(rows)


In [None]:
def get_GeneExpr_for_geneId(gtex_df, geneId):
    """Return the expression values of one gene as a 1-D array.

    Parameters
    ----------
    gtex_df : pandas.DataFrame
        Expression table with a ``Name`` column; the first column of the row
        is dropped, so ``Name`` is expected to be the first column.
    geneId : str
        Identifier to look up in ``gtex_df.Name``.

    Returns
    -------
    numpy.ndarray
        ``len(gtex_df.columns) - 1`` values of the first matching row, or an
        all-NaN array of the same 1-D shape when the gene is not found.
    """
    red_df = gtex_df[gtex_df.Name == geneId]
    if len(red_df) > 0:
        return red_df.to_numpy()[0][1:]

    print(f"For {geneId} was no Expression Data found.")
    # Same 1-D shape as the "found" branch (the placeholder used to be 2-D, (1, n)).
    return np.full(len(gtex_df.columns) - 1, np.nan)

# Calculate gene expression for all TFBSs

In [8]:
# Folder with one CSV per TF (same value as the csv_folder assigned earlier in the notebook).
csv_folder = "/sybig/projects/GeneRegulation/data/jme/Bachelorarbeit/data/Single_TFBS/Protein_Region_single_TFBS_with_GTEx"


In [None]:
def get_flat_Expr_for_single_tfbs(csv_file):
    """Return the TF name and all its expression values as one flat array.

    Returns
    -------
    tuple
        ``(tf_name, expr)``: the first value of the ``tf`` column and the
        ``All_tissues`` column of the expanded table as a numpy array.
    """
    wide_df = get_df_for_single_tfbs(csv_file)
    name = wide_df.tf.unique()[0]
    long_df = flatten_tissues_expand_df(wide_df)
    return name, long_df.All_tissues.to_numpy()

In [None]:
# Quick check of the helper on the ESR1 file.
get_flat_Expr_for_single_tfbs(f"{csv_folder}/ESR1.csv")

In [None]:
def get_flat_Expr_all_tfbs(csv_folder):
    """Run ``get_flat_Expr_for_single_tfbs`` on every ``*.csv`` file in a folder.

    Returns
    -------
    dict
        Maps TF name to its flat array of expression values.
    """
    csv_paths = glob.glob(f"{csv_folder}/*.csv")
    return dict(get_flat_Expr_for_single_tfbs(path) for path in csv_paths)

In [None]:
# TF name -> flat expression array for every CSV in csv_folder.
expr_all_tf_dict = get_flat_Expr_all_tfbs(csv_folder)

In [None]:
# seaborn is already imported as `sns` in the import cell at the top of the notebook,
# so it is not re-imported here.
tfbs_df = get_df_for_single_tfbs(csv_file)
tfbs_df_flatten = flatten_tissues_expand_df(tfbs_df)
# Preview only; the expanded table has one row per (binding site, tissue).
tfbs_df_flatten.head()

In [None]:
# pd.concat takes the frames as ONE sequence; a second positional argument is read as `axis`.
new_df = pd.concat([tfbs_df_flatten, tfbs_df_flatten], ignore_index=True)

In [None]:
def plot_expr_all_tf(expr_dict):
    """Draw one horizontal box of log2 expression per TF, ordered by median.

    Zero and non-finite expression values are removed before taking log2
    (log2(0) is -inf and would distort both the ordering and the boxes).
    TFs without any positive value are reported and left out.

    Parameters
    ----------
    expr_dict : dict
        Maps TF name to an array-like of expression values.
    """
    log_expr = {}
    for tf_name, expr in expr_dict.items():
        values = np.asarray(expr, dtype=float)
        values = values[np.isfinite(values) & (values > 0)]
        if values.size == 0:
            print(f"{tf_name}: no positive expression values, skipped.")
            continue
        log_expr[tf_name] = np.log2(values)

    if not log_expr:
        print("No expression data to plot.")
        return

    # Order the boxes by their median log2 expression.
    ordered = sorted(log_expr.items(), key=lambda item: np.median(item[1]))
    labels = [tf_name for tf_name, _values in ordered]
    data_log = [values for _tf_name, values in ordered]

    fig, ax = plt.subplots(figsize=(10,48))
    # Only keyword options: a second positional argument would be read as `notch`.
    ax.boxplot(data_log, labels=labels, vert=False)
    ax.set_xlabel("log2(Expr)")
    ax.grid()

    return

In [None]:
# One box per TF, using the dictionary built by get_flat_Expr_all_tfbs.
plot_expr_all_tf(expr_all_tf_dict)

In [None]:
# NOTE(review): at module level `data` is the pybedtools.BedTool created in the first
# code cell, not the local `data` inside plot_expr_all_tf — confirm this is the intended output.
data

In [7]:
def get_df_for_all_tfbs(csv_folder):
    """Read every ``*.csv`` file of a folder into one combined DataFrame.

    Each file is read with ``names=columns``, i.e. every line is treated as
    data, exactly as ``get_df_for_single_tfbs`` does. (Reading with the
    default header handling and renaming afterwards drops the first record
    of every file.)

    Parameters
    ----------
    csv_folder : str
        Folder containing the per-TF CSV files.

    Returns
    -------
    pandas.DataFrame
        All rows of all files with the 63 named columns. The row index is not
        reset, so it restarts at 0 for every file. An empty frame with the
        same columns is returned when the folder has no CSV files.
    """
    # Specify column names
    columns_01 = ["chr","geneID","tf","close_tss","dist_tss","strand_orientation","homotypic_count","all_tfbs_count","all_tfbs_unique_count"]
    columns_02 = ['Adipose - Subcutaneous', 'Adipose - Visceral (Omentum)',
       'Adrenal Gland', 'Artery - Aorta', 'Artery - Coronary',
       'Artery - Tibial', 'Bladder', 'Brain - Amygdala',
       'Brain - Anterior cingulate cortex (BA24)',
       'Brain - Caudate (basal ganglia)', 'Brain - Cerebellar Hemisphere',
       'Brain - Cerebellum', 'Brain - Cortex', 'Brain - Frontal Cortex (BA9)',
       'Brain - Hippocampus', 'Brain - Hypothalamus',
       'Brain - Nucleus accumbens (basal ganglia)',
       'Brain - Putamen (basal ganglia)', 'Brain - Spinal cord (cervical c-1)',
       'Brain - Substantia nigra', 'Breast - Mammary Tissue',
       'Cells - Cultured fibroblasts', 'Cells - EBV-transformed lymphocytes',
       'Cervix - Ectocervix', 'Cervix - Endocervix', 'Colon - Sigmoid',
       'Colon - Transverse', 'Esophagus - Gastroesophageal Junction',
       'Esophagus - Mucosa', 'Esophagus - Muscularis', 'Fallopian Tube',
       'Heart - Atrial Appendage', 'Heart - Left Ventricle', 'Kidney - Cortex',
       'Kidney - Medulla', 'Liver', 'Lung', 'Minor Salivary Gland',
       'Muscle - Skeletal', 'Nerve - Tibial', 'Ovary', 'Pancreas', 'Pituitary',
       'Prostate', 'Skin - Not Sun Exposed (Suprapubic)',
       'Skin - Sun Exposed (Lower leg)', 'Small Intestine - Terminal Ileum',
       'Spleen', 'Stomach', 'Testis', 'Thyroid', 'Uterus', 'Vagina',
       'Whole Blood']
    columns = columns_01 + list(columns_02)

    # Sorted so the row order does not depend on the file-system listing order.
    csv_files = sorted(glob.glob(f"{csv_folder}/*.csv"))
    if not csv_files:
        return pd.DataFrame(columns=columns)

    # Read everything first and concatenate once; concatenating inside the loop
    # copies the growing frame on every iteration.
    frames = [pd.read_csv(csv_file, names=columns) for csv_file in csv_files]
    return pd.concat(frames)

        

In [6]:
# Scratch check of the concatenation logic on the first two CSV files only.
columns_01 = ["chr","geneID","tf","close_tss","dist_tss","strand_orientation","homotypic_count","all_tfbs_count","all_tfbs_unique_count"]
columns_02 = ['Adipose - Subcutaneous', 'Adipose - Visceral (Omentum)',
    'Adrenal Gland', 'Artery - Aorta', 'Artery - Coronary',
    'Artery - Tibial', 'Bladder', 'Brain - Amygdala',
    'Brain - Anterior cingulate cortex (BA24)',
    'Brain - Caudate (basal ganglia)', 'Brain - Cerebellar Hemisphere',
    'Brain - Cerebellum', 'Brain - Cortex', 'Brain - Frontal Cortex (BA9)',
    'Brain - Hippocampus', 'Brain - Hypothalamus',
    'Brain - Nucleus accumbens (basal ganglia)',
    'Brain - Putamen (basal ganglia)', 'Brain - Spinal cord (cervical c-1)',
    'Brain - Substantia nigra', 'Breast - Mammary Tissue',
    'Cells - Cultured fibroblasts', 'Cells - EBV-transformed lymphocytes',
    'Cervix - Ectocervix', 'Cervix - Endocervix', 'Colon - Sigmoid',
    'Colon - Transverse', 'Esophagus - Gastroesophageal Junction',
    'Esophagus - Mucosa', 'Esophagus - Muscularis', 'Fallopian Tube',
    'Heart - Atrial Appendage', 'Heart - Left Ventricle', 'Kidney - Cortex',
    'Kidney - Medulla', 'Liver', 'Lung', 'Minor Salivary Gland',
    'Muscle - Skeletal', 'Nerve - Tibial', 'Ovary', 'Pancreas', 'Pituitary',
    'Prostate', 'Skin - Not Sun Exposed (Suprapubic)',
    'Skin - Sun Exposed (Lower leg)', 'Small Intestine - Terminal Ileum',
    'Spleen', 'Stomach', 'Testis', 'Thyroid', 'Uterus', 'Vagina',
    'Whole Blood']
columns = columns_01 + list(columns_02)


# Requires `csv_folder` from the cell above; run the cells in order.
csv_files = glob.glob(f"{csv_folder}/*.csv")
test_files = csv_files[:2]
# names=columns treats every line as data, as get_df_for_single_tfbs does; reading with
# the default header handling would drop the first record of each file.
test_frames = [pd.read_csv(file, names=columns) for file in test_files]
# Concatenate once; an intermediate `df` is no longer assigned, so the summary table
# named `df` from the correlation section is left untouched.
c_df = pd.concat(test_frames) if test_frames else pd.DataFrame(columns=columns)


NameError: name 'csv_folder' is not defined

In [10]:
# Combined table of all per-TF CSV files in csv_folder.
combined_df = get_df_for_all_tfbs(csv_folder)

In [None]:
# Display the combined table.
combined_df

In [5]:
# Requires `combined_df` from the cell above; run the cells in order.
# x/y are passed by keyword: recent seaborn versions accept only `data` positionally.
sns.boxplot(data=combined_df, x="tf", y="Vagina")

NameError: name 'combined_df' is not defined