# Analyzing TF Motifs Across Different Alu Elements

Abin Abraham 
Created: 2018-04-27 10:59:06

**Dedicated script to generate the full heatmap.**

Alu elements analyzed in Su, M. et al.:

- AluYa5
- AluYb8
- AluSp
- AluY
- AluSc
- AluSg
- AluSq
- AluSx
- AluJb
- AluJo

Reference: Su, M. et al., 2014. Evolution of Alu elements toward enhancers. Cell Reports, 7(2), pp.376–385.

In [3]:
import os, sys
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

%matplotlib inline 
np.set_printoptions(precision=5, suppress=True)  # suppress scientific float notation

In [4]:
### FILE PATHS
# Directory holding one FIMO output file per TE subfamily.
ROOT_PATH_TE_FIMO = "/dors/capra_lab/users/abraha1/projects/transposable_elements/data/fimo_repeatmasker/fimo_output/individualTE_fimo_output"

# Alu subfamilies of interest, in the order they are listed in the notebook header.
ALU_SUBFAMILIES = (
    "AluYa5", "AluYb8", "AluSp", "AluY", "AluSc",
    "AluSg", "AluSq", "AluSx", "AluJb", "AluJo",
)

# Map each subfamily name to its FIMO file name ("fimo_<name>.txt").
TE_FIMO_FILE = {name: "fimo_" + name + ".txt" for name in ALU_SUBFAMILIES}

In [5]:
### LOAD FIMO OUTPUT (AluJo)
# The file is read as tab-separated with header=None, so columns get integer labels.
fimo_path = os.path.join(ROOT_PATH_TE_FIMO, TE_FIMO_FILE['AluJo'])
df = pd.read_csv(fimo_path, sep="\t", header=None)

In [6]:
### CLEAN UP DATA
# Name the 12 positional columns, then discard the ones not used below.
# NOTE: this cell rebinds `df`; re-running it without reloading the file fails,
# because 12 names would be assigned to the 7 columns that remain.
fimo_columns = [
    "TF", "TE", "TE_coordinates", "TF_start_1based",
    "TF_end", "strand", "motif_score", "p_value",
    "motif_seq", "q_value", "num_bases_in_TE", "num_bases_in_TFmotif",
]
unused_columns = ["motif_seq", "p_value", "strand", "num_bases_in_TE", "num_bases_in_TFmotif"]

df.columns = fimo_columns
df = df.drop(columns=unused_columns)

# keep=False removes *every* row of a repeated (TF, TE_coordinates) pair,
# not just the extra occurrences.
nodups_df = df.drop_duplicates(subset=['TF', 'TE_coordinates'], keep=False)

In [7]:
# Sanity check of the de-duplication step. Each bare expression is displayed
# because ast_node_interactivity is set to "all".
nodups_df.shape  # (rows, columns) after dropping duplicated TF-TE pairs
df.shape  # (rows, columns) before de-duplication
nodups_df['TF'].nunique()  # number of distinct TF values remaining
nodups_df['TE_coordinates'].nunique()  # number of distinct TE coordinate values remaining

(735240, 7)

(1009361, 7)

274

47062

NOTE: a substantial number of rows (1,009,361 → 735,240) are dropped when duplicate TE-TF pairs are removed.

In [10]:
## GROUPBY
# Summarise motif scores per (TE, TF) pair, then pivot the TF level into columns;
# pairs that do not occur are filled with 0.
# NOTE(review): nodups_df holds at most one row per (TF, TE_coordinates) pair, so
# 'count' can only be 0 or 1 and 'mean' equals 'median' — confirm this is intended.
agg_func = {'motif_score': ['count', 'mean', 'median']}

pair_stats = nodups_df.groupby(by=['TE_coordinates', 'TF']).agg(agg_func)
gb_df = pair_stats.unstack(fill_value=0)

In [11]:
# Select the per-TF 'count' columns of the motif_score summary (TE rows x TF columns).
count_df = gb_df[('motif_score', 'count')]

### Heatmap of TF count ### 

In [12]:
# Plotting library used for the clustered heatmap below.
import seaborn as sns
sns.set(color_codes=True)

In [None]:
# Cluster the TF-count matrix only. `gb_df` also carries the mean and median
# score columns, which would be clustered together with the counts.
# NOTE(review): hierarchical clustering over every TE row may be slow or
# memory-hungry — confirm it completes, or subsample rows first.
g = sns.clustermap(count_df)

