In [1]:
import pandas as pd
import os
import shutil


# Input locations.
# NOTE(review): these are absolute, machine-specific paths; adjust them here
# when running the notebook on another machine.
ChIP_folder = "/data/home/natant/Negatives/Data/Encode690/ENCODE_hg38"
# The ENCODE metadata table sits directly inside ChIP_folder, so build its path
# from ChIP_folder instead of repeating the directory literal.
metadata_file = os.path.join(ChIP_folder, "metadata.csv")
metadata_file_maxatac = "/data/home/natant/Negatives/Data/maxATAC/maxatac_chip.csv"


In [2]:

# Read both metadata tables into DataFrames.
# The maxATAC table is parsed as tab-delimited (despite its .csv file name);
# the ENCODE table is parsed with pandas' default comma delimiter.
metadata_maxatac_df = pd.read_csv(metadata_file_maxatac, sep='\t')
metadata_df = pd.read_csv(metadata_file)

def extract_tf_name(tf):
    """Return a normalized TF key used to compare names across datasets.

    Keeps only the text before the first underscore (the whole string when
    there is none), deletes every "-" and lowercases the result,
    e.g. "EZH2_(39875)" -> "ezh2".
    """
    prefix, _sep, _suffix = tf.partition('_')
    return prefix.lower().replace('-', '')

# Add a normalized TF key to each table so names can be compared across datasets
metadata_maxatac_df["tf_base"] = metadata_maxatac_df["tf"].apply(extract_tf_name)
metadata_df["antibody_base"] = metadata_df["antibody"].apply(extract_tf_name)

# Sets of normalized TF keys found in each table
maxatac_tfs_base = set(metadata_maxatac_df["tf_base"].unique())
df_tfs_base = set(metadata_df["antibody_base"].unique())

# Keys shared by both tables, and keys found in only one of them
common_tfs_base = maxatac_tfs_base & df_tfs_base
unique_maxatac_tfs_base = maxatac_tfs_base - df_tfs_base
unique_df_tfs_base = df_tfs_base - maxatac_tfs_base

# Print the results
print(f"Number of common base TFs: {len(common_tfs_base)}")
print(f"Number of unique base TFs in metadata_maxatac_df: {len(unique_maxatac_tfs_base)}")
print(f"Number of unique base TFs in df: {len(unique_df_tfs_base)}")

# Optionally, print the unique base TFs
print(f"Unique base TFs in metadata_maxatac_df: {unique_maxatac_tfs_base}")
print(f"Unique base TFs in df: {unique_df_tfs_base}")

# First original spelling per normalized key, in file order.
# drop_duplicates keeps the first row of every key, which is the same value the
# per-key boolean filter + .values[0] produced, but in a single pass per table.
maxatac_first_tf = metadata_maxatac_df.drop_duplicates("tf_base").set_index("tf_base")["tf"]
df_first_tf = metadata_df.drop_duplicates("antibody_base").set_index("antibody_base")["antibody"]

# Walk the shared keys in sorted order: the iteration order of a set of strings
# is not stable across kernel restarts, which made the row order of the
# matched table change from run to run.
matched_tfs_list = [
    {"maxatac_tf": maxatac_first_tf.loc[tf_base], "df_tf": df_first_tf.loc[tf_base]}
    for tf_base in sorted(common_tfs_base)
]

# Passing `columns` keeps both columns present even when nothing matched,
# so later cells can still index matched_tfs["df_tf"].
matched_tfs = pd.DataFrame(matched_tfs_list, columns=["maxatac_tf", "df_tf"])

# Display the matched TFs
print(matched_tfs)



Number of common base TFs: 60
Number of unique base TFs in metadata_maxatac_df: 67
Number of unique base TFs in df: 110
Unique base TFs in metadata_maxatac_df: {'atf7', 'znf687', 'crem', 'arnt', 'lef1', 'znf592', 'tcf7', 'junb', 'nr2f1', 'myb', 'gabpa', 'nr2c2', 'smad5', 'e4f1', 'zhx2', 'zscan29', 'nfe2l2', 'foxp1', 'runx1', 'skil', 'pax8', 'gata4', 'mnt', 'rela', 'srebf1', 'cbx2', 'sox6', 'etv6', 'ash1l', 'zbed1', 'znf207', 'kmt2a', 'znf569', 'znf384', 'hes1', 'mbd2', 'nkrf', 'zfx', 'zbtb40', 'esrra', 'klf5', 'nr2c1', 'spi1', 'jun', 'cebpz', 'myc', 'znf24', 'elf4', 'neurod1', 'e2f8', 'srebf2', 'fos', 'rfx1', 'foxk2', 'nfatc3', 'rest', 'nfia', 'smad1', 'ybx1', 'zbtb11', 'znf282', 'gatad2b', 'cux1', 'znf407', 'nr2f6', 'nfxl1', 'pknox1'}
Unique base TFs in df: {'nfkb', 'grp20', 'sin3ak20', 'gata2', 'mta3', 'nrsf', 'gr', 'egfphdac8', 'hdac1', 'cmyc', 'sap30', 'ubf', 'tr4', 'pu.1', 'p300', 'sp4', 'foxp2', 'brf1', 'rpc155', 'irf4', 'pou2f2', 'baf155', 'bdp1', 'pol3', 'egfpjunb', 'jarid1a', 

In [4]:
def normalize_cell_name(cell):
    """Return a normalized cell-type key: every "-" removed, then lowercased.

    Example: "HCT-116" -> "hct116".
    """
    without_dashes = "".join(cell.split('-'))
    return without_dashes.lower()

# Add a normalized cell-type key to each table so names can be compared across datasets
metadata_maxatac_df["cell_base"] = metadata_maxatac_df["cell"].apply(normalize_cell_name)
metadata_df["cell_base"] = metadata_df["cell"].apply(normalize_cell_name)

# Sets of normalized cell-type keys found in each table
maxatac_cells_base = set(metadata_maxatac_df["cell_base"].unique())
df_cells_base = set(metadata_df["cell_base"].unique())

# Keys shared by both tables, and keys found in only one of them
common_cells_base = maxatac_cells_base & df_cells_base
unique_maxatac_cells_base = maxatac_cells_base - df_cells_base
unique_df_cells_base = df_cells_base - maxatac_cells_base

# Print the results
print(f"Number of common cell types: {len(common_cells_base)}")
print(f"Number of unique cell types in metadata_maxatac_df: {len(unique_maxatac_cells_base)}")
print(f"Number of unique cell types in df: {len(unique_df_cells_base)}")

# Optionally, print the unique cell types
print(f"Unique cell types in metadata_maxatac_df: {unique_maxatac_cells_base}")
print(f"Unique cell types in df: {unique_df_cells_base}")

# First original spelling per normalized key, in file order.
# drop_duplicates keeps the first row of every key, which is the same value the
# per-key boolean filter + .values[0] produced, but in a single pass per table.
maxatac_first_cell = metadata_maxatac_df.drop_duplicates("cell_base").set_index("cell_base")["cell"]
df_first_cell = metadata_df.drop_duplicates("cell_base").set_index("cell_base")["cell"]

# Walk the shared keys in sorted order: the iteration order of a set of strings
# is not stable across kernel restarts, which made the row order of the
# matched table change from run to run.
matched_cells_list = [
    {"maxatac_cell": maxatac_first_cell.loc[cell_base], "df_cell": df_first_cell.loc[cell_base]}
    for cell_base in sorted(common_cells_base)
]

# Passing `columns` keeps both columns present even when nothing matched,
# so later cells can still index matched_cells["df_cell"].
matched_cells = pd.DataFrame(matched_cells_list, columns=["maxatac_cell", "df_cell"])

# Display the matched cell types
print(matched_cells)

# Original (un-normalized) names of the cell types that have no counterpart in the other table
unmatched_maxatac_cells = metadata_maxatac_df[metadata_maxatac_df["cell_base"].isin(unique_maxatac_cells_base)]["cell"].unique()
unmatched_df_cells = metadata_df[metadata_df["cell_base"].isin(unique_df_cells_base)]["cell"].unique()

# Print the results
print(f"Unmatched cell types in metadata_maxatac_df: {unmatched_maxatac_cells}")
print(f"Unmatched cell types in df: {unmatched_df_cells}")

Number of common cell types: 10
Number of unique cell types in metadata_maxatac_df: 10
Number of unique cell types in df: 81
Unique cell types in metadata_maxatac_df: {'lovo', 'thp1', 'jurkat', 'rpmi8402', 'hela', 'wa09', 'gm23338', 'hek293t', 'mv411', 'pc3'}
Unique cell types in df: {'u87', 'hcpepic', 'gm06990', 'hcfaa', 'gm19193', 'u2os', 'ag09319', 'gm12864', 'gm10847', 'gliobla', 'sknsh_ra', 'hsmm', 'ag09309', 'helas3', 'pbde', 'nha', 'ag04449', 'hl60', 'caco2', 'pfsk1', 'hpf', 'hff', 'be2_c', 'dnd41', 'huvec', 'gm12891', 'aoaf', 'hbmec', 'hmf', 'gm18505', 'gm19240', 'gm12873', 'hrpepic', 'hasp', 'gm12875', 'gm19099', 'sknmc', 'ecc1', 'nhek', 'saec', 'hre', 'gm12865', 'ag10803', 'ag04450', 'pbdefetal', 'progfib', 'hac', 'nb4', 'nhlf', 'gm18951', 'gm19239', 'fibrobl', 'hsmmtube', 'gm15510', 'hpaf', 'gm12892', 'mcf10aersrc', 'h1hesc', 'nhdfneo', 'gm19238', 'gm12872', 'rptec', 'nt2d1', 'wi38', 'gm12874', 'gm12801', 'nhdfad', 'osteobl', 'hvmf', 'gm18526', 'hek293trex', 'heepic', 'hmec'

In [5]:
# Keep the ENCODE rows whose *normalized* cell type and TF also occur in maxATAC.
# matched_cells / matched_tfs store only the first raw spelling seen for each
# normalized key, so filtering on the raw "cell" / "antibody" values would
# silently drop rows that use a different spelling of the same key (for
# example a second antibody label sharing the text before the first "_").
df_filtered_cells = metadata_df[metadata_df["cell_base"].isin(common_cells_base)]
df_filtered = df_filtered_cells[df_filtered_cells["antibody_base"].isin(common_tfs_base)]

# Representative raw spellings per matched key; still defined because earlier
# versions of this cell exposed these names.
matched_cells_set = set(matched_cells["df_cell"])
matched_tfs_set = set(matched_tfs["df_tf"])

# Lookup tables: normalized key -> name used in the maxATAC dataset
cell_lookup = pd.DataFrame({
    "cell_base": matched_cells["df_cell"].map(normalize_cell_name),
    "maxatac_cell": matched_cells["maxatac_cell"],
})
tf_lookup = pd.DataFrame({
    "antibody_base": matched_tfs["df_tf"].map(extract_tf_name),
    "maxatac_tf": matched_tfs["maxatac_tf"],
})

# Add the corresponding maxATAC cell type and TF name to every row.
# validate="many_to_one" makes pandas raise if a lookup key is duplicated, which
# would otherwise multiply rows. df_cell / df_tf repeat each row's own ENCODE
# names so the resulting columns (maxatac_cell, df_cell, maxatac_tf, df_tf)
# and their order stay the same as before.
df_filtered = (
    df_filtered
    .merge(cell_lookup, on="cell_base", how="left", validate="many_to_one")
    .assign(df_cell=lambda frame: frame["cell"])
    .merge(tf_lookup, on="antibody_base", how="left", validate="many_to_one")
    .assign(df_tf=lambda frame: frame["antibody"])
)


In [6]:
# Show the ENCODE metadata after the cell-type filter only (the TF filter is not applied to this frame)
df_filtered_cells

Unnamed: 0,project,lab,composite,dataType,view,cell,treatment,antibody,control,dataVersion,...,controlId,quality,tableName,type,md5sum,size,old_filename,filename,antibody_base,cell_base
2,wgEncode,Broad,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,GM12878,,CTCF,std,ENCODE Mar 2012 Freeze,...,wgEncodeEH000037,good,wgEncodeAwgTfbsBroadGm12878CtcfUniPk,narrowPeak,fef0af7fe1e724159e665085da53efb1,982K,wgEncodeAwgTfbsBroadGm12878CtcfUniPk.narrowPea...,GM12878__CTCF__None.narrowPeak,ctcf,gm12878
3,wgEncode,Broad,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,GM12878,,EZH2_(39875),std,ENCODE Mar 2012 Freeze,...,wgEncodeEH000037,good,wgEncodeAwgTfbsBroadGm12878Ezh239875UniPk,narrowPeak,9a87661953cb6419847abd00599f0a30,66K,wgEncodeAwgTfbsBroadGm12878Ezh239875UniPk.narr...,GM12878__EZH2_(39875)__None.narrowPeak,ezh2,gm12878
12,wgEncode,Broad,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,HepG2,,CTCF,std,ENCODE Mar 2012 Freeze,...,wgEncodeEH000084,good,wgEncodeAwgTfbsBroadHepg2CtcfUniPk,narrowPeak,6f8ab32864e515e0362494a3a8df2b8e,892K,wgEncodeAwgTfbsBroadHepg2CtcfUniPk.narrowPeak.gz,HepG2__CTCF__None.narrowPeak,ctcf,hepg2
13,wgEncode,Broad,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,HepG2,,EZH2_(39875),std,ENCODE Mar 2012 Freeze,...,wgEncodeEH000084,caution,wgEncodeAwgTfbsBroadHepg2Ezh239875UniPk,narrowPeak,5e3ba666537059025883191074765280,85K,wgEncodeAwgTfbsBroadHepg2Ezh239875UniPk.narrow...,HepG2__EZH2_(39875)__None.narrowPeak,ezh2,hepg2
23,wgEncode,Broad,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,K562,,CHD1_(A301-218A),std,ENCODE Mar 2012 Freeze,...,wgEncodeEH000052,good,wgEncodeAwgTfbsBroadK562Chd1a301218aUniPk,narrowPeak,a16380fc02b67d668afe29a58a2d429f,227K,wgEncodeAwgTfbsBroadK562Chd1a301218aUniPk.narr...,K562__CHD1_(A301-218A)__None.narrowPeak,chd1,k562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,wgEncode,UW,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,HCT-116,,CTCF,std,ENCODE Mar 2012 Freeze,...,wgEncodeEH000950,good,wgEncodeAwgTfbsUwHct116CtcfUniPk,narrowPeak,5353919b8dec58898d42391ed013ed3b,1006K,wgEncodeAwgTfbsUwHct116CtcfUniPk.narrowPeak.gz,HCT-116__CTCF__None.narrowPeak,ctcf,hct116
665,wgEncode,UW,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,HEK293,,CTCF,std,ENCODE Mar 2012 Freeze,...,wgEncodeEH000464,good,wgEncodeAwgTfbsUwHek293CtcfUniPk,narrowPeak,85054a5025553ab029c7eb16c94dcbc4,1.1M,wgEncodeAwgTfbsUwHek293CtcfUniPk.narrowPeak.gz,HEK293__CTCF__None.narrowPeak,ctcf,hek293
667,wgEncode,UW,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,HepG2,,CTCF,std,ENCODE Mar 2012 Freeze,...,wgEncodeEH000470,good,wgEncodeAwgTfbsUwHepg2CtcfUniPk,narrowPeak,3fdf2bfeeb9bf469510d615efec560ae,905K,wgEncodeAwgTfbsUwHepg2CtcfUniPk.narrowPeak.gz,HepG2__CTCF__None.narrowPeak,ctcf,hepg2
679,wgEncode,UW,wgEncodeAwgTfbsUniform,ChipSeq,Peaks,K562,,CTCF,std,ENCODE Mar 2012 Freeze,...,wgEncodeEH000471,good,wgEncodeAwgTfbsUwK562CtcfUniPk,narrowPeak,365af2fdea48c046dafad14b73f69b5d,850K,wgEncodeAwgTfbsUwK562CtcfUniPk.narrowPeak.gz,K562__CTCF__None.narrowPeak,ctcf,k562


In [None]:

# Write df_filtered to the output folder as metadata.csv.
# `output_folder` is not assigned in any cell shown in this notebook, so a fresh
# "Restart & Run All" would stop here with a NameError. Reuse a value that is
# already defined in the kernel; otherwise fall back to a sub-folder of
# ChIP_folder.
# NOTE(review): the fallback location is a placeholder; confirm the intended
# destination and move the definition to the configuration cell.
if "output_folder" not in globals():
    output_folder = os.path.join(ChIP_folder, "maxatac_matched")
os.makedirs(output_folder, exist_ok=True)  # to_csv does not create missing directories

# The table is written tab-separated even though the file is named .csv
meta_output_path = os.path.join(output_folder, "metadata.csv")
df_filtered.to_csv(meta_output_path, sep='\t', index=False)