In [1]:
# --- Imports ---
import pandas as pd

In [2]:
# Load K562 E-Gs
# Read the tab-separated K562 network table into df1.
df1 = pd.read_csv(
    'K562/K562.network.tsv',
    delimiter='\t',
)

# Report (rows, columns), then preview the first five rows.
print(df1.shape)
df1.head()

(78062, 9)


Unnamed: 0,chrom,chromStart,chromEnd,name,class,targetGene,targetEnsemblID,cellType,MPRabc
0,chr1,9863,10662,intergenic|chr1:9863-10662,intergenic,OR4F5,ENSG00000186092,K562,0.352643
1,chr1,9863,10662,intergenic|chr1:9863-10662,intergenic,FAM138F,ENSG00000282591,K562,0.45579
2,chr1,9863,10662,intergenic|chr1:9863-10662,intergenic,FAM138A,ENSG00000237613,K562,0.45579
3,chr1,29099,29599,intergenic|chr1:29099-29599,intergenic,OR4F5,ENSG00000186092,K562,0.312436
4,chr1,29099,29599,intergenic|chr1:29099-29599,intergenic,FAM138F,ENSG00000282591,K562,0.766666


In [4]:
# Load K562 TFs
# Read the tab-separated FIMO motif-hit table into df2.
df2 = pd.read_csv(
    'K562/K562.FIMO.VERIFIED.tsv',
    delimiter='\t',
)

# Report (rows, columns), then preview the first five rows.
print(df2.shape)
df2.head()

(1472664, 10)


Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,motifID,motifAltID,pValue,matchedSequence
0,chr1,10104,10122,intergenic|chr1:9863-10662,10.7982,+,MA0073.2,RREB1,6.81e-06,ACCCAACCCTAACCCTAAC
1,chr1,10286,10304,intergenic|chr1:9863-10662,9.94495,+,MA0073.2,RREB1,9.29e-06,ACCCCAACCCCAACCCCAA
2,chr1,10287,10305,intergenic|chr1:9863-10662,14.0826,+,MA0073.2,RREB1,1.84e-06,CCCCAACCCCAACCCCAAC
3,chr1,10288,10306,intergenic|chr1:9863-10662,15.7523,+,MA0073.2,RREB1,9.13e-07,CCCAACCCCAACCCCAACC
4,chr1,10292,10310,intergenic|chr1:9863-10662,9.94495,+,MA0073.2,RREB1,9.29e-06,ACCCCAACCCCAACCCCAA


In [5]:
# Load K562 SNVs
# The BED file carries no header row, so the four column names are supplied here.
df3 = pd.read_csv(
    'variants/K562.variants.hg38.bed',
    delimiter='\t',
    header=None,
    names=['chrom', 'chromStart', 'chromEnd', 'name'],
)

# Report (rows, columns), then preview the first five rows.
print(df3.shape)
df3.head()

(6803, 4)


Unnamed: 0,chrom,chromStart,chromEnd,name
0,chr1,2188393,2188394,rs2460002
1,chr1,2189185,2189186,rs2503700
2,chr1,2214725,2214726,rs78265569
3,chr1,2216434,2216435,rs10910028
4,chr1,2231567,2231568,rs10910031


In [6]:
# Filter FIMO
# Keep only the rows whose motifAltID is GATA1::TAL1, and rename the generic
# coordinate/name columns so they stay distinguishable from the variant
# columns once the two tables are joined.
motifs = (
    df2.loc[df2['motifAltID'] == 'GATA1::TAL1']
    .rename(
        columns={
            'chromStart': 'motifStart',
            'chromEnd': 'motifEnd',
            'name': 'region_name',
        }
    )
)

In [7]:
# Format variants
# Give the variant table variant-specific column names (chrom is left as-is
# because it is the join key used in the overlap step).
variants = df3.rename(
    columns={
        'chromStart': 'varStart',
        'chromEnd': 'varEnd',
        'name': 'rsID',
    }
)

In [8]:
# Calculate overlaps
# Step 1: pair every motif hit with every variant on the same chromosome.
overlaps = motifs.merge(variants, on='chrom', how='inner')

# Step 2: keep only the pairs whose intervals intersect.
# NOTE(review): this test treats both intervals as half-open [start, end).
# In the FIMO preview, matchedSequence is one base longer than
# chromEnd - chromStart, so the motif interval may actually be closed; if so,
# a variant on one boundary base of a motif would be missed here.
# TODO confirm the coordinate convention of K562.FIMO.VERIFIED.tsv.
overlaps = overlaps[
    (overlaps['varStart'] < overlaps['motifEnd'])
    & (overlaps['varEnd'] > overlaps['motifStart'])
].copy()

# Label previously read "Motf-variant"; spelling corrected.
print(f'Motif-variant overlaps: {overlaps.shape[0]}')

Motf-variant overlaps: 2


In [9]:
# Example regions
# Distinct region names that have at least one motif/variant overlap.
hit_regions = overlaps['region_name'].unique()

# Label previously read "Example regionst"; spelling corrected.
print(f'Example regions: {len(hit_regions)}')

Example regionst: 2


In [10]:
# Results
# Pull every row of the network table whose region name is one of the hit
# regions; a region can appear once per target gene.
df_final = df1.loc[df1['name'].isin(hit_regions)].copy()

# Display the (small) result table in full.
df_final

Unnamed: 0,chrom,chromStart,chromEnd,name,class,targetGene,targetEnsemblID,cellType,MPRabc
24836,chr15,31364317,31364817,genic|chr15:31364317-31364817,genic,KLF13,ENSG00000169926,K562,0.576051
24837,chr15,31364317,31364817,genic|chr15:31364317-31364817,genic,CHRNA7,ENSG00000175344,K562,0.693924
48819,chr21,34995969,34996728,intergenic|chr21:34995969-34996728,intergenic,RUNX1,ENSG00000159216,K562,0.352798
