# TF2DNA Transcription Factor Target Library

Author: Zachary Flamholz  
Date: 07-2018  
Database: http://fiserlab.org/tf2dna_db/index.html  
Data: http://fiserlab.org/tf2dna_db/downloads.html  

In [1]:
import datetime
import glob
import os
import sys

import numpy as np
import pandas as pd

# Versions of Modules in Use

In [2]:
# Load the version_information IPython extension and print the interpreter,
# OS and numpy/pandas versions this notebook was run with.
%load_ext version_information
%version_information numpy, pandas

Software,Version
Python,3.6.5 64bit [GCC 4.2.1 Compatible Apple LLVM 9.1.0 (clang-902.0.39.1)]
IPython,6.4.0
OS,Darwin 17.5.0 x86_64 i386 64bit
numpy,1.14.5
pandas,0.23.1
Wed Jul 18 13:04:08 2018 EDT,Wed Jul 18 13:04:08 2018 EDT


## Read in data (each experiment is stored in a separate file)

In [3]:
# Every entry matched by `path` is treated as one study folder; each *.pscan
# file inside such a folder is one TF experiment, read as a tab-separated table.
path = 'input/TF2DNA_worm/*'
# glob returns matches in file-system order, which is not guaranteed to be
# stable; sort so that positions in `names`/`dfs` are reproducible across runs.
folders = sorted(glob.glob(path))


def term_name_from_path(pscan_path):
    """Build the term name ``TF_PMID`` for one ``.pscan`` file.

    The TF part is the file name with only its final extension removed, so
    identifiers that themselves contain a dot are kept whole. The PMID part is
    the 4th ``_``-separated field of the parent folder's name.

    Parameters
    ----------
    pscan_path : str
        Path to a ``.pscan`` file located directly inside a study folder.

    Returns
    -------
    str
        ``<TF>_<4th field of the folder name>``.

    Raises
    ------
    ValueError
        If the parent folder name has fewer than four ``_``-separated fields.
    """
    # Work relative to the end of the path so the result does not depend on
    # how deep the input directory sits or on the platform's separator.
    folder_name = os.path.basename(os.path.dirname(pscan_path))
    tf_name = os.path.splitext(os.path.basename(pscan_path))[0]

    desc = folder_name.split('_')
    if len(desc) < 4:
        raise ValueError(
            "cannot extract PMID field from folder name %r" % folder_name)

    ## the name has the form TF_PMID, this becomes the term name
    return tf_name + '_' + desc[3]


names = []
dfs = []
for folder in folders:
    # non-directory entries matched by `path` simply yield no .pscan files
    for file in sorted(glob.glob(os.path.join(folder, '*.pscan'))):
        names.append(term_name_from_path(file))
        dfs.append(pd.read_csv(file, sep='\t', header=0))

In [4]:
# Preview the first rows of the second loaded experiment table.
dfs[1].head()

Unnamed: 0,tf_name,target_name,start_position,end_position,direction,binding_score,p_value,binding_sites
0,ceh-22,C27D8.1,12717410,12718010,(+),962.91,0.000412,12717661:962.91:0.000412
1,ceh-22,F16F9.4,8463177,8463777,(-),886.53,0.000597,8463269:886.53:0.000597
2,ceh-22,C17A2.3,3843110,3843710,(+),1077.5,0.000187,3843543:1077.50:0.000187 3843458:899.26:0.000550
3,ceh-22,tam-1,4962880,4963480,(-),937.45,0.000484,4962958:937.45:0.000484
4,ceh-22,scl-3,13059103,13059703,(+),861.06,0.00065,13059166:861.06:0.000650


In [145]:
# Term name built for the first loaded experiment.
names[0]

'nsy-7_PMID-19204119'

In [146]:
# One line per loaded experiment: the number of rows in its table.
for n_rows in map(len, dfs):
    print(n_rows)

7111
6960
17325
13945
8688
11420
11461
11188
6536
9953
14497
8623
7391
5170
3306
8232
8766
9934
11296
9402
10551
10260
7281
6351
9809
6812
7121
3630
14832
9089
9015
18344
9805
7048
7774
11883
16141
8188
5525
13098
8293
14797
7938
6085
13963
8894
7683
1606
4092
6140
5840
4207
5523
5213
5795
2809
7127
14899
7051
11005


In [147]:
# p-value found at position 2000 of the ascending-sorted p-values of the
# second experiment (the same expression the GMT-writing cell uses as cutoff).
sorted_pvalues = np.sort(dfs[1]['p_value'].values)
sorted_pvalues[2000]

0.000288

## write gmt file from data

In [148]:
## writing gmt files
# For every experiment: keep the rows with the smallest p-values, skip
# experiments whose filtered table is too large or too small, and write one
# line per kept experiment as
#     <term name> <TAB> <empty description> <TAB> gene1 <TAB> gene2 ...

# Selection thresholds (named here instead of being inline magic numbers).
TOP_N = 2000                 # sorted position whose p-value is the cutoff
MIN_ROWS_AFTER_CUTOFF = 100  # below this the strict cutoff is relaxed
PVALUE_SLACK = .00001        # amount added to the cutoff when relaxing it
MAX_ROWS = 2999              # experiments with more rows are discarded
MIN_ROWS = 5                 # experiments with fewer rows are discarded

# file name carries the current year and month, e.g. TF2DNA_worm_2018_07.gmt
filename = 'TF2DNA_worm_%s.gmt' % datetime.date.today().strftime('%Y_%m')
lengths = []        # row count of every experiment after p-value filtering
lengths_kept = []   # row count of the experiments actually written

# `with` guarantees the file is closed even if an experiment raises mid-loop
with open(filename, 'w') as file:
    for i, exper in enumerate(dfs):

        if len(exper) > TOP_N:
            val = np.sort(np.array(exper['p_value']))[TOP_N]
            test = exper[exper['p_value'] < val]

            ## when many rows are tied at the cutoff p-value the strict '<'
            ## leaves very few rows, so widen the cutoff slightly instead
            if len(test) < MIN_ROWS_AFTER_CUTOFF:
                exper = exper[exper['p_value'] < val + PVALUE_SLACK]
            else:
                exper = test

        lengths.append(len(exper))

        ## discard experiments with too many or too few targets
        if len(exper) > MAX_ROWS or len(exper) < MIN_ROWS:
            continue

        lengths_kept.append(len(exper))
        genes = exper['target_name'].unique()
        # join the fields so the line does not end with a dangling tab, which
        # GMT readers may interpret as an extra, empty gene
        fields = [names[i], ''] + [str(gene) for gene in genes]
        file.write('\t'.join(fields) + '\n')

In [149]:
# Turn both row-count collections into numpy arrays for the summaries below.
lengths, lengths_kept = (np.array(counts) for counts in (lengths, lengths_kept))

In [150]:
# Largest per-experiment row count after p-value filtering, including
# experiments that were later discarded by the size limits.
lengths.max()

4193

In [151]:
# Largest row count among the experiments that were written to the GMT file.
lengths_kept.max()

2503

In [152]:
# Number of experiments skipped by the too-many / too-few size limits.
len(lengths) - len(lengths_kept)

1