# TF2DNA Transcription Factor Target Library

Author: Zachary Flamholz  
Date: 07-2018  
Database: http://fiserlab.org/tf2dna_db//index.html  
Data: http://fiserlab.org/tf2dna_db//downloads.html    

In [1]:
import datetime
import glob
import os
import sys

import numpy as np
import pandas as pd

# Versions of Modules in Use

In [2]:
## record the interpreter, OS and library versions this notebook was run with
## (the `version_information` IPython extension must be installed for %load_ext to succeed)
%load_ext version_information
%version_information numpy, pandas

Software,Version
Python,3.6.5 64bit [GCC 4.2.1 Compatible Apple LLVM 9.1.0 (clang-902.0.39.1)]
IPython,6.4.0
OS,Darwin 17.5.0 x86_64 i386 64bit
numpy,1.14.5
pandas,0.23.1
Wed Jul 18 13:05:33 2018 EDT,Wed Jul 18 13:05:33 2018 EDT


## Read in the data (each experiment is stored in a separate file)

In [3]:
## Find every *.pscan file one folder below input/TF2DNA_yeast/ and load each
## one as a DataFrame. names[i] is the term name that belongs to dfs[i].

def term_name_from_path(pscan_path):
    """Build the 'TF_PMID' term name for a single .pscan file path.

    The TF part is the file name up to its first '.'; the PMID part is the
    fourth '_'-separated field of the folder that directly contains the file.
    Both pieces are read from the END of the path, so the result does not
    depend on how deep the input directory sits or on the OS path separator.

    Parameters
    ----------
    pscan_path : str
        Path to a .pscan file, e.g. '<root>/<a_b_c_PMID-123>/<TF>.pscan'.

    Returns
    -------
    str
        The term name, e.g. 'CUP9_PMID-19111667'.

    Raises
    ------
    IndexError
        If the containing folder name has fewer than four '_'-separated fields.
    """
    folder_name = os.path.basename(os.path.dirname(pscan_path))
    tf_name = os.path.basename(pscan_path).split('.')[0]
    return tf_name + '_' + folder_name.split('_')[3]


path = 'input/TF2DNA_yeast/*'
folders = glob.glob(path)
names = []
dfs = []
## NOTE: glob does not guarantee any particular ordering of its results, so the
## order of names/dfs (and therefore which experiment is dfs[0]) can differ
## between machines.
for folder in folders:
    for pscan_path in glob.glob(os.path.join(folder, '*.pscan')):
        ## the name has the form TF_PMID, this becomes the term name
        names.append(term_name_from_path(pscan_path))
        dfs.append(pd.read_csv(pscan_path, sep='\t', header=0))

In [4]:
## preview the first five rows of the first loaded experiment
dfs[0].head()

Unnamed: 0,tf_name,target_name,start_position,end_position,direction,binding_score,p_value,binding_sites
0,CUP9,YLR157W-A,475264,475814,(+),937.45,0.000358,475335:937.45:0.000358
1,CUP9,YIL067C,237707,238257,(-),1026.57,0.000196,238167:1026.57:0.000196
2,CUP9,IPL1,157542,158092,(-),937.45,0.000358,157587:937.45:0.000358
3,CUP9,AIM6,30156,30706,(+),861.06,0.000553,30627:861.06:0.000553
4,CUP9,IMA5,26036,26586,(-),937.45,0.000358,26424:937.45:0.000358


In [74]:
## term name (TF_PMID) that goes with dfs[0]
names[0]

'CUP9_PMID-19111667'

In [75]:
## one line per loaded experiment: the number of rows in its table
for n_rows in map(len, dfs):
    print(n_rows)

2469
1483
1129
1604
3363
2460
1598
1556
2322
1409
1277
1813
1698
1771
2137
1983
1595
3680
2236
1807
4345
1937
1913
1443
1989
3819
1411
1969
1963
2102
2260
1689
3112
2016
2253
1778
2005
1992
1207
1920
2967
1900
1488
1976
1450
1934
3014
2428
2083
1999
2989
1674
3943
1493
2122
1972
1486
3167
2601
1965
1681
1578
1868
1753
2017
3964
2617
1891
1595
2025
1311
2121
2013
2161
1517
1221
2981
1644
2933
4106
1982
3610
500
2030
2325
2051
1928
1891
1806
1491
1675
2476
2397
4000
5362
2094
2882
2113
1214
3885
1797
1731
1882
2520
1201
1535
2044
2943
1795
1177
1665
435
2481
1775
1642
1886
1730
294
2167
3912
2399
619
1430
3075
1963
1889
4035
1694
1764
3247
0
1843
1387
210
2184
3875
4902
2093
2290
1608
4787
4287
1970
1612
2411
1380
3552
1347
2648
2054
1813
1516
2281
4032
1468
1418
1768
1768
1413
3116
3094
1992
1105
3551
2007
2015
1834
2536
1178
2786
1312
1350
1333
2900
1794
1387
1428
1861
1709
3741
3362
3675
3223
2321
1752
2858
1459
1283
2110
1640
5031
1493
4450
2168
1078
2771
2250
2140
2248
3487
2384
519

In [76]:
## p-value at 0-based rank 2000 (i.e. the 2001st smallest) in the first experiment;
## this is the value the GMT-writing step uses as its cutoff
sorted_p_values = np.sort(dfs[0]['p_value'].values)
sorted_p_values[2000]

0.000725

## Write the GMT file from the data

In [77]:
## writing gmt files
## Every kept experiment becomes one line of the GMT file:
##   <term name> TAB <empty description> TAB <gene> TAB <gene> ... NEWLINE
## `lengths` records the target count of every experiment after the p-value
## cutoff; `lengths_kept` records it only for the experiments actually written.

TOP_N = 2000            # 0-based rank of the sorted p-value used as the cutoff
MIN_AFTER_CUTOFF = 100  # fewer rows than this after the strict cutoff triggers the widened cutoff
TIE_EPSILON = .00001    # widening added to the cutoff so rows tied at it are included
MAX_TARGETS = 2999      # experiments with more rows than this are discarded
MIN_TARGETS = 5         # experiments with fewer rows than this are discarded

## file name carries the current year and month, e.g. TF2DNA_yeast_2018_07.gmt
filename = 'TF2DNA_yeast_%s.gmt' % datetime.date.today().strftime('%Y_%m')
lengths = []
lengths_kept = []

## `with` closes (and flushes) the file even if the loop raises part-way through
with open(filename, 'w') as gmt_file:
    for name, exper in zip(names, dfs):

        if len(exper) > TOP_N:
            cutoff = np.sort(np.array(exper['p_value']))[TOP_N]
            strict = exper[exper['p_value'] < cutoff]

            ## sometimes a lot of targets share the cutoff p-value, so the strict
            ## `<` filter leaves very few genes; widen the cutoff to include the ties
            if len(strict) < MIN_AFTER_CUTOFF:
                selected = exper[exper['p_value'] < cutoff + TIE_EPSILON]
            else:
                selected = strict
        else:
            ## small experiments are used in full
            selected = exper

        lengths.append(len(selected))

        ## discard experiments with too many or too few targets
        if len(selected) > MAX_TARGETS or len(selected) < MIN_TARGETS:
            continue

        lengths_kept.append(len(selected))
        genes = selected['target_name'].unique()

        ## join the fields instead of appending "\t" after every gene, so the
        ## line does not end in a stray tab (which parsers that split on tabs
        ## would read as an extra, empty gene)
        fields = [str(name), ''] + [str(gene) for gene in genes]
        gmt_file.write('\t'.join(fields) + '\n')

In [78]:
## turn both length lists into numpy arrays so the summary cells can use array methods
lengths, lengths_kept = (np.array(sizes) for sizes in (lengths, lengths_kept))

In [79]:
## largest target count after the p-value cutoff, over all experiments (kept or discarded)
lengths.max()

2354

In [80]:
## largest target count among the experiments that were written to the GMT file
lengths_kept.max()

2354

In [81]:
## number of experiments discarded by the too-many / too-few targets filters
len(lengths) - len(lengths_kept)

11