# TF2DNA Transcription Factor Target Library

Author: Zachary Flamholz  
Date: 07-2018  
Database: http://fiserlab.org/tf2dna_db//index.html   
Data: http://fiserlab.org/tf2dna_db//downloads.html  

In [2]:
import datetime
import glob
import os
import sys

import numpy as np
import pandas as pd

# Versions of Modules in Use

In [3]:
# Load the `version_information` IPython extension and print the versions of
# numpy and pandas in use for this run.
%load_ext version_information
%version_information numpy, pandas

Software,Version
Python,3.6.5 64bit [GCC 4.2.1 Compatible Apple LLVM 9.1.0 (clang-902.0.39.1)]
IPython,6.4.0
OS,Darwin 17.5.0 x86_64 i386 64bit
numpy,1.14.5
pandas,0.23.1
Wed Jul 18 13:03:00 2018 EDT,Wed Jul 18 13:03:00 2018 EDT


## Read in the data (each experiment is stored in its own `.pscan` file)

In [4]:
def load_pscan_experiments(folders):
    """Load every ``*.pscan`` table found directly inside the given folders.

    Parameters
    ----------
    folders : iterable of str
        Paths of the experiment folders. The folder name is split on ``'_'``
        and its fourth field is used as the PMID part of the term name.

    Returns
    -------
    names : list of str
        One term name per table, of the form ``TF_PMID``; the TF part is the
        file name up to its first ``'.'``.
    dfs : list of pandas.DataFrame
        The parsed tab-separated tables, in the same order as ``names``.

    Raises
    ------
    IndexError
        If a folder containing ``.pscan`` files has fewer than four
        ``'_'``-separated fields in its name.
    """
    names = []
    dfs = []
    # Sort so the order of experiments does not depend on the file system.
    for folder in sorted(folders):
        # Use the folder's own name rather than a fixed position in the path,
        # so the loader works for any depth, absolute paths and any separator.
        desc = os.path.basename(os.path.normpath(folder)).split('_')

        for pscan_path in sorted(glob.glob(os.path.join(folder, '*.pscan'))):
            exper = os.path.basename(pscan_path).split('.')

            ## the name has the form TF_PMID, this becomes the term name
            names.append(exper[0] + '_' + desc[3])

            dfs.append(pd.read_csv(pscan_path, sep='\t', header=0))
    return names, dfs


path = 'input/TF2DNA_fly/*'
folders = sorted(glob.glob(path))
names, dfs = load_pscan_experiments(folders)

In [5]:
# Preview the first rows of the first loaded table to check the parsed columns.
dfs[0].head()

Unnamed: 0,tf_name,target_name,start_position,end_position,direction,binding_score,p_value,binding_sites
0,bap,CG5895,16028334,16028634,(+),1308.09,2.6e-05,16028340:1308.09:0.000026
1,bap,CG8042,7968041,7968341,(-),1155.31,0.00011,7968118:1155.31:0.000110
2,bap,CG7536,17979445,17979745,(-),849.76,0.000369,17979568:849.76:0.000369 17979680:722.44:0.000573
3,bap,CG7220,6618589,6618889,(+),747.91,0.000515,6618604:747.91:0.000515
4,bap,Mulk,10269800,10270100,(+),798.83,0.000442,10269942:798.83:0.000442


In [6]:
# Show the term name built for the first loaded table.
names[0]

'bap_PMID-25215497'

In [38]:
# Print the number of rows of every loaded table, one per line.
for table in dfs:
    row_count = len(table)
    print(row_count)

4866
5085
2424
3752
6459
6854
6094
3515
6907
2662
6823
5684
6411
2757
8349
11800
3750
3828
5541
9427
3052
3581
5744
2128
5559
3344
5135
8842
10769
7433
5168
6192
5184
4911
4246
5362
3153
2980
5330
4696
8853
4119
4110
5659
3243
5899
6394
5166
7742
3540
6375
2859
3884
5744
9269
6461
4098
782
3984
2932
5523
7614
5315
3034
6059
4774
6144
3392
6209
5127
4579
5070
3110
2979
4321
7273
4398
4971
7748
3134
3347
2878
4655
8295
4143
3590
3043
6349
7234
6339
2981
5699
5764
6108
7896
3557
6339
3822
6339
5764
7171
3258
5710
7836
5764
5565
4633
3707
3657
5340
3705
3702
4897
3654
3896
6339
4401
5037
3884
6051
5271
6287
6520
1451
2967
10515
2355
4718
3334
4083
7326
5434
219
5864
5225
6035
4336
5074
7022
6554
4557
4633
2755
5178
4547
6843
782
6197
4492
5767
9564
6252
4671
5033
5504
5561
8047
2968
5290
3691
5831
7614
6127
6166
5059
6207
3967
3031
7634
3982
5530
3590
3514
3183
3954
8156
5376
3124
7447
8288
2646
4299
4989
3188
4121
5069
5188
6629
5037
7086
6996
5144
4082
4157
11178
3104
4005
4125
6353
4924

In [39]:
# p-value found at position 2000 (0-based) of the second table's p-values
# once they are sorted in ascending order.
np.sort(np.array(dfs[1]['p_value']))[2000]

0.000367

## Write the GMT file from the data

In [40]:
## writing gmt files
# Each kept experiment becomes one line: term name, empty description field,
# then the unique target genes, all tab-separated.

# Selection thresholds (all counts are numbers of table rows).
P_VALUE_RANK = 2000            # sorted position of the p-value used as cutoff
MIN_ROWS_AFTER_CUTOFF = 100    # below this, the strict cutoff is relaxed
CUTOFF_SLACK = .00001          # amount added to the cutoff when relaxing it
MAX_ROWS = 2999                # experiments with more rows are discarded
MIN_ROWS = 5                   # experiments with fewer rows are discarded

filename = 'TF2DNA_fly_%s.gmt' % datetime.date.today().strftime('%Y_%m')
lengths = []
lengths_kept = []

# The context manager closes the file even if a table is malformed and the
# loop raises part-way through.
with open(filename, 'w') as gmt_file:
    for i, exper in enumerate(dfs):

        if len(exper) > P_VALUE_RANK:
            val = np.sort(np.array(exper['p_value']))[P_VALUE_RANK]
            test = exper[exper['p_value'] < val]

            ## sometimes a lot of targets share the cutoff p-value, so the
            ## strict '<' filter leaves very few rows; relax the cutoff then
            if len(test) < MIN_ROWS_AFTER_CUTOFF:
                exper = exper[exper['p_value'] < val + CUTOFF_SLACK]
            else:
                exper = test

        lengths.append(len(exper))

        ## discard experiments with too many or too few targets
        if len(exper) > MAX_ROWS or len(exper) < MIN_ROWS:
            continue

        lengths_kept.append(len(exper))
        genes = exper['target_name'].unique()
        # Join the fields instead of appending a tab after each gene, so the
        # line does not end with a trailing tab (an empty last field).
        fields = [names[i], ''] + [str(gene) for gene in genes]
        gmt_file.write('\t'.join(fields) + '\n')

In [41]:
# Turn both lists of row counts into NumPy arrays.
lengths, lengths_kept = (
    np.array(counts) for counts in (lengths, lengths_kept)
)

In [42]:
# Largest row count among all experiments after the p-value selection step.
lengths.max()

5058

In [43]:
# Largest row count among the experiments that were written to the GMT file.
lengths_kept.max()

2918

In [44]:
# Number of experiments discarded by the size limits (not written to the file).
len(lengths) - len(lengths_kept)

8