In [1]:
from pybedtools import BedTool, example_filename
import math
import numpy as np
from Bio import  SeqIO
from refgene_parser import RefGene

In [2]:
def entropy(seq):
    """Return the Shannon entropy (base-10 logarithm) of the symbols in ``seq``.

    Symbols are compared case-insensitively, so soft-masked (lower-case)
    bases are counted together with their upper-case form instead of being
    treated as extra letters of the alphabet.

    Parameters
    ----------
    seq : iterable of single-character symbols
        A ``str``, a Biopython ``Seq`` or any iterable of characters.

    Returns
    -------
    float
        ``-sum(p * log10(p))`` over the distinct symbols, where ``p`` is the
        relative frequency of each symbol.  ``0.0`` for an empty sequence or
        a sequence made of a single repeated symbol.
    """
    symbols = [str(base).upper() for base in seq]
    if not symbols:
        return 0.0

    counts = np.unique(symbols, return_counts=True)[1]
    probs = counts / float(len(symbols))

    # Adding 0.0 turns the IEEE negative zero produced by negating an
    # all-zero sum into a plain 0.0 (so "TTTT" prints 0.0, not -0.0).
    return -np.sum(probs * np.log10(probs)) + 0.0

# Sanity checks: one repeated symbol versus four equally frequent symbols.
print(entropy(seq="TTTT"))
print(entropy(seq="ACGT"))

-0.0
0.602059991328


In [3]:
def sequence(chrom, start, end, fasta="/home/sacha/Linux_data/bioinfo/data/hg19.fa"):
    """Return the reference sequence of the interval ``chrom:start-end``.

    The interval is handed to ``BedTool.sequence`` (bedtools getfasta) as a
    single BED line, so ``start``/``end`` are interpreted the way bedtools
    interprets BED coordinates.

    Parameters
    ----------
    chrom : str
        Chromosome name as it appears in the FASTA file (e.g. ``"chr1"``).
    start, end : int
        Interval boundaries, written as the second and third BED columns.
    fasta : str, optional
        Path to the reference FASTA file.  Defaults to the hg19 file that was
        previously hard-coded in this function.

    Returns
    -------
    The ``seq`` attribute of the single FASTA record parsed by
    ``SeqIO.read``.

    Raises
    ------
    ValueError
        Raised by ``SeqIO.read`` when the FASTA output does not contain
        exactly one record.
    """
    import os  # local import: only needed for temporary-file cleanup

    interval = BedTool(" ".join((chrom, str(start), str(end))), from_string=True)
    seq_path = None
    try:
        # Use the object returned by sequence(), as documented by pybedtools,
        # to locate the FASTA output.
        seq_path = interval.sequence(fi=fasta).seqfn
        record = SeqIO.read(seq_path, "fasta")
    finally:
        # This function is called very often; remove the per-call temporary
        # files right away instead of letting them pile up until the
        # interpreter exits.
        for tmp_path in (seq_path, interval.fn):
            if isinstance(tmp_path, str) and os.path.exists(tmp_path):
                os.remove(tmp_path)
    return record.seq

# Entropy of a short 4-position interval fetched from the reference genome.
print(entropy(seq=sequence(chrom="chr1", start=20767600, end=20767604)))

0.244219050288


In [10]:
def compute_gene_entropy(gene, window_size=50, margin_windows=200):
    """Print the sliding-window sequence entropy around a gene.

    The gene is looked up by name in the refGene table, the region
    ``[gene.start - margin, gene.end + margin)`` is scanned one position at a
    time, and for every position ``pos`` one tab-separated line is printed:
    ``chrom, pos, pos + window_size, entropy``.

    Parameters
    ----------
    gene : str
        Gene name passed to ``RefGene.gene_by_name`` (e.g. ``"WNT2"``).
    window_size : int, optional
        Length of the window whose entropy is computed.  Must be >= 1.
    margin_windows : int, optional
        Flanking margin added on each side of the gene, expressed as a number
        of windows (margin = ``window_size * margin_windows``).  The default
        of 200 is the value that was previously hard-coded.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.

    Notes
    -----
    The last windows extend up to ``window_size - 1`` positions past the end
    of the margin, exactly as before.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    refgene = RefGene("/home/sacha/Linux_data/bioinfo/data/ucsc/refGene.txt.gz")
    # Keep the record under its own name instead of rebinding the `gene`
    # parameter.
    record = refgene.gene_by_name(gene)

    # Flanking margin of window_size * margin_windows on each side; the start
    # is clamped so genes close to the chromosome start do not produce
    # negative coordinates.
    margin = window_size * margin_windows
    start = max(0, record.start - margin)
    end = record.end + margin

    # Fetch the whole scanned region once and slide over it in memory rather
    # than extracting each window separately.  The region reaches
    # window_size - 1 positions past `end` so the last window is complete.
    region = sequence(record.chrom, start, end - 1 + window_size)

    for pos in range(start, end):
        offset = pos - start
        e = entropy(region[offset:offset + window_size])
        print(record.chrom, pos, pos + window_size, e, sep="\t")
    

In [None]:
# Sliding-window entropy for WNT2; prints one tab-separated row per position.
compute_gene_entropy(gene="WNT2", window_size=50)

chr7	116906685	116906735	0.571874248849
chr7	116906686	116906736	0.571874248849
chr7	116906687	116906737	0.571874248849
chr7	116906688	116906738	0.564776721376
chr7	116906689	116906739	0.557284244671
chr7	116906690	116906740	0.564776721376
chr7	116906691	116906741	0.571874248849
chr7	116906692	116906742	0.570037111894
chr7	116906693	116906743	0.567111393476
chr7	116906694	116906744	0.562149860715
chr7	116906695	116906745	0.567172857922
chr7	116906696	116906746	0.562149860715
chr7	116906697	116906747	0.556713093259
chr7	116906698	116906748	0.550090658987
chr7	116906699	116906749	0.550090658987
chr7	116906700	116906750	0.555922375677
chr7	116906701	116906751	0.566320675894
chr7	116906702	116906752	0.559223148422
chr7	116906703	116906753	0.559223148422
chr7	116906704	116906754	0.550763577196
chr7	116906705	116906755	0.546322476964
chr7	116906706	116906756	0.555870629654
chr7	116906707	116906757	0.555870629654
chr7	116906708	116906758	0.550763577196
chr7	116906709	116906759	0.540837658059


chr7	116906904	116906954	0.549453658489
chr7	116906905	116906955	0.54036096752
chr7	116906906	116906956	0.529356487229
chr7	116906907	116906957	0.528464648231
chr7	116906908	116906958	0.514862973585
chr7	116906909	116906959	0.523472866597
chr7	116906910	116906960	0.507281565614
chr7	116906911	116906961	0.509464922052
chr7	116906912	116906962	0.523472866597
chr7	116906913	116906963	0.526676241026
chr7	116906914	116906964	0.534774904289
chr7	116906915	116906965	0.526676241026
chr7	116906916	116906966	0.526676241026
chr7	116906917	116906967	0.526676241026
chr7	116906918	116906968	0.523981654838
chr7	116906919	116906969	0.536229330232
chr7	116906920	116906970	0.532991521959
chr7	116906921	116906971	0.523981654838
chr7	116906922	116906972	0.531537096016
chr7	116906923	116906973	0.54378477141
chr7	116906924	116906974	0.554183071627
chr7	116906925	116906975	0.556612160417
chr7	116906926	116906976	0.555367063656
chr7	116906927	116906977	0.560588826609
chr7	116906928	116906978	0.564264006034
ch

chr7	116907129	116907179	0.545727522652
chr7	116907130	116907180	0.541085248705
chr7	116907131	116907181	0.541085248705
chr7	116907132	116907182	0.541085248705
chr7	116907133	116907183	0.54378477141
chr7	116907134	116907184	0.54378477141
chr7	116907135	116907185	0.552937974866
chr7	116907136	116907186	0.54951562682
chr7	116907137	116907187	0.545077380203
chr7	116907138	116907188	0.553536951429
chr7	116907139	116907189	0.553536951429
chr7	116907140	116907190	0.54951562682
chr7	116907141	116907191	0.53996747413
chr7	116907142	116907192	0.54951562682
chr7	116907143	116907193	0.54951562682
chr7	116907144	116907194	0.54951562682
chr7	116907145	116907195	0.552937974866
chr7	116907146	116907196	0.562922504833
chr7	116907147	116907197	0.564917118003
chr7	116907148	116907198	0.558063551501
chr7	116907149	116907199	0.548513497154
chr7	116907150	116907200	0.54662763045
chr7	116907151	116907201	0.54662763045
chr7	116907152	116907202	0.537074541244
chr7	116907153	116907203	0.54581397445
chr7	116907

chr7	116907361	116907411	0.578735936172
chr7	116907362	116907412	0.582384296424
chr7	116907363	116907413	0.578735936172
chr7	116907364	116907414	0.578735936172
chr7	116907365	116907415	0.580129406246
chr7	116907366	116907416	0.580129406246
chr7	116907367	116907417	0.573774403411
chr7	116907368	116907418	0.565344025297
chr7	116907369	116907419	0.565344025297
chr7	116907370	116907420	0.573774403411
chr7	116907371	116907421	0.575408344253
chr7	116907372	116907422	0.575408344253
chr7	116907373	116907423	0.565344025297
chr7	116907374	116907424	0.573774403411
chr7	116907375	116907425	0.573774403411
chr7	116907376	116907426	0.573774403411
chr7	116907377	116907427	0.571036715277
chr7	116907378	116907428	0.571036715277
chr7	116907379	116907429	0.574443209253
chr7	116907380	116907430	0.574443209253
chr7	116907381	116907431	0.574443209253
chr7	116907382	116907432	0.571036715277
chr7	116907383	116907433	0.575998248037
chr7	116907384	116907434	0.578735936172
chr7	116907385	116907435	0.580369877014


chr7	116907592	116907642	0.55782824109
chr7	116907593	116907643	0.553030350573
chr7	116907594	116907644	0.54581397445
chr7	116907595	116907645	0.537074541244
chr7	116907596	116907646	0.528464648231
chr7	116907597	116907647	0.529356487229
chr7	116907598	116907648	0.528464648231
chr7	116907599	116907649	0.517917531318
chr7	116907600	116907650	0.517917531318
chr7	116907601	116907651	0.51661192207
chr7	116907602	116907652	0.504315856672
chr7	116907603	116907653	0.504315856672
chr7	116907604	116907654	0.504315856672
chr7	116907605	116907655	0.502615298191
chr7	116907606	116907656	0.502615298191
chr7	116907607	116907657	0.502615298191
chr7	116907608	116907658	0.504315856672
chr7	116907609	116907659	0.491562427896
chr7	116907610	116907660	0.505164102542
chr7	116907611	116907661	0.505164102542
chr7	116907612	116907662	0.501220476953
chr7	116907613	116907663	0.51120500692
chr7	116907614	116907664	0.492870976854
chr7	116907615	116907665	0.492870976854
chr7	116907616	116907666	0.478731504363
chr7

chr7	116907825	116907875	0.589906668768
chr7	116907826	116907876	0.591107107111
chr7	116907827	116907877	0.591107107111
chr7	116907828	116907878	0.591728056448
chr7	116907829	116907879	0.591728056448
chr7	116907830	116907880	0.591728056448
chr7	116907831	116907881	0.591107107111
chr7	116907832	116907882	0.589817351932
chr7	116907833	116907883	0.587223443516
chr7	116907834	116907884	0.591865717462
chr7	116907835	116907885	0.594919947552
chr7	116907836	116907886	0.597513855968
chr7	116907837	116907887	0.593259187536
chr7	116907838	116907888	0.593259187536
chr7	116907839	116907889	0.594459625879
chr7	116907840	116907890	0.595749381057
chr7	116907841	116907891	0.595749381057
chr7	116907842	116907892	0.598803611147
chr7	116907843	116907893	0.597513855968
chr7	116907844	116907894	0.598134805305
chr7	116907845	116907895	0.598134805305
chr7	116907846	116907896	0.598134805305
chr7	116907847	116907897	0.60031899296
chr7	116907848	116907898	0.59830457355
chr7	116907849	116907899	0.60031899296
chr