In [1]:
import wandb
import anndata
import pandas as pd
import numpy as np

In [2]:
# Name of the W&B project that the preprocessing run below is created in.
project_name = "human-chromhmm-fullstack"

In [3]:
# Authenticate against the W&B API endpoint; the call's return value is the
# cell output.
wandb.login(
    host='https://api.wandb.ai',
)

[34m[1mwandb[0m: Currently logged in as: [33mavantikalal[0m ([33mgrelu[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Open the W&B run that will own the dataset artifact produced by this notebook.
run = wandb.init(
    project=project_name,
    entity='grelu',
    name='prep',
    job_type='preprocessing',
)

## Load data

In [5]:
# Download the hg38 full-stack ChromHMM segmentation as a headerless table
# and label its four columns: interval coordinates plus the state label.
chromhmm = pd.read_table(
    'https://public.hoffman2.idre.ucla.edu/ernst/2K9RS//full_stack/'
    'full_stack_annotation_public_release/hg38/hg38_genome_100_segments.bed.gz',
    header=None,
).set_axis(['chrom', 'start', 'end', 'state'], axis='columns')
chromhmm.head()

Unnamed: 0,chrom,start,end,state
0,chr1,10000,10400,2_GapArtf2
1,chr1,10400,10600,27_Acet1
2,chr1,10600,10800,38_EnhWk4
3,chr1,10800,12800,1_GapArtf1
4,chr1,12800,13000,38_EnhWk4


## Process data

In [6]:
from grelu.data.preprocess import filter_blacklist, filter_chromosomes
from grelu.sequence.utils import resize

# Evaluated inside-out, in this order:
#   1. filter_chromosomes(..., include='autosomes') - keep autosomal intervals
#   2. resize(..., 1024)  - presumably sets every interval to 1024 bp; confirm
#                           against the grelu docs
#   3. filter_blacklist(..., 'hg38') - presumably drops intervals overlapping
#                           the hg38 blacklist; confirm against the grelu docs
chromhmm = filter_blacklist(
    resize(
        filter_chromosomes(chromhmm, include='autosomes'),
        1024,
    ),
    'hg38',
)

  from .autonotebook import tqdm as notebook_tqdm


Keeping 5845850 intervals
Keeping 5809104 intervals


In [7]:
# Number of intervals per fine-grained ChromHMM state after filtering.
# Left as the cell's last expression (no print) so it is shown as the cell
# result, consistent with the other value_counts cells in this notebook.
chromhmm.state.value_counts()

state
6_Quies3      640067
4_Quies1      317364
29_Acet3      284771
5_Quies2      282066
21_ReprPC4    246809
               ...  
8_Quies5        6722
84_TxEx4        6660
25_ReprPC8      2007
2_GapArtf2      1301
3_GapArtf3       318
Name: count, Length: 100, dtype: int64


In [8]:
# Interval counts for every state whose label contains '_Enh'.
chromhmm.loc[chromhmm['state'].str.contains('_Enh'), 'state'].value_counts()

state
38_EnhWk4    140009
35_EnhWk1    112640
42_EnhWk8     85187
37_EnhWk3     66166
47_EnhA5      63553
39_EnhWk5     54250
57_EnhA15     53214
55_EnhA13     47641
59_EnhA17     43689
58_EnhA16     40374
40_EnhWk6     35144
53_EnhA11     34875
60_EnhA18     34028
52_EnhA10     33755
48_EnhA6      32004
49_EnhA7      30153
44_EnhA2      27637
36_EnhWk2     27069
54_EnhA12     26953
46_EnhA4      25419
56_EnhA14     24511
41_EnhWk7     22648
61_EnhA19     21794
62_EnhA20     19188
45_EnhA3      17139
50_EnhA8      13773
43_EnhA1      13072
51_EnhA9      11022
Name: count, dtype: int64

In [9]:
# Interval counts restricted to the states whose label contains '_EnhA'.
chromhmm.loc[chromhmm['state'].str.contains('_EnhA'), 'state'].value_counts()

state
47_EnhA5     63553
57_EnhA15    53214
55_EnhA13    47641
59_EnhA17    43689
58_EnhA16    40374
53_EnhA11    34875
60_EnhA18    34028
52_EnhA10    33755
48_EnhA6     32004
49_EnhA7     30153
44_EnhA2     27637
54_EnhA12    26953
46_EnhA4     25419
56_EnhA14    24511
61_EnhA19    21794
62_EnhA20    19188
45_EnhA3     17139
50_EnhA8     13773
43_EnhA1     13072
51_EnhA9     11022
Name: count, dtype: int64

## Get coarse-grained state labels

In [10]:
# Collapse the fine-grained state labels (e.g. '47_EnhA5', '57_EnhA15') into
# coarse-grained classes (e.g. 'EnhA') by stripping:
#   * the leading numeric state id and underscore ('47_'), and
#   * the entire trailing index, however many digits it has ('5', '15').
# Removing the whole digit run, rather than a single character, means the
# two-digit indices need no special-casing. Labels that are already coarse do
# not match the pattern, so re-running this cell leaves them unchanged.
chromhmm['state'] = (
    chromhmm['state']
    .str.replace(r'^\d+_|\d+$', '', regex=True)
    .astype('category')
)

# Intervals per coarse-grained class.
chromhmm.state.value_counts()

state
Quies      1485576
Acet        639669
EnhA        613794
ReprPC      610147
Tx          561526
EnhWk       543113
HET         521161
TxWk        254518
TxEnh       190465
TxEx        121833
PromF        88429
GapArtf      51474
BivProm      48242
znf          34146
TSS          24402
DNase        20609
Name: count, dtype: int64

## Save dataset

In [11]:
# Write the processed intervals to disk without the row index; this file is
# what gets attached to the W&B artifact.
chromhmm.to_csv(
    'chromhmm.csv.gz',
    index=False,
)

In [12]:
# Wrap the saved CSV in a W&B artifact named 'dataset' and log it to the run.
artifact = wandb.Artifact(name='dataset', type='dataset')
artifact.add_file(local_path='chromhmm.csv.gz')
run.log_artifact(artifact)

# Close the run now that the artifact has been logged.
run.finish()