# Process and save Enformer genomic intervals

## Set up wandb

In [1]:
import wandb
import pandas as pd

# Authenticate this session against the public Weights & Biases API host.
wandb.login(host="https://api.wandb.ai")
# W&B project name; passed as `project=` when the run is created below.
project_name='enformer'

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mavantikalal[0m ([33mgrelu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# Tell W&B which notebook produced this run, so the run is attributed to
# this file rather than to an anonymous kernel session.
notebook_settings = wandb.Settings(
    program_relpath='data_human.ipynb',
    program_abspath='/code/github/gReLU-applications/enformer/data_human.ipynb',
)

# Open a preprocessing run in the `grelu` entity; the handle is reused later
# to log the output artifact and to close the run.
run = wandb.init(
    entity='grelu',
    project=project_name,
    job_type='preprocessing',
    name='prep-intervals-human',
    settings=notebook_settings,
)

## Load intervals

In [3]:
# Input BED file of genomic intervals. It is read in the next cell as a
# header-less, tab-delimited table with columns chrom, start, end, split.
sequences_path = '/gstore/data/resbioai/grelu/enformer/sequences.bed'

In [4]:
# The file is read without a header row, so the four columns are named here.
interval_columns = ['chrom', 'start', 'end', 'split']
intervals = (
    pd.read_table(sequences_path, header=None)
    .set_axis(interval_columns, axis=1)
)
# Preview the first few rows.
intervals.head(3)

Unnamed: 0,chrom,start,end,split
0,chr18,928386,1059458,train
1,chr4,113630947,113762019,train
2,chr11,18427720,18558792,train


## Resize intervals

In [5]:
from grelu.sequence.utils import resize

# Target width (bp) of every interval after resizing. The rows displayed in
# this notebook go from 131,072 bp wide before this cell to 196,608 bp after.
# NOTE(review): presumably this is the Enformer model's input window — confirm.
SEQ_LEN = 196608

intervals = resize(intervals, SEQ_LEN)

# Validate the transform before the table is saved and logged as an artifact:
# every interval must now span exactly SEQ_LEN bases.
resized_widths = intervals['end'] - intervals['start']
assert (resized_widths == SEQ_LEN).all(), (
    f"resize() produced intervals whose width is not {SEQ_LEN} bp"
)

intervals.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,chrom,start,end,split
0,chr18,895618,1092226,train
1,chr4,113598179,113794787,train
2,chr11,18394952,18591560,train
3,chr16,85772913,85969521,train
4,chr3,158353420,158550028,train


## Save

In [6]:
# Write the resized intervals as a tab-separated file in the working
# directory, without the pandas row index.
intervals.to_csv(
    "human_intervals.tsv",
    sep="\t",
    index=False,
)

In [7]:
# Wrap the saved TSV in a W&B artifact of type 'dataset'; inside the artifact
# the file is stored under the name "data.tsv".
artifact = wandb.Artifact(name='human_intervals', type='dataset')
artifact.add_file("human_intervals.tsv", name="data.tsv")

# Attach the artifact to the active run. This is the cell's last expression,
# so its return value is displayed.
run.log_artifact(artifact)

<Artifact human_intervals>

In [8]:
# Close the W&B run now that the intervals artifact has been logged.
run.finish()