# Process and save Enformer genomic intervals

## Set up wandb

In [1]:
import wandb
import pandas as pd

# Authenticate against the hosted W&B API endpoint.
wandb.login(
    host="https://api.wandb.ai",
)
# Name of the W&B project; passed to wandb.init when the run is created.
project_name = "enformer"

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mavantikalal[0m ([33mgrelu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# Settings that give wandb this notebook's relative and absolute path.
run_settings = wandb.Settings(
    program_relpath="data_mouse.ipynb",
    program_abspath="/code/github/gReLU-applications/enformer/data_mouse.ipynb",
)

# Open the preprocessing run under the 'grelu' entity.
run = wandb.init(
    entity="grelu",
    project=project_name,
    job_type="preprocessing",
    name="prep-intervals-mouse",
    settings=run_settings,
)

## Load intervals

In [3]:
# Input file holding the mouse sequence intervals; it is read as a
# tab-separated table without a header row.
sequences_path = "/gstore/data/resbioai/grelu/enformer/sequences-mouse.bed"

In [4]:
# The file has no header row, so the four columns are labelled explicitly.
# set_axis raises if the file does not have exactly four columns.
intervals = (
    pd.read_table(sequences_path, header=None)
    .set_axis(["chrom", "start", "end", "split"], axis=1)
)
intervals.head(3)

Unnamed: 0,chrom,start,end,split
0,chr4,34106647,34237719,train
1,chr5,52207747,52338819,train
2,chr19,20136862,20267934,train


## Resize intervals

In [5]:
from grelu.sequence.utils import resize

# Bring every interval to one common length. For the rows previewed here
# this widens each 131072 bp interval to 196608 bp, adding 32768 bp on
# each side.
target_length = 196_608
intervals = resize(intervals, target_length)
intervals.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,chrom,start,end,split
0,chr4,34073879,34270487,train
1,chr5,52174979,52371587,train
2,chr19,20104094,20300702,train
3,chr14,61812671,62009279,train
4,chr15,6559578,6756186,train


## Save

In [6]:
# Write the resized intervals to a tab-separated file in the working
# directory, without the DataFrame index.
intervals.to_csv(
    "mouse_intervals.tsv",
    sep="\t",
    index=False,
)

In [7]:
# Wrap the saved TSV in a 'dataset' artifact. The file is added under the
# name "data.tsv" rather than its local file name.
artifact = wandb.Artifact(name="mouse_intervals", type="dataset")
artifact.add_file(
    name="data.tsv",
    local_path="mouse_intervals.tsv",
)
# Kept as the final expression so the logged artifact is displayed.
run.log_artifact(artifact)

<Artifact mouse_intervals>

In [8]:
# Close the W&B run now that the artifact has been logged.
run.finish()