# Process and save Borzoi genomic intervals

## Set up wandb

In [1]:
import wandb
import pandas as pd

# Log in to the W&B server at this host before any run is created.
wandb_host = "https://api.wandb.ai"
wandb.login(host=wandb_host)

# W&B project name, passed to wandb.init when the run is started.
project_name = "borzoi"

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mavantikalal[0m ([33mgrelu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# Notebook location metadata (relative and absolute path) handed to W&B via wandb.Settings.
run_settings = wandb.Settings(
    program_relpath="data_human.ipynb",
    program_abspath="/code/github/gReLU-applications/borzoi/data_human.ipynb",
)

# Start the preprocessing run; `run` is reused later to log the artifact and to finish the run.
run = wandb.init(
    entity="grelu",
    project=project_name,
    job_type="preprocessing",
    name="prep-intervals-human",
    settings=run_settings,
)

## Load intervals

In [3]:
# Location of the hg38 BED file that is read into `intervals` in the next cell.
intervals_path = "/gstore/data/resbioai/grelu/borzoi-data/hg38/sequences.bed"

In [4]:
# The file is read with header=None, so the four columns are named explicitly here.
interval_columns = ['chrom', 'start', 'end', 'fold']
intervals = pd.read_table(intervals_path, header=None).set_axis(interval_columns, axis=1)

# Preview the first three rows.
intervals.head(3)

Unnamed: 0,chrom,start,end,fold
0,chr4,82524421,82721029,fold0
1,chr13,18604798,18801406,fold0
2,chr2,189923408,190120016,fold0


In [5]:
# fold3 is held out as the test set and fold4 as the validation set;
# every other fold falls through to 'train'.
held_out_folds = {'fold3': 'test', 'fold4': 'val'}
intervals['split'] = intervals['fold'].map(held_out_folds).fillna('train')

# Show how many intervals landed in each split.
intervals['split'].value_counts()

split
train    41699
val       6910
test      6888
Name: count, dtype: int64

## Resize intervals

In [6]:
from grelu.sequence.utils import resize

# Target length handed to resize (524288 == 2**19); in the displayed output
# every row ends up with end - start equal to this value.
seq_len = 524288
intervals = resize(intervals, seq_len)

# Preview the resized intervals.
intervals.head(5)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,chrom,start,end,fold,split
0,chr4,82360581,82884869,fold0,train
1,chr13,18440958,18965246,fold0,train
2,chr2,189759568,190283856,fold0,train
3,chr10,59711903,60236191,fold0,train
4,chr1,116945627,117469915,fold0,train


## Save

In [7]:
# Write the intervals to a tab-separated file in the working directory, without the row index.
intervals.to_csv("human_intervals.tsv", sep="\t", index=False)

In [8]:
# Wrap the saved TSV in a W&B artifact of type 'dataset'; inside the artifact
# the file is stored under the name data.tsv.
artifact = wandb.Artifact(name='human_intervals', type='dataset')
artifact.add_file("human_intervals.tsv", name="data.tsv")

# Log the artifact on the current run (kept as the last expression so the cell displays it).
run.log_artifact(artifact)

<Artifact human_intervals>

In [9]:
# Close the W&B run started by wandb.init earlier in this notebook.
run.finish()