In [1]:
import sys
sys.path.insert(0,'../../../')
from lib.data_processing import GenNLPMaskedDataset
from transformers import ElectraForMaskedLM, ElectraTokenizer, ElectraConfig, Trainer, TrainingArguments, EvalPrediction
import torch
import numpy as np
from sklearn.metrics import r2_score
from torch.nn import functional as F
from lib.utils import general as g
from lib.config.config_class import page_config
import json
import os

# Config

In [2]:
# Parse the notebook settings from the local JSON file and fail fast if nothing was loaded.
config = None
with g.reading('./config.json') as config_file:
    raw_config = config_file.read()
    config = json.loads(raw_config)
assert config is not None, "config can't none"

In [3]:
def r2_score_transformers(eval_prediction: EvalPrediction)->dict:
    """Compute R^2 between the reference token ids and the arg-max predicted ids.

    Args:
        eval_prediction: Object exposing ``predictions`` (model logits; the
            arg-max is taken over the last axis) and ``label_ids``.

    Returns:
        dict with two entries:
        ``'r2 score VS'`` - ``r2_score`` evaluated on the transposed matrices;
        ``'r2 score SV'`` - ``r2_score`` evaluated on the matrices as given.
        NOTE(review): presumably S = samples and V = variants - confirm.
    """
    label_ids = eval_prediction.label_ids
    logits = eval_prediction.predictions
    # ``predictions`` can be a tuple when the model returns extra outputs;
    # the logits are assumed to be its first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    # Softmax is monotonic, so the arg-max over raw logits picks the same token
    # without materialising a probability tensor over the whole vocabulary.
    # Staying in numpy also hands sklearn plain arrays instead of torch tensors.
    top_word = np.argmax(np.asarray(logits), axis=-1)
    # NOTE(review): if label_ids contains ignore markers (e.g. -100) they are
    # scored as ordinary values here - confirm against the dataset.
    return {
        'r2 score VS':r2_score(label_ids.T,top_word.T),
        'r2 score SV':r2_score(label_ids,top_word)
    }

In [4]:
# Regions 0..10 are processed; regions 11 and 12 are currently disabled.
regions = list(range(11))
batchs = [0]
region_paths = page_config.get_file_paths(
    config[page_config.file_page_prefix],
    page_config.page,
    regions,
    batchs,
)

In [5]:
# Build the trainer arguments, model config and tokenizer from their config sections.
train_kwargs = config[page_config.train_args]
training_args = TrainingArguments(**train_kwargs)
model_kwargs = config[page_config.model_args]
modeling_args = ElectraConfig(**model_kwargs)
vocab_path = config[page_config.vocab_file]
tokenizer = ElectraTokenizer(vocab_file=vocab_path)
# The same seed is reused when the datasets are built.
seed = training_args.seed

In [6]:
# Evaluate the saved per-region ELECTRA model on that region's test data.
# output_dir / logging_dir are kept as format templates because training_args
# is overwritten with the formatted value inside the loop.
# NOTE(review): re-running this cell without re-creating training_args reads
# the already formatted paths back as "templates".
output_dir = training_args.output_dir
logging_dir = training_args.logging_dir
for i, region in enumerate(regions):
    # region_paths is indexed by position in `regions`, not by the region id.
    batch_paths = region_paths[i]
    test_dataset = GenNLPMaskedDataset(batch_paths[:],tokenizer,seed=seed,masked_by_flag=True,only_input=True)
    # NOTE(review): modeling_args is updated here but is not passed to the model
    # below, which is loaded from its saved directory instead.
    modeling_args.vocab_size = tokenizer.vocab_size
    modeling_args.max_position_embeddings = test_dataset.max_position_embeddings()
    electra_model = ElectraForMaskedLM.from_pretrained(config[page_config.save_dir].format(region))
    training_args.output_dir = output_dir.format(region)
    training_args.logging_dir = logging_dir.format(region)
    trainer = Trainer(
        model = electra_model,
        args=training_args,
        # No train/eval dataset is attached; the test set is passed to evaluate() directly.
        # train_dataset = train_dataset,
        # eval_dataset = eval_dataset,
        compute_metrics = r2_score_transformers,
    )
    eval_test = trainer.evaluate(test_dataset)
    # NOTE(review): unconditional break - only the first region is evaluated and
    # eval_test holds that single result.
    break


preprocess data from document:   0%|          | 0/1 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 200
  Batch size = 16


In [12]:
import pandas as pd
from lib.data_processing import process_input as pi

In [43]:
# Variant table of one test shard; the path is absolute and machine-specific.
variant_path = '/client/user1/cuongdev/GenImputation/data/test/electra_G1K_22_hs37d5/corpus_dir/G1K_22_hs37d5_biallelic_test.r0001.b0000.variant.gz'
temp = pd.read_csv(variant_path)

In [14]:
# Parse the manifest into `marker`; a later cell tests membership in marker['22'].
# The paths are absolute and machine-specific.
manifest_file = '/client/user1/cuongdev/GenImputation/data/raw/infiniumomni2-5-8v1-5-a1.csv.gz'
chroms=['22']
hg_refgenome='/client/user1/cuongdev/GenImputation/data/raw/hg19.fa.gz'
marker = pi.parse_manifest(manifest_file,chroms,hg_refgenome)

create marker from manifest: 0it [00:00, ?it/s]

Create marker done!


In [44]:
# One boolean per POS value: is its string form contained in marker['22']?
m = [str(position) in marker['22'] for position in temp['POS'].values]

In [46]:
# Spot-check a single entry (index 38) of the POS column.
temp['POS'].values[38]

16223201

In [42]:
# Select the POS values that are greater than int(pos[0]).
# NOTE(review): `pos` is not defined in any visible cell; this relies on kernel
# state left over from a cell that is not shown and fails on a fresh kernel.
temp['POS'].values[np.where(temp['POS'].values > int(pos[0]))[0]]

array([16114253, 16114258, 16114297, ..., 16220704, 16220705, 16220993])

In [52]:
# Space-separated table without a header row; the path is absolute and machine-specific.
paper_path = '/client/user1/cuongdev/GenImputation/rnnimp/results/chr22.gen'
paper = pd.read_csv(paper_path, sep=' ', header=None)

In [55]:
# Last value of column 2 of `paper`; the displayed value is the cutoff used in
# the POS comparison further down.
paper[2].values[-1]

51224208

In [56]:
from lib.genhelper import vcf_helper as vhelper

In [57]:
# Presumably converts the test VCF to zarr (going by the helper's name - confirm);
# the returned value is used as the path of the group opened below.
zarr_path = vhelper.vcf_to_zarr('../G1K_22_hs37d5_biallelic_test.vcf.gz',in_zarr_folder=False)

In [58]:
import zarr

In [59]:
# Open read-only: the notebook only reads from the callset, and mode='r' raises
# on a wrong path instead of silently creating an empty group there.
callset = zarr.open_group(zarr_path, mode='r')

In [65]:
# Shape of the index array of callset positions below 51224208
# (the last value of column 2 in `paper`).
callset_positions = callset.variants.POS[:]
below_cutoff = callset_positions < 51224208
np.nonzero(below_cutoff)[0].shape

(1109935,)

In [None]:
# Count displayed by the previous cell, kept for reference (this cell was never executed).
1109935

In [66]:
# (rows, columns) of the table read from chr22.gen.
paper.shape

(40249, 305)

In [68]:
# Ratio of the number of callset positions below the cutoff (1109935) to the
# number of rows in `paper` (40249); both values are copied from earlier outputs.
1109935/40249

27.576709980372183