In [None]:
!pip install pytorch-lightning==1.9.2 transformers torchmetrics deepspeed nltk wandb datasets


In [None]:
# Clone the project sources into ./cqa_v2 (relative to the current working directory).
# The train/test launch cells reference this checkout by absolute path.
!git clone https://github.com/Myashka/CQA_RLHF.git cqa_v2

In [None]:
# Mount Google Drive, but only when the notebook is running inside Google Colab.
in_colab = 'google.colab' in str(get_ipython())
if in_colab:
    # Optional TPU runtime setup, currently disabled:
    # !pip install cloud-tpu-client==0.10 torch==1.13.1 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.13-cp38-cp38-linux_x86_64.whl tensorboard-plugin-profile
    # !pip install google-api-python-client==1.12.1
    from google.colab import drive

    drive.mount('/content/drive')

In [None]:
import yaml

In [None]:
import wandb
# Start a W&B run and use it to download the model checkpoint artifact.
MODEL_ARTIFACT = 'myashka/CQA_RLHF/model-34980p14:v4'

run = wandb.init()
artifact = run.use_artifact(MODEL_ARTIFACT, type='model')
artifact_dir = artifact.download()  # value returned by the download call

## Run train

In [None]:
import os
from getpass import getpass

# The W&B API key is a secret: take it from the WANDB_API_KEY environment
# variable (or prompt for it) instead of committing it with the notebook.
# NOTE(review): a key that was ever hard-coded in this notebook should be
# treated as leaked and rotated. The value is still written in plain text to
# trainer_config.yaml below, so keep that file out of version control.
wandb_api_key = os.environ.get("WANDB_API_KEY") or getpass("W&B API key: ")

# Configuration for the SFT training run; it is dumped to trainer_config.yaml
# and handed to the training script through --config_file.
# Key spellings such as `truncate_promt` and `log_obg` are intentionally left
# untouched — presumably they must match what the training script reads
# (verify against the script before renaming).
trainer_config = dict(
    seed=42,
    model_name="EleutherAI/gpt-neo-125m",
    # Weights & Biases logging settings.
    wandb=dict(
        api=wandb_api_key,
        project_name="CQA_RLHF",
        args=dict(job_type="train", group="sft", name="125M-lr_2e_5-1_ws-api_usage-freezed_3"),
    ),
    # Dataset location and tokenization/batching settings.
    data=dict(
        data_dir=r"/kaggle/input/python-qa-api-usage/1.0-data-div-ans-sep-api-usage.json",
        max_length=512,
        batch_size=8,
        truncate_promt=True,
        on_tpu=True,
    ),
    # Optimizer settings and layer-freezing switches.
    model_params=dict(
        learning_rate=2e-5,
        use_cache=False,
        warmup_steps_per_cent=0.01,
        adam_betas=[0.9, 0.95],
        weight_decay=0.001,
        do_compute_metrics=True,
        freeze_emb=True,
        freeze_ln=False,
        freeze_attn=False,
        freeze_ff=True,
        freeze_other=True,
        layers_not_to_freeze=[0, 11],
    ),
    trainer=dict(
        # Checkpointing settings.
        checkpoint=dict(
            every_n_train_steps=1000,
            dirpath=r"/kaggle/working/Checkpoints",
            log_obg='val_loss',
            mode='min',
        ),
        ckpt_path=None,
        # Trainer keyword arguments (presumably forwarded to the Lightning Trainer — TODO confirm).
        params=dict(
            accelerator="tpu",
            max_epochs=50,
            accumulate_grad_batches=1,
            gradient_clip_val=1,
            precision="16",
            val_check_interval=104,
            overfit_batches=0,  # 0 for train
            num_sanity_val_steps=2,
            log_every_n_steps=20,
#             limit_train_batches=100,
#             strategy="deepspeed_stage_2_offload"
        ),
    ),
)

# Persist the configuration next to the notebook (current working directory).
with open("trainer_config.yaml", "w") as outfile:
    yaml.dump(trainer_config, outfile, default_flow_style=False)

In [None]:
# Launch SFT training with the YAML config written by the previous cell.
# NOTE(review): the absolute paths assume /kaggle/working is the directory where
# the repo was cloned and the config was written — confirm when running elsewhere.
!python /kaggle/working/cqa_v2/sft/train_sft.py --config_file /kaggle/working/trainer_config.yaml

## Run test

In [None]:
import os
from getpass import getpass

# The W&B API key is a secret: take it from the WANDB_API_KEY environment
# variable (or prompt for it) instead of committing it with the notebook.
# NOTE(review): a key that was ever hard-coded in this notebook should be
# treated as leaked and rotated. The value is still written in plain text to
# test_config.yaml below, so keep that file out of version control.
wandb_api_key = os.environ.get("WANDB_API_KEY") or getpass("W&B API key: ")

# Configuration for the SFT test run; it is dumped to test_config.yaml and
# handed to the test script through --config_file.
# Key spellings such as `truncate_promt` are intentionally left untouched —
# presumably they must match what the test script reads (verify before renaming).
test_config = dict(
    seed=42,
    cuda=True,
    model_name="EleutherAI/gpt-neo-125m",
    # Checkpoint to evaluate and where to log the results.
    test_params=dict(
        save_steps=100,
        do_compute_metrics=True,
        test_model_path=r'/content/artifacts/model-34980p14:v4/model.ckpt',
        log_file='/content/test-tuned-api_usage-answer_loss-bredogenerated-125M.csv',
        use_cache=True,
    ),
    # Weights & Biases logging settings.
    wandb=dict(
        api=wandb_api_key,
        args=dict(group="sft", job_type="test",
                  name="125M-tuned-test-api_usage-answer_loss-bredogen"),
        project_name="CQA_RLHF",
    ),
    # Dataset location and tokenization settings.
    data=dict(
        data_dir=r"/content/drive/MyDrive/Colab Notebooks/vkr_data/data/1.0-data-div-ans-sep-api-usage.json",
        max_length=512,
        truncate_promt=True,
        split='test',
        padding='max_length',
        padding_side='right'
    ),
    # Sampling settings (presumably forwarded to `model.generate` — TODO confirm).
    generate_params=dict(
        do_sample=True,
        top_k=50,
        top_p=0.9,
        # temperature=0,
        # num_return_sequences=0,
        no_repeat_ngram_size=2,
        max_new_tokens=512,
        # min_new_tokens=64,
    ),
)

# Persist the configuration next to the notebook (current working directory).
with open("test_config.yaml", "w") as outfile:
    yaml.dump(test_config, outfile, default_flow_style=False)


In [None]:
# Launch the SFT test script with the YAML config written by the previous cell.
# NOTE(review): the absolute paths assume /content is the directory where the
# repo was cloned and the config was written — confirm when running elsewhere.
!python /content/cqa_v2/sft/test_sft.py --config_file /content/test_config.yaml

In [None]:
import torch
import pytorch_lightning as pl
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)

In [None]:
# Load the checkpoint file downloaded from W&B.
# map_location='cpu' lets the file be read on a machine that lacks the device
# the tensors were saved from; the model is moved to the target device later.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
state_dict = torch.load(
    r'/content/artifacts/model-34980p14:v4/model.ckpt',
    map_location='cpu',
)

In [None]:
from transformers import GPTNeoForSequenceClassification

In [None]:
# Load the pretrained base model (with the KV cache disabled) and its tokenizer.
BASE_MODEL_NAME = 'EleutherAI/gpt-neo-125M'

model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME, use_cache=False)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

In [None]:
!pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; presumably needed for the `push_to_hub` call in the next cell.
notebook_login()

In [None]:
# Upload the current `model` to the Hugging Face Hub.
# NOTE(review): in top-to-bottom order `model` still holds the pretrained base
# weights here — the fine-tuned checkpoint is only applied further down via
# `model.load_state_dict(...)`. Run this cell after that one if the tuned
# weights are what should be published.
model.push_to_hub("Myashka/125M_GPTneo_sft_tuned")

In [None]:
# Save the current `model` locally with `save_pretrained`.
# NOTE(review): `save_pretrained` expects a directory path, so this presumably
# creates a directory literally named "reward_base.json" — confirm that is intended.
model.save_pretrained(r'/content/reward_base.json')

In [None]:
# Use the end-of-sequence token as the padding token on both tokenizer and model.
eos_id = tokenizer.eos_token_id

model.resize_token_embeddings(len(tokenizer))
tokenizer.pad_token = tokenizer.eos_token

# NOTE(review): `config.end_token_id` and the model-level `pad_token_id` are
# plain attribute assignments; nothing in this notebook reads them back —
# confirm whether `config.eos_token_id` was the intended target.
model.config.end_token_id = eos_id
model.config.pad_token_id = model.config.eos_token_id
model.pad_token_id = eos_id

In [None]:
# Re-key the checkpoint weights so they match the bare Hugging Face model:
# drop the leading "model." from every key (presumably added by the training
# wrapper module — TODO confirm).
# Only the *leading* prefix is removed; a plain `str.replace` would also delete
# any later "model." occurrence inside a key and silently corrupt the name.
CKPT_KEY_PREFIX = "model."
new_state_dict = {
    (key[len(CKPT_KEY_PREFIX):] if key.startswith(CKPT_KEY_PREFIX) else key): weights
    for key, weights in state_dict['state_dict'].items()
}

In [None]:
# Copy the re-keyed checkpoint weights into the model.
model.load_state_dict(new_state_dict)

In [None]:
# Prefer the GPU, but fall back to the CPU so the following cells still run on
# a machine without CUDA instead of failing when tensors are moved.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Move the model to the selected device.
model = model.to(device)

In [None]:
# Sample prompt used by the inspection and generation cells below: a "Question" header line followed by the question text.
question = """Question\nI have clustered the pixels of an image into clusters of different sizes and shapes. I want to max pool each cluster as fast as possible because the max pooling happens in one layer of my CNN. To clarify: Input is a batch of images with the following shape [batch_size, height of image, width of image, number of channels]. I have clustered each image before I start training my CNN. So for each image I have a ndarray of labels with shape [height of image, width of image]. How can I max pool over all pixels of an image that have the same label for all labels? I understand how to do it with a of for loop but that is painstakingly slow. I am searching for a fast solution that ideally can max pool over every cluster of each image in less than a second. For implementation, I use Python3.7 and PyTorch."""

In [None]:
# Tokenize the sample question into PyTorch tensors, truncated/padded to
# exactly 512 tokens, then move the encoding to the target device.
inputs = tokenizer(
    question,
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
inputs = inputs.to(device)

In [None]:
import torch.nn.functional as F

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Next-token probability distribution right after the prompt.
model.eval()
with torch.no_grad():  # inference only: no autograd graph needed
    prompt_logits = model(**inputs).logits[0]  # logits of the single prompt in the batch

# The prompt is padded to a fixed length, so the last sequence position may be
# a padding token. Read the distribution at the last position the attention
# mask marks as a real token instead of blindly taking index -1.
last_token_idx = inputs["attention_mask"][0].nonzero()[-1].item()
output_1 = F.softmax(prompt_logits[last_token_idx], dim=-1)

In [None]:
# Pick the ten highest-probability entries of `output_1` and decode their ids to token strings.
top_k = torch.topk(output_1.to('cpu'), k=10)
top_k_probs = top_k.values.detach().numpy()
top_k_tokens = tokenizer.convert_ids_to_tokens(top_k.indices.numpy())

# Visualize the top-10 tokens and their probabilities as a bar chart.
ax = plt.gca()
ax.bar(top_k_tokens, top_k_probs)
ax.set_xlabel("Tokens")
ax.set_ylabel("Probability")
ax.set_title("Top-10 Tokens Probability")
plt.show()

In [None]:
def generate(text: str, device, **kwargs):
    """Generate a continuation for ``text`` with the notebook-level ``model``.

    The prompt is ``text`` followed by a newline and ``"Answer: "``. It is
    tokenized with the notebook-level ``tokenizer`` (truncated to 512 tokens),
    moved to ``device`` and passed to ``model.generate``.

    Args:
        text: Prompt text (e.g. the question) placed before the answer marker.
        device: Device the input tensors are moved to; should be the device
            the model lives on.
        **kwargs: Extra keyword arguments forwarded to ``model.generate``
            (sampling settings, length limits, ...).

    Returns:
        The decoded first generated sequence with special tokens stripped.
        Presumably this includes the prompt, since the full output sequence is
        decoded — confirm against the model's ``generate`` behaviour.
    """
    model.eval()
    # A real newline ("\n") separates the question from the answer marker; the
    # previous "/n" was a literal slash + "n".
    # NOTE(review): confirm the separator matches the template used in training.
    prompt = text + '\nAnswer: '
    # No padding for a single prompt: padding to max_length would put pad
    # tokens between the prompt and the text being generated.
    inputs = tokenizer(prompt, return_tensors="pt",
                       truncation=True, max_length=512)
    inputs = inputs.to(device)
    # Pass the attention mask explicitly unless the caller supplied one.
    kwargs.setdefault("attention_mask", inputs.attention_mask)
    with torch.no_grad():  # inference only: no autograd graph needed
        generated_tokens = model.generate(inputs.input_ids, **kwargs)
    generated_q_a = tokenizer.decode(
        generated_tokens[0], skip_special_tokens=True
    )
    return generated_q_a

In [None]:
# Sample an answer for the example question; the returned text is the cell output.
generate(
    question,
    device,
    do_sample=True,
    top_k=50,
    top_p=0.9,
    # temperature=0.9,
    no_repeat_ngram_size=2,
    max_new_tokens=512,
)

In [None]:
# Sample an answer for a free-form prompt, this time capped by total sequence length.
riddle_prompt = """If I'm a woman you are my daughter, whom I could be?"""
generate(
    riddle_prompt,
    device,
    do_sample=True,
    top_k=50,
    top_p=0.9,
    no_repeat_ngram_size=2,
    max_length=1024,
)

In [None]:
import sys

# Make the project's `sft` package directory importable.
# The repository is cloned as `cqa_v2` at the top of this notebook (and the test
# script is launched from /content/cqa_v2/sft), so point at that checkout.
# NOTE(review): confirm /content is where the clone lives in this runtime.
SFT_SOURCE_DIR = r'/content/cqa_v2/sft'
if SFT_SOURCE_DIR not in sys.path:  # keep the cell idempotent on re-runs
    sys.path.append(SFT_SOURCE_DIR)

In [None]:
from data_module import QADataModule

In [None]:
# Build the project's QA data module for the base model's tokenizer.
# The positional arguments are kept positional; their meaning is defined by
# QADataModule's signature, which is not visible in this notebook.
QA_DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/vkr_data/data/1.0-data-div-ans-sep.json"
dm = QADataModule(
    'EleutherAI/gpt-neo-125M',
    QA_DATA_PATH,
    512,
    8,
    True,
    zero_question_labels=False,
)

In [None]:
# Run the data module's setup for the 'fit' stage before requesting dataloaders from it.
dm.setup('fit')

In [None]:
# Peek at the attention mask of the first batch from a fresh training dataloader.
first_batch = next(iter(dm.train_dataloader()))
first_batch['attention_mask']

In [None]:
# Peek at the labels of the first batch from a fresh training dataloader.
# NOTE(review): a new dataloader is built here, so if it shuffles this may not
# be the same batch as in the attention-mask cell — confirm if that matters.
first_batch = next(iter(dm.train_dataloader()))
first_batch['labels']