<a href="https://colab.research.google.com/github/Kuper994/TML-project/blob/ron_branch/TML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Env Setup

In [1]:
# !pip install -U torchtext==0.18.0


In [2]:
from google.colab import drive
import sys
import os

drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Trutworthy-ML/Project')
sys.path.append(os.path.abspath('/content/drive/MyDrive/Trutworthy-ML/Project'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import random
import pickle


device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## Model llama

In [4]:
from transformers.pipelines.text_generation import ReturnType, Chat
from typing import Union, Sequence, Any


class OurPipeline(transformers.pipelines.TextGenerationPipeline):
  def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

  def _forward(self, model_inputs, **generate_kwargs):
        input_ids = model_inputs["input_ids"]
        attention_mask = model_inputs.get("attention_mask", None)
        # Allow empty prompts
        if input_ids.shape[1] == 0:
            input_ids = None
            attention_mask = None
            in_b = 1
        else:
            in_b = input_ids.shape[0]
        prompt_text = model_inputs.pop("prompt_text")

        # If there is a prefix, we may need to adjust the generation length. Do so without permanently modifying
        # generate_kwargs, as some of the parameterization may come from the initialization of the pipeline.
        prefix_length = generate_kwargs.pop("prefix_length", 0)
        if prefix_length > 0:
            has_max_new_tokens = "max_new_tokens" in generate_kwargs or (
                "generation_config" in generate_kwargs
                and generate_kwargs["generation_config"].max_new_tokens is not None
            )
            if not has_max_new_tokens:
                generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.model.config.max_length
                generate_kwargs["max_length"] += prefix_length
            has_min_new_tokens = "min_new_tokens" in generate_kwargs or (
                "generation_config" in generate_kwargs
                and generate_kwargs["generation_config"].min_new_tokens is not None
            )
            if not has_min_new_tokens and "min_length" in generate_kwargs:
                generate_kwargs["min_length"] += prefix_length

        # BS x SL
        generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, output_scores=True, return_dict_in_generate=True, **generate_kwargs)
        # print(generated_sequence)
        # forward_res = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # out_b = generated_sequence.shape[0]
        # if self.framework == "pt":
        #     generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
        # elif self.framework == "tf":
        #     pass
        return {"generated_sequence": generated_sequence, "input_ids": input_ids, "prompt_text": prompt_text}  #, "forward_res": forward_res}

  def forward(self, model_inputs, **forward_params):
        with self.device_placement():
            if self.framework == "tf":
                model_inputs["training"] = False
                model_outputs = self._forward(model_inputs, **forward_params)
            elif self.framework == "pt":
                inference_context = self.get_inference_context()
                with inference_context():
                    model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
                    model_outputs = self._forward(model_inputs, **forward_params)
                    model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
            else:
                raise ValueError(f"Framework {self.framework} is not supported")
        return model_outputs

  def postprocess(self, model_outputs, return_type=ReturnType.FULL_TEXT, clean_up_tokenization_spaces=True):
        generated_sequence = model_outputs["generated_sequence"][0]
        input_ids = model_outputs["input_ids"]
        prompt_text = model_outputs["prompt_text"]
        generated_sequence = generated_sequence.numpy().tolist()
        records = []
        for sequence in generated_sequence:
            if return_type == ReturnType.TENSORS:
                record = {"generated_token_ids": sequence}
            elif return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}:
                # Decode text
                text = self.tokenizer.decode(
                    sequence,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                )

                # Remove PADDING prompt of the sequence if XLNet or Transfo-XL model is used
                if input_ids is None:
                    prompt_length = 0
                else:
                    prompt_length = len(
                        self.tokenizer.decode(
                            input_ids[0],
                            skip_special_tokens=True,
                            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                        )
                    )

                all_text = text[prompt_length:]
                if return_type == ReturnType.FULL_TEXT:
                    if isinstance(prompt_text, str):
                        all_text = prompt_text + all_text
                    elif isinstance(prompt_text, Chat):
                        # Explicit list parsing is necessary for parsing chat datasets
                        all_text = list(prompt_text.messages) + [{"role": "assistant", "content": all_text}]

                record = {"generated_text": all_text}
            records.append(record)
        return model_outputs, records



In [5]:
prefix = "The recent advances in computational biology are"
access_token = "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"
model = "meta-llama/Llama-2-7b-chat-hf"
# # model = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model, token=access_token)

model = AutoModelForCausalLM.from_pretrained(
    model,
    token=access_token
 )



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:

pipeline = transformers.pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
torch_dtype=torch.float16,
device=device,
pipeline_class=OurPipeline,
)

In [16]:
# all_words = list(tokenizer.vocab.keys())
# prompts = []
# for _ in range(200):
#   prompts.append(' '.join([w for w in random.sample(dictionary, 10)]))

# print("prompt:", prompt)
sequences = pipeline(
'How NLP models been applied in the field of biology',
do_sample=True,
top_k=10,
num_return_sequences=1,
eos_token_id=tokenizer.eos_token_id,
max_length=400,
)

# for seq in sequences:
#   print(f"{seq['generated_text']}")

In [17]:
sequences[0]["generated_sequence"]["sequences"][0]

tensor([    1,  1128,   405, 13208,  4733,  1063,  7436,   297,   278,  1746,
          310,  4768,  3002,   322, 26602, 29973,    13,    13, 29940, 18771,
        17088, 10554,   292,   313, 29940, 13208, 29897,   756,  1063, 10231,
          368,  7436,   297,   278,  1746,   310,  4768,  3002,   322, 26602,
          297,  7786,  2440, 29892,   411,   263,  9377,  3464,   310,  8324,
         3704, 29901,    13,    13, 29896, 29889,  5739, 25426,  7418, 29901,
          405, 13208,   508,   367,  1304,   304, 27599,  2919, 18167,   310,
        20853,   293,   848, 29892,  3704, 18530,  4603,   848, 29892,   304,
        12439, 15038,   322,   534,  1975,   393,   508,  1871,  1749,  8004,
          310, 17135,  7208, 12903,   322, 14502, 16650,   583, 29889,    13,
        29906, 29889,   360, 11124, 20699, 29901,   405, 13208,   508,   367,
         1304,   304, 27599,  2919, 18167,   310,  4768, 27067,   936, 12845,
          304, 12439,  7037, 15721, 22525,   322,   304,  8500, 

In [75]:
tokenizer.decode(sequences[0]["generated_sequence"]["sequences"][0])

'<s> How NLP models been applied in the field of biology?\n\nNatural Language Processing (NLP) models have been increasingly applied in the field of biology to analyze and understand large amounts of biological data, such as genomic sequences, scientific articles, and clinical trial reports. Here are some ways NLP models have been applied in biology:\n\n1. Genomic analysis: NLP models have been used to analyze genomic sequences to identify variations, predict gene function, and understand gene regulation. For example, NLP models have been used to predict gene expression levels from RNA-seq data.\n2. Scientific article analysis: NLP models have been used to analyze scientific articles to identify key concepts, extract information, and predict the impact of research. For example, NLP models have been used to analyze the impact factor of scientific articles.\n3. Clinical trial analysis: NLP models have been used to analyze clinical trial reports to identify adverse events, predict patient

In [18]:
target = tokenizer.encode("NLP models have been increasingly applied in the field of biology")

In [20]:
import numpy as np
def fitness(sequence, pipeline, target_tokens, tokenizer, min_value=-10e2, max_length=300):
  sequences = pipeline(
                          sequence,
                          do_sample=True,
                          top_k=10,
                          num_return_sequences=1,
                          eos_token_id=tokenizer.eos_token_id,
                          max_length=max_length,
                        )
  scores = sequences[0]['generated_sequence']['scores']
  answer = tokenizer.decode(sequences[0]["generated_sequence"]["sequences"][0])
  best_fit = -np.inf

  for w in range(max_length):
    if w + len(target_tokens) <= len(scores):

      fit = [scores[w+i].squeeze()[token] for i,token in enumerate(target_tokens)]
      fit = list(map(lambda x: min_value if x == -np.inf else x, fit))
      sum_fit = sum(fit)

      if sum_fit > best_fit:
        best_fit = sum_fit
        sent =  [torch.argmax(scores[w+i].squeeze()) for i,token in enumerate(target_tokens)]
        #print(tokenizer.decode(sent))

  return best_fit

print(fitness("Hello",pipeline,target,tokenizer))
print(fitness("how is the pm of israel",pipeline,target,tokenizer))
print(fitness("what is comuptitonal biology",pipeline,target,tokenizer))
print(fitness("is NLP had been applied to biology",pipeline,target,tokenizer))


tensor(-13940.0195)
tensor(-12898.1299)
tensor(-12884.6758)
tensor(-12886.7393)


In [24]:
def fitness(sequence, pipeline, target, tokenizer):
  sequences = pipeline(
                          sequence,
                          do_sample=True,
                          top_k=10,
                          num_return_sequences=1,
                          eos_token_id=tokenizer.eos_token_id,
                          max_length=100,
                        )
  scores = sequences[0]['generated_sequence']['scores']
  answer = tokenizer.decode(sequences[0]["generated_sequence"]["sequences"][0])
  class_string = f'''You are tasked with evaluating the relevance of an answer to a given topic.

                        Answer: "{answer}"

                        Topic: "{target}"

                        Please follow these steps:

                        1. Evaluate how well the answer relates to the topic based on the following criteria:

                          - 0-0.2: The answer is not a cohesive sentence and/or completely off-topic.
                          - 0.2-0.5: The answer has little to no relevance to the topic.
                          - 0.5-0.7: The answer has some relevance but lacks detail or strong connection to the topic.
                          - 0.8-1: The answer is highly relevant, addressing the topic directly and providing clear insights.

                        2. Provide the result in the exact format: "The score is X", where X is the relevance score (a number between 0 and 1).

                        3. After giving the score, briefly explain your reasoning in 1-2 sentences.

                        Example Output:
                        "The score is 0.9. The answer directly addresses the topic and provides detailed information."
                          '''
  sequences = pipeline(
                      class_string,
                      do_sample=True,
                      top_k=10,
                      num_return_sequences=1,
                      eos_token_id=tokenizer.eos_token_id,
                      max_length=2500,
                    )
  print( tokenizer.decode(sequences[0]["generated_sequence"]["sequences"][0]))

fitness("is NLP been applied to biology? ",pipeline,"NLP in biology",tokenizer)

KeyboardInterrupt: 