In [1]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
!nvidia-smi

Sun Apr 28 17:45:05 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   30C    P8              12W /  72W |      1MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import pandas as pd
import locale
import torch
import re
import transformers
from transformers import AutoTokenizer

locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')


if torch.cuda.is_available():
  torch.set_default_device("cuda")
  print("cuda")
else:
  torch.set_default_device("cpu")

def import_tsv(file_path):
    return pd.read_csv(file_path, sep='\t')


# Extract of to 5 terms
def extract_difficult_terms(model, pipeline, tokenizer, passages):
    passage_to_terms = {}
    for passage in passages:
        input_text = f'''
                     extract at the most 5 difficult to understand terms from this
                     passage that are required to understand the passage.
                     You do not have to always extract 5 terms, only choose difficult terms neccesary for understanding.
                     do not define them, just list them. \n
                     {passage}
                     '''

        messages = [ {"role": "system", "content": "You are a proof-reader who identifies difficult terms in research papers."},
                  {"role": "user", "content": input_text},]
        prompt = pipeline.tokenizer.apply_chat_template(messages, tokenize=False,
                                                    add_generation_prompt=True)
        terminators = [ pipeline.tokenizer.eos_token_id, pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")]

        outputs = pipeline(prompt, max_new_tokens=10000, eos_token_id=terminators, do_sample=True, temperature=0.6, top_p=0.9)
        result = outputs[0]["generated_text"][len(prompt):]

        # Remove LLM introduction phrase for result
        lines = result.split('\n')
        remaining_lines = lines[1:]
        result = '\n'.join(remaining_lines)

        passage_to_terms[passage] = result

    return passage_to_terms


# Provide an explanation for the selected difficult terms
def explain_difficult_terms(model, pipeline, tokenizer, passage_to_difficult_terms):
    terms_to_explanations = {}

    for passage, terms in passage_to_difficult_terms.items():
        input_text = f'''
                  provide a breif one sentence definition for each of the following terms targeted towards a general audience. \n
                  {terms}
                  '''
        messages = [ {"role": "system", "content": "You are a polymath who can effectively describe complex terms to a general audience within one sentence."},
                  {"role": "user", "content": input_text},]
        prompt = pipeline.tokenizer.apply_chat_template(messages, tokenize=False,
                                                    add_generation_prompt=True)
        terminators = [pipeline.tokenizer.eos_token_id, pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")]

        outputs = pipeline(prompt, max_new_tokens=10000, eos_token_id=terminators, do_sample=True, temperature=0.6, top_p=0.9)
        result = outputs[0]["generated_text"][len(prompt):]

        # Remove LLM introduction phrase for result
        lines = result.split('\n')
        remaining_lines = lines[1:]
        result = '\n'.join(remaining_lines)

        terms_to_explanations[terms] = result

    return terms_to_explanations


def main():
    model = "meta-llama/Meta-Llama-3-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model)
    pipeline = transformers.pipeline("text-generation", model=model,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda")

    # Testing data
    data = import_tsv("simpletext-task2-test-small.tsv")
    passage_ids = data["snt_id"].tolist()
    passages = data["source_snt"].tolist()



    passage_to_difficult_terms = extract_difficult_terms(model, pipeline, tokenizer,
                                                         passages)

    terms_to_definitions = explain_difficult_terms(model, pipeline, tokenizer, passage_to_difficult_terms)

    passages_list = []
    terms_list = []
    explanations_list = []


    for passage, terms in passage_to_difficult_terms.items():
        explanation = terms_to_definitions.get(terms, "")
        passages_list.append(passage)
        terms_list.append(terms.strip())
        explanations_list.append(explanation)

    results_df = pd.DataFrame({
        "snt_id": passage_ids,
        "Passage": passages_list,
        "Identified Difficult Terms": terms_list,
        "Explanations": explanations_list
    })


    results_df.to_csv("results.tsv", sep="\t", index=False)
    !mv results.tsv /content/


main()

cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:12

KeyboardInterrupt: 