In [1]:
from transformers import AutoTokenizer, pipeline, logging
import argparse

MEX_NEW_TOKENS = 16


## Load models

### Falcon 7B

In [2]:
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

model_name_or_path = "TheBloke/falcon-7b-instruct-GPTQ"

model_basename = "gptq_model-4bit-64g"

use_triton = False

tokenizer_falcon = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

model_falcon = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
        model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device="cuda:0",
        use_triton=use_triton,
        quantize_config=None)

pipe_falcon = pipeline(
    "text-generation",
    model=model_falcon,
    tokenizer=tokenizer_falcon,
    max_new_tokens=16,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15,
    pad_token_id=tokenizer_falcon.eos_token_id,
)


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
binary_path: c:\Users\Marwa\AppData\Local\Programs\Python\Python310\lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll
CUDA SETUP: Loading binary c:\Users\Marwa\AppData\Local\Programs\Python\Python310\lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll...


CUDA extension not installed.
The safetensors archive passed at C:\Users\Marwa/.cache\huggingface\hub\models--TheBloke--falcon-7b-instruct-GPTQ\snapshots\78e749e37afef2600decdc389a5c65c82c7589f4\gptq_model-4bit-64g.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.
can't get model's sequence length from model config, will set to 4096.
RWGPTQForCausalLM hasn't fused attention module yet, will skip inject fused attention.
RWGPTQForCausalLM hasn't fused mlp module yet, will skip inject fused mlp.
A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'
The model 'RWGPTQForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', '

### OpenAI-GPT

In [3]:
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
import torch

pipe_gpt = pipeline('text-generation', model='openai-gpt')


## Evaluate

In [5]:
import evaluate

custom_metric = evaluate.load("./custom_metric.py")

input_prompts = [
    "Once upon a time",
    "To be or not to be, that is the",
    "The quick brown fox jumps over the",
]

reference_texts = [
    ["Once upon a time, there was a beautiful princess."],
    ["To be or not to be, that is the question."],
    ["The quick brown fox jumps over the lazy dog."],
]

In [6]:
%%time

predictions_faclon = [pipe_falcon(prompt)[0]["generated_text"].split('\n')[0] for prompt in input_prompts]
predictions_faclon

CPU times: total: 13.4 s
Wall time: 34.7 s


['Once upon a time, there was a little girl named Alice. She was a curious little girl,',
 'To be or not to be, that is the question:',
 'The quick brown fox jumps over the lazy dog.']

In [7]:
%%time

predictions_gpt = [pipe_gpt(prompt, max_length=MEX_NEW_TOKENS, num_return_sequences=1)[0]["generated_text"].split('\n')[0] for prompt in input_prompts]
predictions_gpt

CPU times: total: 3.78 s
Wall time: 1.97 s


["Once upon a time, and i 'd heard all that, but i knew i",
 'To be or not to be, that is the case, " he answered coldly',
 'The quick brown fox jumps over the side on long, black legs, and leaps']

In [8]:
score_falcon = custom_metric.compute(predictions=predictions_faclon, references=reference_texts)
print("falcon:", score_falcon)
score_gpt = custom_metric.compute(predictions=predictions_gpt, references=reference_texts)
print("gpt:", score_gpt)


falcon: {'sacrebleu_score': 65.52854486644404}
gpt: {'sacrebleu_score': 39.5929619041636}
