In [1]:
!pip install -U pip
!pip install bitsandbytes==0.39.0
!pip install torch==2.0.1
!pip install transformers==4.30.0
!pip install accelerate==0.20.3
!pip install git+https://github.com/huggingface/peft.git
!pip install datasets==2.12.0
!pip install loralib==0.1.1
!pip install einops==0.6.1
!pip install langchain
!pip install xformers

Collecting bitsandbytes==0.39.0
  Downloading bitsandbytes-0.39.0-py3-none-any.whl (92.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/92.2 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.39.0
Collecting transformers==4.30.0
  Downloading transformers-4.30.0-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m86.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers==4.30.0)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [1]:
import json
import os
from pprint import pprint

import bitsandbytes as bnb
import langchain
from langchain.llms import HuggingFacePipeline
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import ConversationChain
import torch
import transformers
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [2]:
# Runtime configuration: GPU selection, optional Google Drive mount, and the
# experiment whose adapter is loaded by default.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
DEVICE = 'cuda:0'
COLAB_PATH = None

import locale

# Mount Google Drive only when the Colab runtime is actually available.
# Outside Colab the import fails; COLAB_PATH then stays None so the
# model-loading cell uses its non-Colab path instead of crashing here.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    # NOTE(review): presumably a workaround for Colab requiring a UTF-8 locale
    # after some libraries are imported — confirm it is still needed.
    locale.getpreferredencoding = lambda: "UTF-8"
    drive.mount('/content/drive')
    COLAB_PATH = '/content/drive/MyDrive/gntLM/FinalGNTLM'

# default inference on proposed model
EXPERIMENT = 'experiment_4'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def get_local_peft_model(experiment_name: str=EXPERIMENT, initial: bool=True) -> str:
    """Translate an experiment name into the directory name of its PEFT model.

    Args:
        experiment_name: One of the known experiment keys, or (when
            ``initial`` is False) an arbitrary model directory.
        initial: When True, unknown names are rejected. When False, an
            unknown name is returned unchanged so a custom directory can be used.

    Returns:
        The model directory name for a known experiment, or
        ``experiment_name`` itself when it is unknown and ``initial`` is False.

    Raises:
        KeyError: If ``experiment_name`` is unknown and ``initial`` is True.
    """
    known_models = {
        'experiment_1': 'experiment_1_model',
        'experiment_2': 'experiment_2_model',
        'experiment_3': 'experiment_3_model',
        'experiment_4': 'experiment_4_model'
    }
    try:
        model_dir = known_models[experiment_name]
    except KeyError:
        # Unknown name: pass it through as a custom directory unless the
        # caller asked for strict validation.
        if not initial:
            return experiment_name
        raise KeyError(f'Please choose a name in {list(known_models.keys())}. If you want to load a different model, please provide the model directory and set initial=False')
    return model_dir


def normalize_response(chat_chain: "ConversationChain", query: str) -> str:
    """Run one conversational turn and return the cleaned-up reply.

    The chain is asked to answer ``query``. The newest message stored in the
    chain's memory is then trimmed and written back, so later turns see the
    cleaned text in their history.

    Cleaning steps, in order:
      1. keep only the text before the first blank line (``'\\n\\n'``);
      2. strip surrounding whitespace;
      3. drop a trailing ``'Human:'`` and then a trailing ``'AI:'`` marker;
      4. strip surrounding whitespace again.

    Args:
        chat_chain: The conversation chain to query. The annotation is quoted
            so defining the function does not evaluate the type.
        query: The human input for this turn.

    Returns:
        The cleaned content of the last message in the chain's memory.
    """
    chat_chain.predict(input=query)

    # Operate on the chain that was passed in, not on a notebook-level global,
    # so the function works for any chain instance.
    last_message = chat_chain.memory.chat_memory.messages[-1]

    reply = last_message.content.split('\n\n')[0].strip()
    for stop_text in ['Human:', 'AI:']:
        reply = reply.removesuffix(stop_text)
    reply = reply.strip()

    # Persist the cleaned text so the conversation history stays tidy.
    last_message.content = reply
    return reply

In [4]:
# Resolve the adapter directory for the default experiment.
GNTLM = get_local_peft_model()
# NOTE(review): without COLAB_PATH the adapter is looked up directly under the
# filesystem root ('/<model dir>') — confirm that is the intended location.
PEFT_MODEL = f'{COLAB_PATH}/{GNTLM}' if COLAB_PATH else f'/{GNTLM}'

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16
)

# The adapter's config names the base model it was trained on.
config = PeftConfig.from_pretrained(PEFT_MODEL)

# `load_in_4bit` is not repeated here: `bnb_config` already carries it, and
# supplying both the flag and `quantization_config` is redundant (and, to my
# knowledge, rejected by newer transformers releases).
# NOTE(review): `trust_remote_code=True` executes modelling code downloaded
# from the hub; the recorded run warns that no revision is pinned — consider
# pinning one.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# Reuse EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): BOS id is forced to 1 — presumably to match the base model's
# generation config; confirm.
tokenizer.bos_token_id = 1

# Attach the fine-tuned adapter weights on top of the quantized base model.
model = PeftModel.from_pretrained(model, PEFT_MODEL)

Downloading (…)lve/main/config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

Downloading (…)/configuration_RW.py:   0%|          | 0.00/2.61k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b:
- configuration_RW.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)main/modelling_RW.py:   0%|          | 0.00/47.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b:
- modelling_RW.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

In [8]:
# Token-id sequences for the speaker markers "Human:" and "AI:", each stored
# as a LongTensor on DEVICE so they can be compared with generated ids.
# Assumes 'Human', 'AI' and ':' each map to a vocabulary token — TODO confirm.
stop_token_ids = [
    torch.LongTensor(tokenizer.convert_tokens_to_ids(marker_tokens)).to(DEVICE)
    for marker_tokens in (['Human', ':'], ['AI', ':'])
]

In [9]:
class StopOnTokens(transformers.StoppingCriteria):
    """Stopping criterion that fires when the output ends with a stop marker.

    The markers are read from the module-level ``stop_token_ids`` list. Only
    the first sequence of the batch (``input_ids[0]``) is inspected.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True if the tail of ``input_ids[0]`` equals any stop marker."""
        return any(
            torch.eq(input_ids[0][-len(marker):], marker).all()
            for marker in stop_token_ids
        )

In [10]:
# Wrap the custom criterion in a StoppingCriteriaList; this object is what the
# text-generation pipeline receives as `stopping_criteria`.
stopping_criteria = transformers.StoppingCriteriaList([StopOnTokens()])

In [11]:
# Text-generation pipeline around the adapter-augmented model.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    # Output handling: include the prompt in the returned text.
    return_full_text=True,
    # Halt as soon as a "Human:" / "AI:" marker is produced, or after 128 new tokens.
    stopping_criteria=stopping_criteria,
    max_new_tokens=128,
    # Decoding knobs.
    # NOTE(review): `do_sample` is not set here; unless sampling is enabled
    # elsewhere (e.g. the model's generation config), temperature / top_p /
    # top_k may have no effect — confirm.
    temperature=0.7,
    top_p=0.7,
    top_k=0,
    repetition_penalty=1.5
)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaForCausalLM', 'RobertaPreLayerN

In [12]:
# Pass-through prompt: the template is just the instruction itself.
# NOTE(review): `prompt` and `llm_chain` are not used by any later cell visible
# in this notebook — confirm whether they are still needed.
prompt = langchain.PromptTemplate(
    input_variables=["instruction"],
    template="{instruction}"
)

# Expose the transformers pipeline as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=generate_text)
llm_chain = langchain.LLMChain(llm=llm, prompt=prompt)

# Windowed conversation memory, injected into the prompt as {history}.
# Presumably k=5 limits the history to the five most recent exchanges — confirm
# against the LangChain documentation.
# NOTE(review): verify that `return_only_outputs` is a setting this memory
# class actually honours.
memory = ConversationBufferWindowMemory(
    memory_key="history",
    k=5,
    return_only_outputs=True
)

# Conversation chain that combines the LLM with the windowed memory.
chat = ConversationChain(
    llm=llm,
    memory=memory,
    verbose=False
)

# Replace the chain's prompt text with the guidance-counseling persona.
# The template must keep the {history} and {input} placeholders.
# NOTE(review): the wording ("an guidance counseling AI") is left exactly as
# written because the adapter may have been tuned against this text — confirm
# before correcting the grammar.
chat.prompt.template = \
"""The following is a conversation between a human and an guidance counseling AI. The AI is talkative and provides lots of specific details from its context and focuses on answering the question posed by the human. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
{history}
Human: {input}
AI:"""

In [13]:
# First turn: ask for a career suggestion; the cleaned reply is the cell's output.
normalize_response(chat, 'I have a strong sense of justice and can persuade people with my speech prowess. What should my future occupation be?')

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


'You are very persuasive and passionate about your beliefs. Your ability to communicate effectively makes you well suited for public speaking or debate competitions. Perhaps becoming a lawyer or politician would also suit you as these careers often require persuasion skills. However, if you prefer working independently rather than in teams, consider being a writer or consultant. These occupations typically involve communicating ideas and persuading others through written text or spoken words respectively.'

In [None]:
# Follow-up turn on the same chain, so the previous exchange is available via its memory.
normalize_response(chat, 'Could you please give me a couple more choices? I do not really want to become a politician.')

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


'Based on your current interests and passions, there are several other career options that may suit you better than being either a lawyer or politician. Some alternatives include working as a journalist or author, both roles requiring excellent writing abilities and knowledge about societal issues. Another option would be joining a non-profit organization where you could use your persuasion skills to advocate for specific causes. Finally, if you enjoy public speaking and debating topics related to law and politics, perhaps teaching or consulting might interest you.'