## Set up

In [None]:
# Environment switches read by the set-up cells below.
on_colab: bool = False    # True -> mount Google Drive and cd into the project folder
do_install: bool = False  # True -> run the %pip install cells

In [None]:
# On Google Colab: mount Google Drive and work from the project folder.
# Skipped entirely unless `on_colab` is True.
if on_colab:
  from google.colab import drive
  # Mount the Drive under /content/drive
  drive.mount('/content/drive')
  # Move into the project folder so relative paths used later are resolved from there
  %cd /content/drive/MyDrive/AI/bjj_coach/
  print(f"Content of current working directory:")
  # List the folder content as a quick sanity check
  %ls

Mounted at /content/drive
/content/drive/MyDrive/AI/bjj_coach


In [None]:
# Install the notebook's dependencies (skipped unless `do_install` is True).
# NOTE(review): no version is pinned and transformers / peft / accelerate are
# installed from their GitHub repositories, so two installs may not give the
# same environment — consider pinning versions.
if do_install:
    # bitsandbytes and the Hugging Face libraries (latest from GitHub)
    %pip install -q -U bitsandbytes
    %pip install -q -U git+https://github.com/huggingface/transformers.git
    %pip install -q -U git+https://github.com/huggingface/peft.git
    %pip install -q -U git+https://github.com/huggingface/accelerate.git
    
    # Hugging Face datasets
    %pip install -q datasets
    
    %pip install -q -U einops
    %pip install -q -U safetensors
    
    %pip install -q -U torch
    
    # xformers, ctransformers (CUDA build) and sentence-transformers
    %pip install -q -U xformers
    %pip install -q -U ctransformers[cuda]
    %pip install -q -U sentence-transformers
    
    # LangChain and the Chroma vector store
    %pip install -q -U langchain
    %pip install -q chromadb
    

## Inference

### Inference with base model through *transformers*

In [None]:
#### CopyLeft Yogendra-Sisodia

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

##### Quantization settings: 4-bit NF4 weights, double quantization, fp16 compute #####
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

##### Load the model and its tokenizer from Hugging Face #####
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

model_4bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

##### Hugging Face text-generation pipeline #####
# NOTE: this rebinds the name `pipeline` from the transformers factory function
# to the pipeline object it returns; the LangChain cell reads it under that name.
pipeline = pipeline(
    "text-generation",
    model=model_4bit,
    tokenizer=tokenizer,
    device_map="auto",
    use_cache=True,
    # sampling set-up: top-5 sampling, a single returned sequence, 500 tokens max length
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    max_length=500,
    # the EOS token doubles as the padding token
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
##### Generation using LangChain pipeline #####

# Wrap the HF generation pipeline so that LangChain can use it as an LLM
from langchain import HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=pipeline)

# Prompt template in the Mistral-Instruct format: the instruction is wrapped in
# [INST] ... [/INST] and the prompt stops right there. `</s>` is the
# end-of-sequence token that closes the model's answer, so it must not be part
# of the prompt (it would tell the model the sequence is already finished).
template = """<s>[INST] You are a helpful, respectful and honest assistant. Answer exactly in few words from the context
Answer the question below from context below :
{context}
{question} [/INST]"""

# Build the prompt from the template and its two variables: the question and the context
from langchain import PromptTemplate
prompt = PromptTemplate(template=template, input_variables=["question","context"])

# Build the generation chain that feeds the filled-in prompt to the llm
from langchain import LLMChain
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
##### Inference #####
# Values substituted into the {question} and {context} slots of the prompt template
question_p = "Who is John Danaher ? How influential has he been ? What part of BJJ did he developped ?"
context_p = " You are a Brazilian Juijitsu expert."

# Run the chain on this question/context pair and print the answer
response = llm_chain.run(dict(question=question_p, context=context_p))
print(response)

### Inference with quantized model through *ctransformers*

In [None]:
#### CopyLeft Yogendra-Sisodia

##### Load the quantized GGUF model through CTransformers as a LangChain-ready llm #####
from langchain.llms import CTransformers

# Generation settings handed to CTransformers
config = dict(max_new_tokens=100, temperature=0)

llm = CTransformers(
    model="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    config=config,
)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

"John Danaher is a Brazilian Jiu-Jitsu practitioner and instructor known for his contributions to the sport's technical aspects. He has been highly influential in shaping the modern game through his innovative techniques and teaching methods."

In [None]:
##### Generation using LangChain pipeline #####

# Prompt template in the Mistral-Instruct format: the instruction is wrapped in
# [INST] ... [/INST] and the prompt stops right there. `</s>` is the
# end-of-sequence token that closes the model's answer, so it must not be part
# of the prompt (it would tell the model the sequence is already finished).
template = """<s>[INST] You are a helpful, respectful and honest assistant. Answer exactly in few words from the context
Answer the question below from context below :
{context}
{question} [/INST]"""

# Build the prompt from the template and its two variables: the question and the context
from langchain import PromptTemplate
prompt = PromptTemplate(template=template, input_variables=["question","context"])

# Build the generation chain that feeds the filled-in prompt to the llm
from langchain import LLMChain
llm_chain = LLMChain(prompt=prompt, llm=llm)


In [None]:
##### Inference #####
# Values substituted into the {question} and {context} slots of the prompt template
question_p = "Who is John Danaher ? How influential has he been ? What part of BJJ did he developped ?"
context_p = " You are a Brazilian Juijitsu expert."

# Run the chain on this question/context pair; the answer is the cell's displayed value
response = llm_chain.run(dict(question=question_p, context=context_p))
response

## Training and Inference with quantized model through *transformers*

Original blog post:
- https://huggingface.co/blog/4bit-transformers-bitsandbytes

Loading a model using bitsandbytes (4-bit quantization, double quantization, NF4, ...):

- https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf?usp=sharing

Fine-tuning a model using these techniques:
- https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing



#### Inference with a quantized model

In [None]:
##### Imports #####
# Imported in this cell as well, so that this section also runs on its own
# without first executing the Mistral cells of the "Inference" section.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

##### Choose the model #####
# `model_id` is the Hugging Face repository, `model_name` the short name used
# when saving results.
model_id = "facebook/opt-350m"
model_name = "opt350m"

#model_id = "mistralai/Mistral-7B-Instruct-v0.1"
#model_name = "mistral7B"

##### Quantization config #####
# 4-bit NF4 weights with double quantization; computations run in bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

##### Load the model from Hugging Face #####
model_4bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

## Configuration
#from transformers import GenerationConfig
# Download configuration from huggingface.co and cache.
#generation_config = GenerationConfig.from_pretrained(model_name)
## Saving configuration / loading locally saved configuration file
#generation_config.save_pretrained("./saved_model/", config_file_name="my_configuration.json")
#generation_config = GenerationConfig.from_pretrained("./saved_model/", "mistral7B_config.json")


In [None]:
# Show the device map of the loaded model (module name -> device)
model_4bit.hf_device_map

{'': 0}

In [None]:
### Generation : "Who is John Danaher ?" ###
text = "You are a martial art expert with a specialty in grappling arts (wrestling, judo, BJJ). Please answer the following question. Who is John Danaher ? How influential has he been ?"
# The model was placed with device_map="auto": send the inputs to the device
# the model actually lives on instead of assuming "cuda:0".
device = model_4bit.device
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate at most 200 new tokens and print the decoded sequence
outputs = model_4bit.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are a martial art expert with a specialty in grappling arts (wrestling, judo, BJJ). Please answer the following question. Who is John Danaher ? How influential has he been ?

John Danaher is a Brazilian Jiu-Jitsu (BJJ) practitioner and instructor who is widely regarded as one of the most influential figures in the sport. He is known for his innovative training methods and his ability to teach complex techniques to his students. Danaher has trained many of the top BJJ practitioners in the world, including Garrett Benson, Nicky Ryan, and Tom DeLonge.

Danaher has also been a pioneer in the use of video-based training, creating a series of instructional videos called "The Danaher Collection" that have become popular among BJJ practitioners. He has also been a vocal advocate for the use of data and analytics in training, and has developed a system for tracking and analyzing the performance of his students.

Overall, John Danaher has had a significant impact on the sport of BJJ and has 

In [None]:
### Generation : "what is an armbar" ###
# The prompt is the role-setting context followed by the question
context = "You are a martial art expert with a specialty in grappling arts (wrestling, judo, BJJ). Please answer the following question. "
question = "What is an armbar ? Describe how to apply it step by step."
prompt = context + question
# The model was placed with device_map="auto": send the inputs to the device
# the model actually lives on instead of assuming "cuda:0".
device = model_4bit.device
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate at most 400 new tokens and print the decoded sequence
outputs = model_4bit.generate(**inputs, max_new_tokens=400)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are a martial art expert with a specialty in grappling arts (wrestling, judo, BJJ). Please answer the following question. What is an armbar ? Describe how to apply it step by step.

## Answer (1)

An armbar is a submission hold that is used to immobilize an opponent by applying pressure to the joints of the arm. It is a common submission hold in grappling arts such as judo, Brazilian Jiu-Jitsu, and wrestling.

Here are the steps to apply an armbar:

1. Start in a closed guard position, with your knees bent and your feet flat on the ground.
2. Use your legs to control your opponent's posture and prevent them from escaping.
3. Use your hips to lift your opponent off the ground, bringing their arm up and exposing the shoulder joint.
4. Use your hand to grab the opponent's arm and apply pressure to the shoulder joint, forcing them to submit.
5. Release the hold as soon as your opponent taps out or the referee stops the match.

It's important to note that armbars can be dangerous if not

In [None]:
### Generation : "What are leg locks ?" ###
# The prompt is the role-setting context followed by the question
context = "You are a martial art expert with a specialty in grappling arts (wrestling, judo, BJJ). Please answer the following question. "
question = "Educate me on leg locks."
prompt = context + question
# The model was placed with device_map="auto": send the inputs to the device
# the model actually lives on instead of assuming "cuda:0".
device = model_4bit.device
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate at most 400 new tokens and print the decoded sequence
outputs = model_4bit.generate(**inputs, max_new_tokens=400)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are a martial art expert with a specialty in grappling arts (wrestling, judo, BJJ). Please answer the following question. Educate me on leg locks.

Leg locks are a type of grappling technique used in martial arts that involve immobilizing an opponent by applying pressure to their legs. These techniques can be used to subdue an opponent or to set up other grappling techniques.

There are several different types of leg locks, including:

1. Butterfly lock: This is a common leg lock used in Brazilian Jiu-Jitsu (BJJ). It involves wrapping both legs around an opponent's waist and applying pressure to their hips to immobilize them.
2. Americana: This is a leg lock used in Brazilian Jiu-Jitsu and other grappling arts. It involves wrapping one leg around an opponent's waist and applying pressure to their hips to immobilize them.
3. Kimura: This is a leg lock used in Brazilian Jiu-Jitsu and other grappling arts. It involves wrapping one leg around an opponent's waist and applying pressure t

#### QLoRA training

In [None]:
# Colab workaround: make `locale.getpreferredencoding` always report UTF-8
import locale


def getpreferredencoding(do_setlocale = True):
    """Return "UTF-8" unconditionally; `do_setlocale` is accepted but ignored."""
    return "UTF-8"


# Replace the standard-library function with the override above
locale.getpreferredencoding = getpreferredencoding

# Install the Hugging Face `datasets` package (only when `do_install` is True)
if do_install:
    %pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Import the training data
import pandas as pd
training_df = pd.read_csv("danaher_data.csv")
# A bare `training_df.head()` in the middle of a cell displays nothing;
# display() (provided by IPython in notebooks) renders the preview explicitly.
display(training_df.head())


# Get this data into Hugging Face datasets object
import datasets
from datasets import Dataset, DatasetDict
# From pandas df to Hugging Face dataset, followed by a 90/10 train-validation split.
# The seed is fixed so that re-running the notebook yields the same split.
train_val_datasets = Dataset.from_pandas(training_df).train_test_split(test_size=0.1, seed=42)
# Tokenization of the "caption" column, batch by batch
train_val_tokenized_dataset = train_val_datasets.map(lambda sample: tokenizer(sample["caption"]), batched=True)

# Final datasets: the "test" part of the split is used as validation set
train_tokenized_ds, val_tokenized_ds = train_val_tokenized_dataset["train"], train_val_tokenized_dataset["test"]


Map:   0%|          | 0/1076 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [None]:
# Display the tokenized training split (features and number of rows)
train_tokenized_ds

Dataset({
    features: ['caption', 'input_ids', 'attention_mask'],
    num_rows: 1076
})

In [None]:
# Pre-processing to get the model ready for QLoRA training
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then let peft prepare the quantized model for training.
# `model_4bit` is rebound to the prepared model, so this cell is meant to run once per loaded model.
model_4bit.gradient_checkpointing_enable()
model_4bit = prepare_model_for_kbit_training(model_4bit)

In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, each parameter providing ``numel()``
            and ``requires_grad``.

    Prints a single line with the trainable count, the total count and the
    trainable percentage. A model without any parameter is reported with
    ``trainable%: 0.0`` instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: with no parameters at all the division would fail.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
# LoRA adapter configuration and model wrapping
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=None,
    bias="none",
)

# Wrap the prepared model with the LoRA adapters described by `config`
model_4bit = get_peft_model(model_4bit, config)

# Report how many parameters are actually trainable under LoRA
print_trainable_parameters(model_4bit)

trainable params: 786432 || all params: 180463616 || trainable%: 0.43578424140631206


In [None]:
# Training set-up: hyper-parameters and Trainer

import transformers

# TODO: add the `compute_metrics` argument (works with a transformers.EvalPrediction object)
##valpredictions = transformers.EvalPrediction()

training_params = transformers.TrainingArguments(
    output_dir="outputs",
    # schedule
    num_train_epochs=3,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    # optimization
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    weight_decay=1e-5,
    warmup_steps=2,
    fp16=True,
    # evaluation and logging
    evaluation_strategy="epoch",
    logging_steps=1,
)

trainer = transformers.Trainer(
    model=model_4bit,
    args=training_params,
    train_dataset=train_tokenized_ds,
    eval_dataset=val_tokenized_ds,
    # causal language modeling collator (mlm disabled)
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
# Train the LoRA adapters, save the result, then put the model back in an
# inference-ready state.
# The generation cache is switched off while training (this silences the
# warnings) and restored afterwards, even if training is interrupted, so that
# the generation cells that follow do not silently run with the cache disabled.
previous_use_cache = model_4bit.config.use_cache
model_4bit.config.use_cache = False
try:
    trainer.train()

    trainer.save_model(f"./results/{model_name}")
finally:
    model_4bit.config.use_cache = previous_use_cache
    # Evaluation mode: turns off dropout (the LoRA config uses lora_dropout=0.05)
    model_4bit.eval()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,3.4516,3.339878
2,3.3823,3.300882
3,3.2884,3.289176


Let's try the fine-tuned model!

In [None]:
### Generation : "What is an armbar ?" ###

# The prompt is the role-setting context followed by the question
context = "You are a martial art expert with a specialty in grappling arts (wrestling, judo, BJJ). Please answer the following question. "
question = "What is an armbar ? Describe how to apply it step by step."
prompt = context + question
# Send the inputs to the device the model actually lives on instead of assuming "cuda:0"
device = model_4bit.device
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate at most 400 new tokens and print the decoded sequence
outputs = model_4bit.generate(**inputs, max_new_tokens=400)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



You are a martial art expert with a specialty in grappling arts (wrestling, judo, BJJ). Please answer the following question. What is an armbar? Describe how to apply it step by step.

Answer:

Armbar is a technique that allows you to apply a combination of both a grip and a shoulder to a target. It is a very effective technique for both grappling and grappling. It is a very effective technique for both grappling and grappling. It is a very effective technique for both grappling and grappling. It is a very effective technique for both grappling and grappling. It is a very effective technique for both grappling and grappling. It is a very effective technique for both grappling and grappling. It is a very effective technique for both grappling and grappling. It is a very effective technique for both grappling and grappling. It is a very effective technique for both grappling and grappling. It is a very effective technique for both grappling and grappling. It is a very effective technique

In [None]:
# Generation : completion of "An armbar is " with 2-beam search
# Send the inputs to the device the model actually lives on instead of assuming "cuda:0"
device = model_4bit.device
inputs = tokenizer("An armbar is ", return_tensors="pt").to(device)

# Beam search with 2 beams, at most 400 new tokens
outputs = model_4bit.generate(**inputs, max_new_tokens=400, num_beams=2)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

An armbar is  a type of grappling technique that involves the submission of a target with a combination of both hands and feet. It’s a technique that has been used in Jiu jitsu for centuries. It’s a technique that has been used in Jiu jitsu for centuries. It’s a technique that has been used in Jiu jitsu for centuries. It’s a technique that has been used in Jiu jitsu for centuries. It’s a technique that has been used in Jiu jitsu for centuries. It’s a technique that has been used in Jiu jitsu for centuries. It’s a technique that has been used in Jiu jitsu for centuries. It’s a technique that has been used in Jiu jitsu for centuries. It’s a technique that has been used in Jiu jitsu for centuries. It’s a technique that has been used in Jiu jitsu for centuries. It’s a technique that has been used in Jiu jitsu for centuries. It’s a technique that has been used in Jiu jitsu for centuries. It’s a technique that has been used in Jiu jitsu for centuries. It’s a technique that has been used in J

In [None]:
### Generation : "An armbar is " ###

# Send the inputs to the device the model actually lives on instead of assuming "cuda:0"
device = model_4bit.device
inputs = tokenizer("An armbar is ", return_tensors="pt").to(device)

# Default generation settings, at most 400 new tokens
outputs = model_4bit.generate(**inputs, max_new_tokens=400)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

An armbar is  a very dangerous and dangerous combination. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very 

In [None]:
### Generation : "An armbar is " ### with 3-beam search and top_p sampling

# Send the inputs to the device the model actually lives on instead of assuming "cuda:0"
device = model_4bit.device
inputs = tokenizer("An armbar is ", return_tensors="pt").to(device)

# Beam search with 3 beams combined with sampling restricted to top_p=0.1, at most 400 new tokens
outputs = model_4bit.generate(**inputs, max_new_tokens=400, num_beams=3, do_sample=True, top_p=0.1)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

An armbar is  a very dangerous and dangerous combination. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very dangerous and dangerous combinations. It is a combination of two very 