In [1]:
import pandas as pd

from pathlib import Path

# Alpaca-style preambles. Each one ends with a blank line so that the
# "### Instruction:" header starts on its own line instead of being glued to
# the end of the sentence.
PROMPT_WITH_INPUT = (
    "Below is an instruction that describes a task, paired with an input that "
    "provides further context. Write a response that appropriately completes "
    "the request.\n\n"
)
PROMPT_NO_INPUT = (
    "Below is an instruction that describes a task. Write a response that "
    "appropriately completes the request.\n\n"
)

SOURCE_CSV = "winamax_faq.csv"
TRAIN_CSV = Path("data/train.csv")


def build_training_text(instruction, input_query, response):
    """Render one training example as a single prompt string.

    Args:
        instruction: The question / task description.
        input_query: Optional extra context; blank or whitespace-only means
            "no input" and the "### Input:" section is omitted.
        response: The expected answer.

    Returns:
        The preamble followed by the "### Instruction:", optional
        "### Input:" and "### Response:" sections.
    """
    instruction = str(instruction)
    input_query = str(input_query)
    response = str(response)

    if not input_query.strip():
        return (
            PROMPT_NO_INPUT
            + "### Instruction:\n"
            + instruction
            + "\n### Response:\n"
            + response
        )
    return (
        PROMPT_WITH_INPUT
        + "### Instruction:\n"
        + instruction
        + "\n### Input:\n"
        + input_query
        + "\n### Response:\n"
        + response
    )


faq_raw = pd.read_csv(SOURCE_CSV)

# Fail early with a readable message rather than a bare KeyError further down.
missing_columns = {"questions", "answers"} - set(faq_raw.columns)
if missing_columns:
    raise KeyError(
        f"{SOURCE_CSV} is missing required column(s): {sorted(missing_columns)}"
    )

# instruction/output come from the FAQ columns; `input` is always empty for
# this dataset. Missing cells become "" so they do not render as "nan".
df = (
    faq_raw
    .rename(columns={"questions": "instruction", "answers": "output"})
    .assign(input="")
    .fillna("")
)

df["text"] = [
    build_training_text(instruction, input_query, response)
    for instruction, input_query, response in zip(
        df["instruction"], df["input"], df["output"]
    )
]

# Create data/ if needed so the write does not fail on a fresh runtime, then
# write the file once, after the `text` column exists.
TRAIN_CSV.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(TRAIN_CSV, index=False)




In [2]:
#@title 🤗 AutoTrain LLM
#@markdown In order to use this colab
#@markdown - upload train.csv to a folder named `data/`
#@markdown - train.csv must contain a `text` column
#@markdown - choose a project name if you wish
#@markdown - change model if you wish, you can use most of the text-generation models from Hugging Face Hub
#@markdown - add huggingface information (token and repo_id) if you wish to push trained model to huggingface hub
#@markdown - update hyperparameters if you wish
#@markdown - click `Runtime > Run all` or run each cell individually

# `os` is used by the configuration cell to export settings via os.environ.
import os
# Install/upgrade the AutoTrain package. Only stdout is redirected to
# install_logs.txt, so anything pip writes to stderr (e.g. dependency-resolver
# errors) still shows up in the cell output.
!pip install -U autotrain-advanced > install_logs.txt
# Run AutoTrain's Colab setup step; stdout goes to setup_logs.txt.
# NOTE(review): judging by the captured output, this installs the latest
# transformers, peft, diffusers and trl — confirm against the AutoTrain docs.
!autotrain setup --colab > setup_logs.txt
# Install the Hugging Face Hub client (unpinned; output is not redirected).
!pip install huggingface_hub

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires kaleido, which is not installed.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires openai, which is not installed.
tensorflow 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.8.0 which is incompatible.
tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 4.23.4 which is incompatible.[0m[31m
[0m> [1mINFO    Installing latest transformers@main[0m
> [1mINFO    Successfully installed latest transformers[0m
> [1mINFO    Installing latest peft@main[0m
> [1mINFO    Successfully installed latest peft[0m
> [1mINFO    Installing latest diffusers@main[0m
> [1mINFO    Successfully installed latest diffusers[0m
> [1mINFO    Installing latest trl@main[0m
> [1mINFO    Successfully installed latest

In [3]:
# Re-run AutoTrain's setup step with the --update-torch flag.
# NOTE(review): per the captured output below, this reinstalls transformers,
# peft, diffusers, trl, xformers and PyTorch; a runtime restart may be needed
# afterwards for the new PyTorch build to be picked up — confirm.
!autotrain setup --update-torch

    PyTorch 2.1.0+cu121 with CUDA 1201 (you have 2.1.0+cu118)
    Python  3.10.13 (you have 3.10.12)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
> [1mINFO    Installing latest transformers@main[0m
> [1mINFO    Successfully installed latest transformers[0m
> [1mINFO    Installing latest peft@main[0m
> [1mINFO    Successfully installed latest peft[0m
> [1mINFO    Installing latest diffusers@main[0m
> [1mINFO    Successfully installed latest diffusers[0m
> [1mINFO    Installing latest trl@main[0m
> [1mINFO    Successfully installed latest trl[0m
> [1mINFO    Installing latest xformers[0m
> [1mINFO    Successfully installed latest xformers[0m
> [1mINFO    Installing latest PyTorch[0m
> [1mINFO    Successfully installed latest PyTorch[0m


In [4]:
#@markdown ---
#@markdown #### Project Config
#@markdown Note: if you are using a restricted/private model, you need to enter your Hugging Face token in the next step.
project_name = 'my_autotrain_llm' # @param {type:"string"}
model_name = 'abhishek/llama-2-7b-hf-small-shards' # @param {type:"string"}

#@markdown ---
#@markdown #### Push to Hub?
#@markdown Use these only if you want to push your trained model to a private repo in your Hugging Face Account
#@markdown If you don't use these, the model will be saved in Google Colab and you are required to download it manually.
#@markdown Please enter your Hugging Face write token. The trained model will be saved to your Hugging Face account.
#@markdown You can find your token here: https://huggingface.co/settings/tokens
push_to_hub = False # @param ["False", "True"] {type:"raw"}
hf_token = "hf_XXX" #@param {type:"string"}
repo_id = "username/repo_name" #@param {type:"string"}

#@markdown ---
#@markdown #### Hyperparameters
learning_rate = 2e-4 # @param {type:"number"}
num_epochs = 1 #@param {type:"number"}
batch_size = 1 # @param {type:"slider", min:1, max:32, step:1}
block_size = 1024 # @param {type:"number"}
trainer = "sft" # @param ["default", "sft"] {type:"raw"}
warmup_ratio = 0.1 # @param {type:"number"}
weight_decay = 0.01 # @param {type:"number"}
gradient_accumulation = 4 # @param {type:"number"}
use_fp16 = True # @param ["False", "True"] {type:"raw"}
use_peft = True # @param ["False", "True"] {type:"raw"}
use_int4 = True # @param ["False", "True"] {type:"raw"}
lora_r = 16 #@param {type:"number"}
lora_alpha = 32 #@param {type:"number"}
lora_dropout = 0.05 #@param {type:"number"}

# Publish the form values to the process environment so that shell commands
# started from this notebook can read them as ${NAME}.
# The string-typed fields are exported as entered.
os.environ.update(
    PROJECT_NAME=project_name,
    MODEL_NAME=model_name,
    HF_TOKEN=hf_token,
    REPO_ID=repo_id,
)
# Environment values must be strings, so booleans and numbers go through
# str() (True -> "True", 2e-4 -> "0.0002").
# `trainer` is a form field only; it is not exported.
os.environ.update({
    env_name: str(form_value)
    for env_name, form_value in (
        ("PUSH_TO_HUB", push_to_hub),
        ("LEARNING_RATE", learning_rate),
        ("NUM_EPOCHS", num_epochs),
        ("BATCH_SIZE", batch_size),
        ("BLOCK_SIZE", block_size),
        ("WARMUP_RATIO", warmup_ratio),
        ("WEIGHT_DECAY", weight_decay),
        ("GRADIENT_ACCUMULATION", gradient_accumulation),
        ("USE_FP16", use_fp16),
        ("USE_PEFT", use_peft),
        ("USE_INT4", use_int4),
        ("LORA_R", lora_r),
        ("LORA_ALPHA", lora_alpha),
        ("LORA_DROPOUT", lora_dropout),
    )
})


In [5]:
# Launch AutoTrain LLM training on data/ using the `text` column.
# Every ${NAME} below is read by the shell from the environment variables
# exported with os.environ in the configuration cell, so that cell must be run
# first. The four $( [[ ... ]] && echo ... ) lines add their flag(s) only when
# the matching variable is exactly "True"; otherwise they expand to nothing.
# NOTE(review): with PUSH_TO_HUB=True the Hugging Face token is passed as a
# command-line argument (--token) — confirm this exposure is acceptable.
!autotrain llm \
--train \
--model ${MODEL_NAME} \
--project-name ${PROJECT_NAME} \
--data-path data/ \
--text-column text \
--lr ${LEARNING_RATE} \
--batch-size ${BATCH_SIZE} \
--epochs ${NUM_EPOCHS} \
--block-size ${BLOCK_SIZE} \
--warmup-ratio ${WARMUP_RATIO} \
--lora-r ${LORA_R} \
--lora-alpha ${LORA_ALPHA} \
--lora-dropout ${LORA_DROPOUT} \
--weight-decay ${WEIGHT_DECAY} \
--gradient-accumulation ${GRADIENT_ACCUMULATION} \
$( [[ "$USE_FP16" == "True" ]] && echo "--fp16" ) \
$( [[ "$USE_PEFT" == "True" ]] && echo "--use-peft" ) \
$( [[ "$USE_INT4" == "True" ]] && echo "--use-int4" ) \
$( [[ "$PUSH_TO_HUB" == "True" ]] && echo "--push-to-hub --token ${HF_TOKEN} --repo-id ${REPO_ID}" )

    PyTorch 2.1.0+cu121 with CUDA 1201 (you have 2.1.0+cu118)
    Python  3.10.13 (you have 3.10.12)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
> [1mINFO    Running LLM[0m
> [1mINFO    Params: Namespace(version=False, train=True, deploy=False, inference=False, data_path='data/', train_split='train', valid_split=None, text_column='text', rejected_text_column='rejected', model='abhishek/llama-2-7b-hf-small-shards', learning_rate=0.0002, num_train_epochs=1, train_batch_size=1, warmup_ratio=0.1, gradient_accumulation_steps=4, optimizer='adamw_torch', scheduler='linear', weight_decay=0.01, max_grad_norm=1.0, seed=42, add_eos_token=False, block_size=1024, use_peft=True, lora_r=16, lora_alpha=32, lora_dropout=0.05, logging_steps=-1, project_name='my_autotrain_llm', evaluation_strategy='epoch', save_total_limit=1, sa

In [6]:
!zip -r /content/my_autotrain_llm.zip /content/my_autotrain_llm

  adding: content/my_autotrain_llm/ (stored 0%)
  adding: content/my_autotrain_llm/added_tokens.json (stored 0%)
  adding: content/my_autotrain_llm/checkpoint-2/ (stored 0%)
  adding: content/my_autotrain_llm/checkpoint-2/added_tokens.json (stored 0%)
  adding: content/my_autotrain_llm/checkpoint-2/adapter_model.bin (deflated 8%)
  adding: content/my_autotrain_llm/checkpoint-2/tokenizer.model (deflated 55%)
  adding: content/my_autotrain_llm/checkpoint-2/trainer_state.json (deflated 50%)
  adding: content/my_autotrain_llm/checkpoint-2/special_tokens_map.json (deflated 71%)
  adding: content/my_autotrain_llm/checkpoint-2/rng_state.pth (deflated 25%)
  adding: content/my_autotrain_llm/checkpoint-2/pytorch_model.bin (deflated 65%)
  adding: content/my_autotrain_llm/checkpoint-2/README.md (deflated 67%)
  adding: content/my_autotrain_llm/checkpoint-2/training_args.bin (deflated 50%)
  adding: content/my_autotrain_llm/checkpoint-2/tokenizer.json (deflated 74%)
  adding: content/my_autotrain