## Resources

[GPT illustration](https://jalammar.github.io/illustrated-gpt2/)

[Fine-tune causal LM and masked LM in Colab](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)

[Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling)



In [15]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import html
from datasets import load_dataset
from transformers import AutoTokenizer

## Work with dataset

[Reference](https://huggingface.co/learn/llm-course/chapter5/3)

If you've already loaded a pandas DataFrame, use this method:

```py
from datasets import Dataset, DatasetDict

full_dataset = Dataset.from_pandas(df)
dataset = full_dataset.train_test_split(test_size=0.2, seed=42)

# --------------------------------------------------------------
# Or
train_size = int(0.8 * len(df))
val_size = train_size + int(0.1 * len(df))
train_df = df[: train_size]
val_df = df[train_size: val_size]
test_df = df[val_size:]

# Create datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Create DatasetDict
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,  # the Dataset built from val_df
    "test": test_dataset
})
```

In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

In [17]:
df = pd.read_csv('/content/drugsComTest_raw.tsv', sep = '\t') # TSV files use a tab (\t) as the separator
# Render the first three rows as an HTML table
display(HTML(df.head(3).to_html()))

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over the years (citalopram, fluoxetine, amitriptyline), but none of those helped with my depression, insomnia &amp; anxiety. My doctor suggested and changed me onto 45mg mirtazapine and this medicine has saved my life. Thankfully I have had no side effects especially the most common - weight gain, I&#039;ve actually lost alot of weight. I still have suicidal thoughts but mirtazapine has saved me.""",10.0,"February 28, 2012",22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done very well on the Asacol. He has no complaints and shows no side effects. He has taken as many as nine tablets per day at one time. I&#039;ve been very happy with the results, reducing his bouts of diarrhea drastically.""",8.0,"May 17, 2009",17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3


### Feed dataset into the `load_dataset` function

In [None]:
# load_dataset is a function that takes one file per split, so the data should already be
# split into files before loading; further splitting can also be done later.
data_files = {"train": "/content/drugsComTrain_raw.tsv", "test": "/content/drugsComTest_raw.tsv"}
dataset = load_dataset("csv", data_files = data_files, delimiter = '\t')
# Rename the "Unnamed: 0" column to "id"
dataset = dataset.rename_column("Unnamed: 0", "id")

In [19]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [20]:
# Shuffle the training split with a fixed seed and keep the first 10 rows
sample = dataset["train"].shuffle(seed=42).select(range(10))
# Slice rows 1 and 2; a slice comes back as a dict of column name -> list of values
sample[1:3]

{'id': [178045, 80482],
 'drugName': ['Duloxetine', 'Mobic'],
 'condition': ['ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than an elevated blood pressure.  I had severe knee and ankle pain which completely went away after taking Mobic.  I attempted to stop the medication however pain returned after a few days."'],
 'rating': [3.0, 10.0],
 'date': ['November 7, 2011

### Normalize data

In [21]:
# We want to convert "condition" to lowercase.
# First we need to drop the rows whose condition is None.
dataset = dataset.filter(lambda row: row["condition"] is not None)

Filter:   0%|          | 0/161297 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53766 [00:00<?, ? examples/s]

In [None]:
# With batched=True each column arrives as a list of values, so .lower() cannot be
# called on row["condition"] directly. The non-batched version would be:
# dataset = dataset.map(lambda row: {"condition": row["condition"].lower()})

# Batched version: lowercase every item in the batch.
dataset = dataset.map(
    lambda row: {"condition": [item.lower() for item in row["condition"]]}, batched=True
)

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

In [None]:
# Add a "review_length" column holding the whitespace-separated word count of each review.
# The lambda handles one example at a time, so `batched` stays at its default; `batch_size`
# expects an int and only matters for batched mapping, so it is not passed here.
# To enable multiprocessing, use the `num_proc = 8`
#       as long as the function you are using is not already doing some kind of multiprocessing of its own.
dataset = dataset.map(lambda row: {"review_length": len(row["review"].split())})

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

In [None]:
dataset["train"][0]

{'id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [None]:
# Keep only reviews longer than 30 words (rows with review_length <= 30 are dropped)
dataset = dataset.filter(lambda row: row["review_length"] > 30)

Filter:   0%|          | 0/160398 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53471 [00:00<?, ? examples/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 138514
    })
    test: Dataset({
        features: ['id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [None]:
# Decode HTML character entities in the reviews (e.g. "&#039;" -> "'").
# html.unescape only converts entities; it does not strip HTML tags.
dataset = dataset.map(
    lambda row: {"review": [html.unescape(item) for item in row["review"]]}, batched = True
)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [None]:
# Checkpoint whose tokenizer is loaded; the alternatives are kept commented out.
# checkpoint = "bert-base-cased"
checkpoint = "distilgpt2"
# checkpoint = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# If the tokenizer has no padding token, reuse the end-of-sequence token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [77]:
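# With return_overflowing_tokens=True, a review longer than the maximum number of tokens
# we specified (128) is split into several rows, so the tokenized output has more rows
# than the input. The original columns no longer line up with it, so we need to remove them.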

def tokenize_and_split(examples):
    """Tokenize the "review" column of a batch, cutting long reviews at 128 tokens.

    Overflowing tokens are returned as well, no padding is applied, and the
    output is left as plain Python lists (no tensor conversion).
    """
    reviews = examples["review"]
    encoded = tokenizer(
        reviews,
        max_length = 128,
        truncation = True,
        return_overflowing_tokens = True,
        padding = False,
        return_tensors = None
    )
    return encoded

# Tokenize every split in batches. All original columns are dropped so that only the
# tokenizer's output columns remain.
tokenized_dataset = dataset.map(
    tokenize_and_split,
    batched = True,
    remove_columns = dataset['train'].column_names  # Remove old columns
)


# # Or making the old columns the same size as the new ones.
# # This way we can keep original columns too
# def tokenize_and_split(examples):
#     result = tokenizer(
#         examples["review"],
#         truncation = True,
#         max_length = 128,
#         return_overflowing_tokens = True,
#     )
#     # Extract mapping between new and old indices
#     sample_map = result.pop("overflow_to_sample_mapping")
#     for key, values in examples.items():
#         result[key] = [values[i] for i in sample_map]
#     return result


# tokenized_dataset = dataset.map(
#     tokenize_and_split, batched = True #, remove_columns = dataset["train"].column_names
# )

# # Remove unnecessary columns
# columns_to_remove = ['id', 'drugName', 'condition', 'date', 'usefulCount', 'review_length', 'review']
# tokenized_dataset = tokenized_dataset.remove_columns(columns_to_remove)
# tokenized_dataset

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [78]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 200764
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 66906
    })
})

#### Change dataset format

The `Dataset.set_format()` function only changes the output format of the dataset, so you can easily switch to another format without affecting the underlying data format, which is `Apache Arrow`.

```py
dataset.set_format("pandas")

# revert back
dataset.reset_format()
```

In [79]:
# Create a validation split: keep 80% of the tokenized training data for training (fixed seed)
dataset_clean = tokenized_dataset["train"].train_test_split(train_size=0.8, seed=42)

# Rename the default "test" split produced by train_test_split to "validation"
dataset_clean["validation"] = dataset_clean.pop("test")

# Add the original "test" set to our `DatasetDict`
dataset_clean["test"] = tokenized_dataset["test"]
dataset_clean

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 160611
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 40153
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 66906
    })
})

### Save file

Data format | Function
------------|---------
Arrow | Dataset.save_to_disk()
CSV | Dataset.to_csv()
JSON | Dataset.to_json()

### Load

```py
from datasets import load_from_disk

# Reload a dataset that was saved in Arrow format with Dataset.save_to_disk("data")
dataset_reloaded = load_from_disk("data")
```

## LoRA

LoRA can be applied to any subset of weight matrices in a neural network to reduce the number of trainable parameters.

However, in Transformer models LoRA is typically applied to **attention blocks only**.

<br>

> [LoRA](https://huggingface.co/docs/peft/main/en/conceptual_guides/lora) <br>
[conceptual guide](https://huggingface.co/docs/peft/main/en/conceptual_guides/adapter) <br>
[QLoRA](https://huggingface.co/docs/peft/en/developer_guides/quantization)

<br>

- `merge_adapter()` to merge the LoRA layers into the base model.
- `unmerge_adapter()` to unmerge the LoRA layers from the base model.
- `unload()` to get back the base model without merging the active LoRA modules.
- `delete_adapter()` to delete an existing adapter.
- `add_weighted_adapter()` to combine multiple LoRAs into a new adapter based on the user-provided weighting scheme.

### Workflow:

1. Instantiate a base model.
2. Create a configuration (`LoraConfig`) where you define LoRA-specific parameters.
3. Wrap the base model with `get_peft_model()` to get a trainable PeftModel.
4. Train the PeftModel as you normally would train the base model.

### `LoraConfig`


- $r$: the rank of the update matrices
- `lora_alpha`: LoRA scaling factor.
- `bias`: Specifies if the bias parameters should be trained. Can be `'none'`, `'all'` or `'lora_only'`.
- `use_rslora`: When set to True, uses Rank-Stabilized LoRA, which sets the adapter scaling factor to $\frac{\text{lora\_alpha}}{\sqrt{r}}$. Otherwise, the scaling factor is $\frac{\text{lora\_alpha}}{r}$.
- `modules_to_save`: List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. These typically include the model's custom head.
- `layers_to_transform`: List of layers to be transformed by LoRA. If not specified, all layers in target_modules are transformed.
- `target_modules`

> **Query**: Controls what parts of the input the model focuses on.<br>
**Value**: Controls how the attended information is transformed

Check available modules:
```py
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("your-model-name")
# Attention projections are named differently across architectures, e.g.
# "query"/"key"/"value", "q_proj"/"k_proj"/"v_proj", or the GPT-2 style
# "c_attn"/"c_proj" (the names targeted by the LoraConfig in this notebook).
attention_name_hints = (
    "query", "key", "value",
    "q_proj", "k_proj", "v_proj",
    "c_attn", "c_proj",
)
for name, module in model.named_modules():
    # Print every module whose qualified name contains one of the hints
    if any(hint in name for hint in attention_name_hints):
        print(name)
```

<br>

`init_lora_weights`:
- default Kaiming-uniform for weight A and initializing weight B as zeros.
- `init_lora_weights="gaussian"`: initializes weight A with a Gaussian distribution.
- When quantizing the base model, e.g. for `QLoRA` training, consider using the `LoftQ` initialization:

```py
# LoftQConfig accepts only these two options; any other keyword raises a TypeError.
loftq_config = LoftQConfig(
    loftq_bits=4,   # Quantization bit-width
    loftq_iter=1,   # Number of alternating optimization iterations
)
```

In [None]:
# !pip install bitsandbytes

In [None]:
from peft import LoftQConfig, LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, BitsAndBytesConfig
)


# Checkpoint of the base causal LM that gets wrapped with LoRA below
checkpoint = "distilgpt2"
# checkpoint = "openai-community/gpt2"
# Optional 8-bit loading (disabled):
# bnb_config = BitsAndBytesConfig(load_in_8bit=True)
# base_model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=bnb_config)
base_model = AutoModelForCausalLM.from_pretrained(checkpoint)  # don't quantize here

In [95]:
# Check model modules
# for name, module in base_model.named_modules():
#     print(name, module)
#     print()

In [None]:
# loftq_config = LoftQConfig(loftq_bits=4)           # set 4bit quantization
lora_config = LoraConfig(
    r = 16,                 # rank of the update matrices
    lora_alpha = 16,        # LoRA scaling factor
    target_modules = ["c_attn", 'c_proj'],
    # GPT-2 style layers are Conv1D and store weights as (fan_in, fan_out);
    # setting this explicitly avoids PEFT's warning-and-override at wrap time.
    fan_in_fan_out = True,
    lora_dropout = 0.1,
    bias="none",            # do not train bias parameters
    task_type="CAUSAL_LM",
    # init_lora_weights = "loftq",
    # loftq_config=loftq_config
)

# Wrap the base model so that only the LoRA parameters are trainable
peft_model = get_peft_model(base_model, lora_config)
peft_model.print_trainable_parameters()

trainable params: 811,008 || all params: 82,723,584 || trainable%: 0.9804


In [97]:
# import torch
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# peft_model.to(device).device

In [98]:
# # Check model expectation
# print("Model forward signature:")
# print(base_model.forward.__doc__)

In [99]:
# Use the language-modeling data collator to build training batches
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm = False  # Not masked language modeling (i.e. causal LM)
)

In [None]:
import torch

batch_size = 16

# Training configuration for the LoRA fine-tuning run.
args = TrainingArguments(
    output_dir = "./my_model",
    remove_unused_columns = True,
    # Evaluate and save once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 5e-3,
    weight_decay = 0.01,
    per_device_train_batch_size = batch_size,
    gradient_accumulation_steps = 4,    # 16 * 4 = 64 examples per optimizer step per device
    per_device_eval_batch_size = batch_size,
    # fp16 mixed precision needs a CUDA device; fall back to full precision otherwise
    fp16 = torch.cuda.is_available(),
    num_train_epochs = 5,
    logging_steps = 10,
    load_best_model_at_end = True,
)

In [None]:
# Train the LoRA-wrapped model on the train split, evaluating on the validation split
trainer = Trainer(
    model = peft_model,
    args = args,
    train_dataset = dataset_clean['train'],
    eval_dataset = dataset_clean['validation'],
    processing_class = tokenizer,
    data_collator = data_collator,  # Add data collator for LM
)

trainer.train()

In [None]:
import math

# Run evaluation and report the loss together with its perplexity (exp of the loss)
results = trainer.evaluate()
eval_loss = results["eval_loss"]
perplexity = math.exp(eval_loss)
print(f"Eval loss: {eval_loss:.3f}")
print(f"Perplexity: {perplexity:.2f}")

In [None]:
# # Use a pipeline as a high-level helper
# from transformers import pipeline

# pipe = pipeline("text-generation", model = peft_model)

In [None]:
from datasets import load_dataset

# Load the first 5000 training examples and hold out 20% of them as a test split
eli5 = load_dataset("dany0407/eli5_category", split="train[:5000]")
# Fixed seed so the split is reproducible, like the other splits in this notebook
eli5 = eli5.train_test_split(test_size=0.2, seed=42)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the distilgpt2 checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")

In [None]:
# Flatten nested columns into top-level ones; the preprocessing step reads the
# resulting "answers.text" column (presumably nested under "answers" before this call).
eli5 = eli5.flatten()
eli5

In [4]:
def preprocess_function(examples):
    """Join each example's answer texts with single spaces and tokenize the batch."""
    joined_answers = []
    for answer_texts in examples["answers.text"]:
        joined_answers.append(" ".join(answer_texts))
    return tokenizer(joined_answers)

In [None]:
# Tokenize both splits in batches using 4 worker processes; all original columns are
# dropped so that only the tokenizer output remains.
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

In [6]:
from itertools import chain

block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized texts and re-cut it into fixed-size blocks.

    Args:
        examples: Mapping of column name -> list of per-example lists. It must
            contain an "input_ids" column, and all columns are expected to have
            the same total length.

    Returns:
        A dict with the same columns, each holding consecutive chunks of
        ``block_size`` items, plus a "labels" column that is a shallow copy of
        "input_ids". The trailing remainder is dropped; if the whole batch is
        shorter than ``block_size`` it is kept as one shorter chunk.
    """
    # Concatenate all texts. chain.from_iterable is linear in the total length,
    # whereas sum(lists, []) re-copies the accumulated list at every step.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # For causal LM the labels are the inputs themselves
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Regroup the tokenized texts into fixed-size blocks, batch by batch, using 4 processes
lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

In [8]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: not masked language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [9]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# Load the pretrained distilgpt2 causal language model
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
lm_dataset['train'][0]

In [None]:
# Training configuration: evaluate once per epoch
training_args = TrainingArguments(
    output_dir="my_awesome_eli5_clm-model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    # push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    # `tokenizer=` is deprecated in favour of `processing_class=`, which the
    # other Trainer in this notebook already uses
    processing_class=tokenizer,
)

trainer.train()