## Imports and installs

In [1]:
!pip install datasets evaluate tqdm -q
!pip install -U accelerate --quiet

[0m

In [2]:
import pandas as pd
import numpy as np
from transformers import (AutoTokenizer,
                          DataCollatorWithPadding, 
                          BloomTokenizerFast,
                          BloomForTokenClassification,
                          BloomForSequenceClassification,
                          DataCollatorForTokenClassification, 
                          AutoModelForTokenClassification,  
                          BloomForCausalLM,
                          TrainingArguments, Trainer,
                         pipeline)
from datasets import load_dataset, Dataset, concatenate_datasets
import torch
import os
import evaluate
import random
from tqdm import tqdm
import difflib

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
os.environ["WANDB_API_KEY"] = "174605229344dbcc2c90f595394111e3396b2b8b"

## Model : BLOOM-560m

In [4]:
model_path = "bigscience/bloomz-560m"
tokenizer = BloomTokenizerFast.from_pretrained(model_path)

Downloading (…)okenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

## Fine-tune the model
Since this kind of behavior isn't acceptable, let's fine-tune the model on the CrowS-Pairs dataset

### Preprocess the dataset

In [5]:
crows_pairs = pd.read_csv("/kaggle/input/a-dataset-for-measuring-social-biases-in-mlms/crows_pairs_anonymized.csv")

In [6]:
dataset = pd.DataFrame()

dataset['text'] = np.where(crows_pairs['stereo_antistereo'] == 'stereo', crows_pairs['sent_more'], crows_pairs['sent_less'])
dataset['answer'] = np.where(crows_pairs['stereo_antistereo'] == 'stereo', crows_pairs['sent_less'], crows_pairs['sent_more'])

dataset = Dataset.from_pandas(dataset)
dataset

Dataset({
    features: ['text', 'answer'],
    num_rows: 1508
})

### Get to work

In [7]:
dataset = dataset.shuffle(seed=2023)
dataset = dataset.train_test_split(test_size=0.05)

In [8]:
# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["text"], truncation=True), 
                                                                                 batched=True, remove_columns=["text", "answer"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["answer"], truncation=True), 
                                                                                  batched=True, remove_columns=["text", "answer"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

  0%|          | 0/2 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Max source length: 47


  0%|          | 0/2 [00:00<?, ?ba/s]

Max target length: 46


In [9]:
def get_word(words1, words2):
    # Find the differences between the two sentences
    diff = difflib.ndiff(words1, words2)

    # Initialize variables to store the results
    different_word1 = ""
    different_word2 = ""
    modified_sentence = ""

    # Iterate over the differences
    for item in diff:
        if item.startswith('- '):
            # Word present in sentence1 but not in sentence2
            if different_word1 != "":
                different_word1 = different_word1 + " " + item[2:]
            else:
                different_word1 = item[2:]
                modified_sentence += " [BLANK]"
        elif item.startswith('+ '):
            # Word present in sentence2 but not in sentence1
            if different_word2 != "":
                different_word2 = different_word2 + " " + item[2:]
            else:
                different_word2 = item[2:]
            modified_sentence += ""
        elif item.startswith(' '):
            # Word present in both sentences
            modified_sentence += " " + item[2:]
    
    return modified_sentence.strip(), different_word2.strip()
                

def preprocess_function(examples, padding="max_length"):
    template_start = "Fill in the following sentence. "
    inputs = []
    labels = []
    
    for text_item, answer_item in zip(examples["text"], examples["answer"]):
        input_sentence, diff_word = get_word(text_item.split(), answer_item.split())
        inputs.append(template_start + input_sentence)
        labels.append(diff_word)
        
    model_inputs = tokenizer(inputs, max_length=50, padding=padding, truncation=True)
    
    labels = tokenizer(labels, max_length=50, padding=padding, truncation=True)
    
    if padding == "max_length":
        labels["input_ids"] = [[(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]]
    
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    bleu = evaluate.load("bleu")
    return bleu.compute(predictions=predictions, references=labels)

In [10]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_dataset = dataset.map(preprocess_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
model = BloomForCausalLM.from_pretrained(model_path)

Downloading (…)lve/main/config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [12]:
training_args = TrainingArguments(
    output_dir="OutModelPolicy",
    learning_rate= 3e-05,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20, 
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    save_steps= 50000,
    eval_steps= 50000,
    fp16 = True,
    save_total_limit = 2, 
    push_to_hub=False,
)
# crashes with 35 batch size

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [14]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmartinblanckaert[0m ([33mteam_bias[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.15.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20230619_130737-ypzm2dsj[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mvaliant-resonance-27[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/team_bias/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/team_bias/huggingface/runs/ypzm2dsj[0m
You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=180, training_loss=3.047684054904514, metrics={'train_runtime': 368.9069, 'train_samples_per_second': 19.409, 'train_steps_per_second': 0.488, 'total_flos': 649369337856000.0, 'train_loss': 3.047684054904514, 'epoch': 5.0})

In [15]:
trainer.save_model("politeBLOOM")

In [16]:
import shutil
shutil.make_archive('finetuned_bloomz', 'zip', '/kaggle/working/')

'/kaggle/working/finetuned_bloomz.zip'