In [None]:
!pip install --force-reinstall pydantic==1.10.6

In [None]:
import getpass
import os

import pandas as pd
import transformers as tr
from datasets import load_dataset


In [None]:
# Never store a real token in the notebook: reuse the value already present in
# the environment and only prompt (without echoing) when it is missing.
# NOTE(review): nothing visible in this notebook reads this variable, and
# huggingface_hub documents `HF_TOKEN` as its token variable — confirm the name.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass(
        "Hugging Face Hub API token: "
    )

In [None]:
# Display settings: no cap on shown columns / rows / sequence items, cell text
# cut at 500 characters, and `expand_frame_repr` switched on.
for _option, _setting in (
    ('display.max_column', None),
    ('display.max_rows', None),
    ('display.max_seq_items', None),
    ('display.max_colwidth', 500),
    ('expand_frame_repr', True),
):
    pd.set_option(_option, _setting)

In [None]:
import tempfile
# Scratch directory for this session. Keep `tmpdir` referenced: the
# TemporaryDirectory object owns the directory and removes it on cleanup.
tmpdir = tempfile.TemporaryDirectory()
# Path string of that directory; later cells build the checkpoint location from it.
local_training_root = tmpdir.name


In [None]:
# Address and port exported through the process environment.
# NOTE(review): presumably the rendezvous endpoint read by the distributed
# backend during DeepSpeed training — confirm.
os.environ.update(
    {
        "MASTER_ADDR": "localhost",
        "MASTER_PORT": "9994",
    }
)


In [None]:
# The first argument of `load_dataset` is a builder name, Hub repository or
# directory, not a data file. A local JSON Lines file is read through the
# "json" builder via `data_files`; a single file given this way is exposed
# as the "train" split of the returned DatasetDict.
imdb_ds = load_dataset("json", data_files="data.jsonl")

In [None]:
# Checkpoint identifier passed to `AutoTokenizer.from_pretrained` in the next cell.
model_name = "google/gemma-7b"

In [None]:
# `cache_dir` must be a filesystem path. `tmpdir` is the TemporaryDirectory
# object itself, so hand over its path string instead.
tokenizer = tr.AutoTokenizer.from_pretrained(model_name, cache_dir=tmpdir.name)

In [None]:
import torch

def to_tokens(tokenizer, label_map):
    """Build a batched ``Dataset.map`` function that tokenizes text and labels.

    Args:
        tokenizer: Callable tokenizer. It is invoked with the batch's texts plus
            ``text_target=<label strings>``, ``truncation=True`` and
            ``padding=True``.
        label_map: Mapping from each raw value found in the ``"label"`` column
            to the target string that is tokenized as the label.

    Returns:
        A function that takes a batch ``x`` (a mapping with ``"text"`` and
        ``"label"`` columns) and returns the tokenizer's output for it.

    Raises:
        KeyError: Raised by the returned function when a batch contains a label
            that is missing from ``label_map``.
    """
    def apply(x):
        target_labels = [label_map[y] for y in x["label"]]
        # No `return_tensors`: the tokenizer then yields plain Python lists,
        # which is what a batched `map` function must return, so the
        # tensor -> list round trip is unnecessary.
        return tokenizer(
            x["text"],
            text_target=target_labels,
            truncation=True,
            padding=True,
        )
    return apply

# Raw label value -> target string that is tokenized as the label.
# NOTE(review): assumes IMDb-style integer labels (0 / 1, -1 for unlabeled) —
# confirm against the actual values of the "label" column in the loaded data.
imdb_label_lookup = {0: "negative", 1: "positive", -1: "unknown"}

# Create function to convert IMDb dataset to tokens
# (pass the tokenizer itself, not the dataset, together with the label mapping)
imdb_to_tokens = to_tokens(tokenizer=tokenizer, label_map=imdb_label_lookup)

# Tokenize the IMDb dataset
tokenized_dataset = imdb_ds.map(
    imdb_to_tokens,
    batched=True,  # the function receives and returns whole columns as lists
    # Drop the raw columns once tokenized: a leftover "label" column would be
    # picked up by the padding collator in place of the tokenized "labels".
    remove_columns=["text", "label"],
)

def test_tokenized_dataset(tokenized_dataset, num_samples=1):
    # Print the first few samples from the tokenized dataset
    for i in range(num_samples):
        sample = tokenized_dataset[i]
        print(f"Sample {i + 1}:")
        print("Input IDs:", sample["ids"])
        print("Labels:", sample["labels"])
        print("=" * 10)

# Test the tokenized dataset
# `tokenized_dataset` is keyed by split name ("labels" is a column, not a
# split), so inspect rows of the "train" split.
test_tokenized_dataset(tokenized_dataset["train"])

In [None]:
# DeepSpeed ZeRO settings; handed to `TrainingArguments(deepspeed=...)` below.
# NOTE(review): the "auto" entries are placeholders, presumably resolved from
# the training arguments by the Hugging Face DeepSpeed integration — confirm.
zero_config = dict(
    # Stage 2, with the optimizer offloaded to the CPU using pinned memory.
    zero_optimization=dict(
        stage=2,
        offload_optimizer=dict(device="cpu", pin_memory=True),
        allgather_partitions=True,
        allgather_bucket_size=5e8,
        overlap_comm=True,
        reduce_scatter=True,
        reduce_bucket_size=5e8,
        contiguous_gradients=True,
    ),
    optimizer=dict(
        type="AdamW",
        params=dict(
            lr="auto",
            betas="auto",
            eps="auto",
            weight_decay="auto",
            torch_adam=True,
        ),
    ),
    scheduler=dict(
        type="WarmupLR",
        params=dict(
            warmup_min_lr="auto",
            warmup_max_lr="auto",
            warmup_num_steps="auto",
        ),
    ),
    train_batch_size="auto",
    train_micro_batch_size_per_gpu="auto",
)

# Specify the model checkpoint to use
# NOTE(review): "base" does not look like a Hub model id — presumably a local
# directory holding a sequence-to-sequence checkpoint; confirm.
model_checkpoint = "base"

# `cache_dir` must be a filesystem path; `tempfile` is the module object, so
# use the path of the temporary directory created earlier instead.
# NOTE(review): this rebinds `tokenizer`, which an earlier cell loaded from
# `model_name` — confirm both checkpoints share the same vocabulary.
tokenizer = tr.AutoTokenizer.from_pretrained(
    model_checkpoint, cache_dir=local_training_root
)

# Load the model for sequence-to-sequence learning
model = tr.AutoModelForSeq2SeqLM.from_pretrained(
    model_checkpoint, cache_dir=local_training_root
)


In [None]:
# Checkpoints for this run go to <local_training_root>/<checkpoint_name>
checkpoint_name = "test"
checkpoint_location = os.path.join(local_training_root, checkpoint_name)

# Training hyper-parameters; DeepSpeed is configured from `zero_config`
training_args = tr.TrainingArguments(
    output_dir=checkpoint_location,
    deepspeed=zero_config,
    per_device_train_batch_size=16,
    num_train_epochs=120,
)


In [None]:
# Define data collator
data_collator = tr.DataCollatorWithPadding(tokenizer=tokenizer)

# Create the trainer. `transformers` exposes this class as `Trainer` (there is
# no `tr.compute`); the variable keeps the name `compute` because the
# following cells call `compute.train()` and `compute.save_model(...)`.
compute = tr.Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    # Evaluation data is optional: use the "test" split when the dataset has
    # one, otherwise pass None instead of failing with a KeyError.
    eval_dataset=tokenized_dataset.get("test"),
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [None]:
# Train the model
compute.train()

# Persist the fine-tuned model to a directory
# `tempfile` is the module, so formatting it into the path produced the module's
# repr. Build the path from the temporary training root instead (this is the
# same location as `checkpoint_location`).
final_model_path = os.path.join(local_training_root, checkpoint_name)
compute.save_model(output_dir=final_model_path)


In [None]:
# Load the fine-tuned model
fine_tuned_model = tr.AutoModelForSeq2SeqLM.from_pretrained(final_model_path)

# Text to run through the model. `generate` has no `system_prompt` argument, so
# the text has to be tokenized and passed in as `input_ids`.
# NOTE(review): the model is trained on the rows' "text" column — presumably the
# text to analyse should be supplied here as well; confirm the intended input.
system_prompt = "Analyze the Sentimental analysis and score it. Also, perfrom the Writing Style Evaluation."

# Generate predictions
inputs = tokenizer(
    system_prompt,
    return_tensors="pt",
    truncation=True,
    padding=True,
)

prediction = fine_tuned_model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
)

# Prints the generated token ids as returned by `generate`.
print(prediction)
