In [None]:
!pip install --upgrade transformers accelerate datasets -q



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount Google Drive into the Colab filesystem; the data and model paths used
# later in this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Preparing the training dataset

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split


# Maintenance logs paired with reference summaries, read from the mounted Drive.
df = pd.read_csv("/content/drive/MyDrive/LLM_RAG_Learning/project01/data/maintenance_logs.csv")

# Fail early, with a readable message, if the CSV lacks the columns that the
# tokenization and inference cells rely on.
required_columns = {"log_text", "summary_text"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(
        f"maintenance_logs.csv is missing required columns: {sorted(missing_columns)}"
    )

# 80/20 train/validation split; the fixed random_state keeps it reproducible.
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

# preserve_index=False: the shuffled pandas index would otherwise be stored in
# each Dataset as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)

dataset = DatasetDict({
    "train": train_dataset,
    "validation": eval_dataset
})

dataset


DatasetDict({
    train: Dataset({
        features: ['log_text', 'summary_text', '__index_level_0__'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['log_text', 'summary_text', '__index_level_0__'],
        num_rows: 50
    })
})

## Tokenization

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint name passed to both from_pretrained calls below, so the tokenizer
# and the model come from the same source.
model_name = "google/flan-t5-small"

# `tokenizer` and `model` are module-level names reused by the preprocessing,
# training, inference and saving cells.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Token budgets: inputs and targets longer than these are truncated.
max_input_length = 256
max_target_length = 64

def preprocess_function(examples):
    """Tokenize a batch of logs and their reference summaries.

    Args:
        examples: Batch mapping with "log_text" and "summary_text" entries,
            each a list of strings (as supplied by `Dataset.map(batched=True)`).

    Returns:
        The tokenizer output for the "summarize: "-prefixed logs, with the
        token ids of the summaries stored under the "labels" key.
    """
    inputs = ["summarize: " + text for text in examples["log_text"]]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding the decoder-side text.
    labels = tokenizer(
        text_target=examples["summary_text"],
        max_length=max_target_length,
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocess_function to every split of the DatasetDict; batched=True
# means the function receives lists of examples rather than single rows.
tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_datasets


Map:   0%|          | 0/200 [00:00<?, ? examples/s]



Map:   0%|          | 0/50 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['log_text', 'summary_text', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['log_text', 'summary_text', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 50
    })
})

In [None]:
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

# Tokenization applied truncation but no padding, so batching the
# variable-length examples is left to this collator.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = TrainingArguments(
    # NOTE: relative path, resolved against the kernel's working directory --
    # not the Drive folder used for the data and the final saved model.
    output_dir="../models/flan_t5_maintenance",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    logging_steps=10,
    push_to_hub=False,
    # Disable external experiment trackers so training does not stop at an
    # interactive wandb API-key prompt and the notebook can run unattended.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `processing_class` is the replacement for the deprecated `tokenizer`
    # argument of Trainer.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=None,  # optional: no task metrics, only the losses are reported
)


trainer.train()


  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmanisha-jaiswal026[0m ([33mmanisha-jaiswal026-mercedes-benz-research-development-india[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss
1,3.2434,2.569455
2,2.8369,1.995499
3,2.2442,1.711153
4,2.0488,1.556589
5,2.1575,1.514767




TrainOutput(global_step=250, training_loss=2.639717445373535, metrics={'train_runtime': 4667.7039, 'train_samples_per_second': 0.214, 'train_steps_per_second': 0.054, 'total_flos': 12655077015552.0, 'train_loss': 2.639717445373535, 'epoch': 5.0})

In [None]:
def summarize_log(text, max_new_tokens=50):
    """Generate a summary of one maintenance log with the module-level model.

    Args:
        text: Raw log text; the "summarize: " prefix used during training is
            added here.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded summary string with special tokens removed.
    """
    input_text = "summarize: " + text
    # Truncate to the same input budget as preprocessing, and move the encoded
    # batch to the model's device so generation also works when the model
    # lives on a GPU.
    inputs = tokenizer(
        [input_text],
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True
    ).to(model.device)

    # Make sure training-only behaviour such as dropout is switched off.
    model.eval()
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=4
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke test on the first row of the full frame. That frame was split into
# train/validation earlier, so this row may be one the model was trained on.
sample = df.iloc[0]["log_text"]
print("LOG:", sample, sep="\n")
print("\nMODEL SUMMARY:", summarize_log(sample), sep="\n")


LOG:
[Shift 1] Shift C: Compressor 1 discharge temperature increased steadily to 140°C. Vibration baseline shifted upward. Oil level slightly below minimum mark.

MODEL SUMMARY:
Compressor 1 discharge shifted to 140°C. Vibration shifted upward. Oil level below minimum mark.


In [None]:
# Write the model and the tokenizer files to the same Drive folder.
save_dir = "/content/drive/MyDrive/LLM_RAG_Learning/project01/models/flan_t5_maintenance"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('/content/drive/MyDrive/LLM_RAG_Learning/project01/models/flan_t5_maintenance/tokenizer_config.json',
 '/content/drive/MyDrive/LLM_RAG_Learning/project01/models/flan_t5_maintenance/special_tokens_map.json',
 '/content/drive/MyDrive/LLM_RAG_Learning/project01/models/flan_t5_maintenance/spiece.model',
 '/content/drive/MyDrive/LLM_RAG_Learning/project01/models/flan_t5_maintenance/added_tokens.json',
 '/content/drive/MyDrive/LLM_RAG_Learning/project01/models/flan_t5_maintenance/tokenizer.json')