In [None]:
!pip install --upgrade transformers accelerate datasets -q



In [None]:
from google.colab import drive
# Mount Google Drive so the /content/drive/MyDrive/... paths used below for
# the input CSV and the saved model are available in this session.
drive.mount('/content/drive')

## Preparing the training dataset

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split


# Load the maintenance logs. Later cells read the "log_text" column as the
# model input and "summary_text" as the target summary.
# Rows missing either field are dropped up front: a NaN is a float, so the
# '"summarize: " + text' concatenation during preprocessing would raise on it.
df = pd.read_csv(
    "/content/drive/MyDrive/LLM_RAG_Learning/project01/data/maintenance_logs.csv"
).dropna(subset=["log_text", "summary_text"])


# A fixed random_state keeps the 80/20 train/validation split reproducible.
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

# The split frames keep their original (shuffled, non-contiguous) row labels.
# preserve_index=False stops that index from being stored in the datasets as
# an extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)

dataset = DatasetDict({
    "train": train_dataset,
    "validation": eval_dataset
})

dataset


## Tokenization

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint identifier shared by the tokenizer and the seq2seq model.
model_name = "google/flan-t5-small"

# Both objects are loaded from the same checkpoint name; later cells use the
# module-level `tokenizer` and `model` directly.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [None]:
# Token budgets: prefixed source logs are truncated to max_input_length
# tokens and target summaries to max_target_length tokens.
max_input_length = 256
max_target_length = 64

def preprocess_function(examples):
    """Tokenize a batch of logs and their reference summaries.

    Args:
        examples: Batch mapping with "log_text" and "summary_text" entries,
            each a list of strings (the form ``Dataset.map`` passes when
            called with ``batched=True``).

    Returns:
        The tokenizer output for the "summarize: "-prefixed logs, with an
        added "labels" entry holding the token ids of the summaries.
    """
    inputs = ["summarize: " + text for text in examples["log_text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` is the supported way to tokenize targets; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["summary_text"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocess_function to every split; with batched=True the function
# receives a mapping of column name -> list of values rather than single rows.
tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_datasets


In [None]:
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

training_args = TrainingArguments(
    # Where checkpoints go and how many are kept
    output_dir="../models/flan_t5_maintenance",
    save_total_limit=2,
    push_to_hub=False,
    # Optimisation settings
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluation and logging cadence
    eval_strategy="epoch",
    logging_steps=10,
)

# `processing_class` is the current name for what used to be passed as
# `tokenizer=`, which is deprecated on Trainer. The install cell upgrades
# transformers to the latest release, so the current argument name is used.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=None,  # optional
)


trainer.train()


In [None]:
def summarize_log(text, max_new_tokens=50):
    """Generate a summary for one maintenance log with the notebook's model.

    Args:
        text: Raw log text. The "summarize: " prefix used during
            preprocessing is added here.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded summary string with special tokens removed.
    """
    input_text = "summarize: " + text
    inputs = tokenizer([input_text], return_tensors="pt", truncation=True)
    # The tokenizer returns CPU tensors, but training may have moved the model
    # to an accelerator; generate() needs inputs and weights on one device.
    inputs = inputs.to(model.device)

    # Make sure dropout is disabled so a given log always yields the same summary.
    model.eval()
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=4
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Quick qualitative check on the first row of the full frame. That row may
# have been part of the training split, so this is not a held-out example.
sample = df["log_text"].iloc[0]
print("LOG:", sample, sep="\n")
print("\nMODEL SUMMARY:")
print(summarize_log(sample))


In [None]:
# Write the model and the tokenizer into the same Drive directory.
save_dir = "/content/drive/MyDrive/LLM_RAG_Learning/project01/models/flan_t5_maintenance"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
