In [None]:
pip install datasets pandas torch python-dotenv peft

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the environment,
# then look up the Hugging Face token; hf_token is None when HF_TOKEN is unset.
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

# Authenticate only when a token was actually found. login(token=None) falls
# back to an interactive prompt, which blocks "Restart & Run All" and headless
# runs. A token is only required for gated or private repositories.
if hf_token:
    login(token=hf_token)

model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The GPT-2 tokenizer has no pad token; reuse EOS so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
model = AutoModelForCausalLM.from_pretrained(model_name)


In [None]:
# Baseline sample from the model before fine-tuning.
text = "ஒரு நாள்"
# Keep the encoded prompt on the same device as the model.
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,  # forwards input_ids together with the attention_mask
    max_length=100,
    # Explicit pad id avoids the "Setting pad_token_id to eos_token_id" warning.
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
from datasets import load_dataset

# Only the first 1000 rows of the train split are used.
raw_data = load_dataset(
    "tniranjan/aitamilnadu_tamil_stories_no_instruct",
    split="train[:1000]",
)
# train_test_split shuffles before splitting; pin the seed so the 95/5 split
# (and therefore the eval set) is identical after every kernel restart.
data = raw_data.train_test_split(train_size=0.95, seed=42)
data

In [None]:
# The pad token is not needed for tokenization below, but whatever pads the
# batches later does need it, so keep it defined here as well.
tokenizer.pad_token = tokenizer.eos_token


def preprocess_batch(batch):
    """Tokenize the "text" column of a batch, truncating to 512 tokens.

    Sequences are intentionally left unpadded: padding inside ``map`` would
    pad each group of 4 rows to that group's longest row and store the pad
    tokens in the dataset. Padding is better done per training batch by the
    data collator, which pads only to the longest sequence in that batch.

    Args:
        batch: mapping with a "text" key holding a list of strings.

    Returns:
        The tokenizer output for the batch (variable-length sequences).
    """
    return tokenizer(batch["text"], truncation=True, max_length=512)


tokenized_data = data.map(
    preprocess_batch,
    batched=True,
    batch_size=4,
    # Drop the original columns so only tokenizer outputs remain.
    remove_columns=data["train"].column_names,
)

print(tokenized_data)

In [None]:
from transformers import DataCollatorForLanguageModeling

# mlm=False disables masked-language-model masking, i.e. causal-LM batches.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)
data_collator

In [None]:
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    # GPT-2 style models fuse q/k/v into a single "c_attn" projection; naming
    # it explicitly documents which weights receive LoRA adapters.
    target_modules=["c_attn"],
    # GPT-2 implements that projection as Conv1D, which stores its weight as
    # (fan_in, fan_out). Declaring it here avoids PEFT's runtime warning and
    # silent override of this flag.
    fan_in_fan_out=True,
)

# NOTE: this rebinds `model`; re-running the cell would wrap an already
# wrapped model, so reload the base model first if you need to re-run it.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
import os

from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import get_last_checkpoint

OUTPUT_DIR = "./output"

# Hyper-parameters live in one place. Previously a hand-built AdamW(lr=1e-4)
# was passed to the Trainer, which silently overrode learning_rate=1e-5 and
# weight_decay=0.04 from TrainingArguments. The Trainer now builds its own
# optimizer from these arguments; the learning rate is the 1e-4 that was
# effectively in use before.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    save_steps=500,
    learning_rate=1e-4,
    weight_decay=0.04,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_steps=50,
    logging_dir="./logs",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    data_collator=data_collator,
    # NOTE(review): newer transformers releases rename this argument to
    # `processing_class` — confirm against the installed version.
    tokenizer=tokenizer,
)

# TrainingArguments.resume_from_checkpoint is not consumed by Trainer, so
# resuming has to be requested on train() itself. Pick up the newest
# checkpoint in OUTPUT_DIR when one exists; otherwise start from scratch
# (passing True with no checkpoint present would raise).
last_checkpoint = get_last_checkpoint(OUTPUT_DIR) if os.path.isdir(OUTPUT_DIR) else None
trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Write the trained model and its tokenizer into the same directory so the
# pair can be reloaded together.
export_dir = "gpt2-tamil-lora"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

In [None]:
# Rebind `model` to a fresh instance loaded from the saved directory.
# NOTE(review): the directory was written from a PEFT-wrapped model, so this
# relies on transformers resolving the adapter and its base model on load —
# confirm for the installed transformers/peft versions.
model = AutoModelForCausalLM.from_pretrained(
    "gpt2-tamil-lora",
)
model

In [None]:
# Sample from the reloaded fine-tuned model using the same prompt as the baseline.
text = "ஒரு நாள்"
# Keep the encoded prompt on the same device as the model.
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,  # forwards input_ids together with the attention_mask
    max_length=100,
    # Explicit pad id avoids the "Setting pad_token_id to eos_token_id" warning.
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))