In [3]:
import numpy as np
import pandas as pd

In [4]:
v = pd.read_csv("/content/hinglish_conversations.csv",
                on_bad_lines='skip',
                engine='python')


In [5]:
v.head(20)

Unnamed: 0,input,output
0,"kya yaar,traffic mein stuck ho gaya.","arre, mere ko bhi late hoga ab!"
1,kaunsa movie dekha tune?,"bhai, wo new one, bollywood wala."
2,aaj ka khana kya banaya?,"sabzi aur roti, usual stuff."
3,kal party hai na?,"haan yaar, tu bhi aana."
4,koi plans hai weekend ke?,"nahi yaar, just relaxing at home."
5,aapko kya problem hai?,"kuch nahi, bas tension hai."
6,kitne marks aaye?,"arre yaar, kuch nahi, pass to ho gaya."
7,kaunsa colour pasand hai?,"blue, woh ekdam calming hai."
8,chalo coffee peete hai.,"haan yaar, mujhe chai chahiye."
9,aap kya kar rahe ho?,"kya karu, time pass."


In [6]:
v.shape

(38262, 2)

In [7]:
v.to_csv("cleaned_dataset.csv", index=False)


In [8]:
s = pd.read_csv("/content/cleaned_dataset.csv")

In [9]:
s.shape

(38262, 2)

In [10]:
s.columns

Index(['input', 'output'], dtype='object')

In [11]:
s.isnull().sum()

Unnamed: 0,0
input,0
output,0


### LLM Fine-Tuning starts from here

In [12]:
# Converting Dataset into Training Form

In [13]:
import json

train_data = []

for i in range(len(v)):
    user = str(v.iloc[i]["input"])
    bot = str(v.iloc[i]["output"])

    text = f"User: {user}\nBot: {bot}"

    train_data.append({"text": text})

# Save to JSONL file
with open("train.jsonl", "w") as f:
    for item in train_data:
        f.write(json.dumps(item) + "\n")

print("train.jsonl")


train.jsonl


In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "Abhishekcr448/Tiny-Hinglish-Chat-21M"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

print("Model + Tokenizer Loaded!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/875 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/141 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/124 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/144 [00:00<?, ?B/s]

Model + Tokenizer Loaded!


In [15]:
from datasets import load_dataset

# load JSONL
dataset = load_dataset("json", data_files="train.jsonl")

# tokenize
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

tokenized_dataset = dataset.map(tokenize, batched=True)

# remove original text column
tokenized_dataset = tokenized_dataset.remove_columns(["text"])

# ADD LABELS
tokenized_dataset = tokenized_dataset.map(
    lambda batch: {"labels": batch["input_ids"]},
    batched=True
)

print("Tokenization + Labels ✔")
tokenized_dataset



Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/38262 [00:00<?, ? examples/s]

Map:   0%|          | 0/38262 [00:00<?, ? examples/s]

Tokenization + Labels ✔


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 38262
    })
})

In [16]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="hinglish-chatbot",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    learning_rate=2e-5,
    logging_steps=50,
    save_steps=500,
    fp16=True,         # GPU speed boost
)


In [17]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)


In [18]:
trainer.train()


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,0.496057
100,0.48301
150,0.484684
200,0.495348
250,0.431501
300,0.492605
350,0.432181
400,0.46357
450,0.462398
500,0.430062


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=19131, training_loss=0.416317743101295, metrics={'train_runtime': 768.2517, 'train_samples_per_second': 49.804, 'train_steps_per_second': 24.902, 'total_flos': 232087727112192.0, 'train_loss': 0.416317743101295, 'epoch': 1.0})

In [1]:
# vhfh