In [None]:
from transformers import AutoModelForCausalLM, GemmaConfig, AutoTokenizer, AutoModel, MistralConfig, MistralModel, MistralForCausalLM, LlamaConfig, LlamaForCausalLM
import torch
import torch.nn as nn
import json
import pandas as pd

In [None]:
# Reuse the tokenizer published on the Hub and let its EOS token double as
# the padding token.
tokenizer = AutoTokenizer.from_pretrained("NeerjaK/Urdu_Model")
tokenizer.pad_token = tokenizer.eos_token

# Hyper-parameters of the Llama-style decoder that is trained from scratch.
# The vocabulary size is taken from the tokenizer so the embedding matrix
# matches it.
llama_settings = dict(
    hidden_size=256,
    vocab_size=len(tokenizer.vocab),
    num_attention_heads=4,
    num_key_value_heads=2,
    num_hidden_layers=12,
    intermediate_size=688,
    eos_token_id=2,
    bos_token_id=1,
    max_position_embeddings=64,
)
config = LlamaConfig(**llama_settings)

# Bare last expression: display the resolved configuration.
config

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 688,
  "max_position_embeddings": 64,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "num_key_value_heads": 2,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.2",
  "use_cache": true,
  "vocab_size": 32769
}

In [None]:
# Build the causal LM from `config` alone (no pretrained weights are loaded)
# and write it, together with the tokenizer, to one local directory.
initial_dir = "urdu_model"
model_mis = LlamaForCausalLM(config)
model_mis.save_pretrained(initial_dir)
tokenizer.save_pretrained(initial_dir)

('urdu_model/tokenizer_config.json',
 'urdu_model/special_tokens_map.json',
 'urdu_model/tokenizer.json')

In [None]:
!pip install datasets



In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never hardcode an access token in a notebook. Any token that has
# been committed or shared must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
# The token is read from the HF_TOKEN environment variable; if it is not set,
# the user is prompted for it (input is not echoed and never stored in the file).
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(hf_token)


In [None]:
# Load every parquet shard of the corpus from the Hub and merge the shards
# into one training set and one evaluation set.
from datasets import load_dataset, concatenate_datasets

# Hub repository that hosts the parquet files, and the files of each split.
repo_id = "NeerjaK/Urdu_Model"
data_files = {
    "train": [
        "bbc_dataset_token_train.parquet",
        "jang_dataset_120000.parquet",
        "jang_dataset_60000.parquet",
        "jang_dataset_90000.parquet",
        "jang_dataset_token_train.parquet",
        "jang_dataset_200000.parquet",
    ],
    "test": [
        "news18_dataset_token_test.parquet",
        "bbc_dataset_token_test.parquet",
    ],
}


def _load_split_files(split_name):
    """Load each file listed under `split_name` as its own dataset.

    Returns a list with one dataset per file, in the order the files are
    listed in `data_files[split_name]`.
    """
    loaded = []
    for parquet_name in data_files[split_name]:
        loaded.append(
            load_dataset(repo_id, data_files={split_name: parquet_name}, split=split_name)
        )
    return loaded


# One dataset per file ...
train_datasets = _load_split_files("train")
test_datasets = _load_split_files("test")

# ... merged into a single dataset per split.
train_dataset = concatenate_datasets(train_datasets)
eval_dataset = concatenate_datasets(test_datasets)

# Sanity check: report how many rows each split ended up with.
print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")


Train dataset size: 269321
Eval dataset size: 9702


In [None]:
import math
from transformers import TrainerCallback

class SaveModelCallback(TrainerCallback):
    """Trainer callback that tracks training perplexity and pushes checkpoints to the Hub.

    Perplexity (``exp(loss)``) is recorded every ``log_interval`` fraction of
    an epoch. Recorded values are buffered in memory and appended to
    ``log_file`` whenever a checkpoint is saved and once more when training
    ends, so no buffered entry is lost.

    Args:
        repo_id: Hub repository the model is pushed to on every save.
        log_interval: Fraction of an epoch between two perplexity records
            (``0.1`` means ten records per epoch).
        log_file: Path of the text file the perplexity records are appended to.
    """

    def __init__(self, repo_id, log_interval=0.1, log_file="perplexity_log.txt"):
        self.repo_id = repo_id
        self.log_interval = log_interval
        self.steps_per_interval = None  # resolved in on_train_begin
        self.perplexity_log = []  # buffered (global_step, perplexity) pairs
        self.log_file = log_file

    def on_train_begin(self, args, state, control, **kwargs):
        """Convert the epoch-fraction interval into a number of optimisation steps."""
        # Guard against a zero epoch count so the division cannot fail.
        epochs = args.num_train_epochs or 1
        steps_per_epoch = state.max_steps / epochs
        # Clamp to at least one step: an interval that rounds down to 0 would
        # otherwise silently disable perplexity tracking altogether.
        self.steps_per_interval = max(1, int(steps_per_epoch * self.log_interval))

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the perplexity when the current step falls on an interval boundary."""
        # `logs` defaults to None, and not every log record carries a "loss" entry.
        if not logs or not self.steps_per_interval:
            return
        if state.global_step % self.steps_per_interval != 0:
            return

        loss = logs.get("loss")
        if loss is None:
            return

        # Cap the exponent so math.exp cannot overflow on a diverged loss.
        perplexity = math.exp(loss) if loss < 100 else float('inf')
        print(f"Perplexity at step {state.global_step}: {perplexity:.4f}")
        self.perplexity_log.append((state.global_step, perplexity))

    def on_save(self, args, state, control, model=None, tokenizer=None, **kwargs):
        """Push the model to the Hub and persist the buffered perplexity records."""
        print(f"Saving model checkpoint at step {state.global_step}")
        # `model` defaults to None; skip the upload instead of crashing on it.
        if model is not None:
            model.push_to_hub(commit_message="Saving checkpoint", repo_id=self.repo_id)
        # The tokenizer is intentionally not pushed from here.

        self._flush_perplexity_log()

    def on_train_end(self, args, state, control, **kwargs):
        """Write out records collected after the last checkpoint was saved."""
        self._flush_perplexity_log()

    def _flush_perplexity_log(self):
        """Append all buffered perplexity records to ``log_file`` and clear the buffer."""
        if not self.perplexity_log:
            return
        with open(self.log_file, "a") as f:
            f.writelines(
                f"Step {step}: Perplexity {perplexity:.4f}\n"
                for step, perplexity in self.perplexity_log
            )
        self.perplexity_log = []


In [None]:
from transformers import Trainer, TrainingArguments

# Run configuration: where checkpoints go, how long to train, and how often
# to log and save.
training_args = TrainingArguments(
    output_dir="./urdu_model",
    overwrite_output_dir=True,
    do_train=True,
    num_train_epochs=10,
    learning_rate=2e-3,
    fp16=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_steps=1,
    save_steps=100,
    save_total_limit=2,
    report_to="none",
)

# Everything the Trainer needs: the freshly built model, the run
# configuration, both dataset splits and the tokenizer.
trainer_kwargs = dict(
    model=model_mis,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)

# Register the callback that records perplexity and pushes each saved
# checkpoint to the Hub repository given here.
callback = SaveModelCallback(repo_id="NeerjaK/NLP-Assignment2")
trainer.add_callback(callback)



  trainer = Trainer(


In [None]:
# Start training with the `trainer` configured earlier in the notebook.
trainer.train()

Step,Training Loss
1,10.3309
2,10.1398
3,9.7766
4,9.3846
5,9.2871
6,8.8278
7,8.5462
8,8.4709
9,8.1995
10,7.9943


Saving model checkpoint at step 100
Saving model checkpoint at step 200


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 300


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 400


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 500


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 600


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 700


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 800


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 900


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 1000


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 1100


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 1200


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 1300


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 1400


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 1500


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 1600


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 1700


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 1800


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 1900


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 2000


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 2100


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 2200


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 2300


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 2400


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 2500


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 2600


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 2700


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 2800


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 2900


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 3000


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 3100


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 3200


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 3300


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 3400


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 3500


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 3600


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 3700


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 3800


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 3900


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 4000


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 4100


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 4200


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 4300


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 4400


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 4500


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 4600


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 4700


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 4800


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 4900


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 5000


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 5100


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 5200


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 5300


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 5400


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 5500


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 5600


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 5700


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 5800


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 5900


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 6000


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 6100


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 6200


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 6300


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 6400


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 6500


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 6600


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 6700


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Perplexity at step 6733: 188.9533
Saving model checkpoint at step 6800


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 6900


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 7000


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 7100


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 7200


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 7300


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Saving model checkpoint at step 7400


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Step,Training Loss
1,10.3309
2,10.1398
3,9.7766
4,9.3846
5,9.2871
6,8.8278
7,8.5462
8,8.4709
9,8.1995
10,7.9943


Saving model checkpoint at step 7500


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

In [None]:
#### If the model is partially trained, run this cell to continue training.
# Prerequisite: the cells defining `training_args`, `train_dataset`,
# `eval_dataset` and `SaveModelCallback` have been executed in this session.
import os

from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer
from transformers.trainer_utils import get_last_checkpoint

# NOTE(review): SaveModelCallback pushes checkpoints to "NeerjaK/NLP-Assignment2";
# confirm that this repository is the one holding the partially trained weights.
repo_id = "NeerjaK/Urdu_Model"

# The model was trained as a causal LM, so it must be reloaded with the causal-LM
# head. Loading it as a sequence classifier drops the LM head and adds a freshly
# initialised `score` layer.
model_mis = AutoModelForCausalLM.from_pretrained(repo_id)

tokenizer = AutoTokenizer.from_pretrained(repo_id)
# Same padding convention as the original training setup.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Rebuild the trainer around the reloaded model; merely rebinding `model_mis`
# does not change the model an already constructed Trainer holds.
trainer = Trainer(
    model=model_mis,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)
trainer.add_callback(SaveModelCallback(repo_id="NeerjaK/NLP-Assignment2"))

# Resume from the newest local checkpoint when there is one (this restores the
# optimizer/scheduler state as well). Otherwise continue from the weights
# loaded above instead of failing with "No valid checkpoint found".
output_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/669 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at NeerjaK/Urdu_Model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/3.26k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.57M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

ValueError: No valid checkpoint found in output directory (tmp_trainer)

In [None]:
# Write the trainer's model and the tokenizer to the same local directory.
trained_dir = "trained_urdu_model"
trainer.save_model(trained_dir)
tokenizer.save_pretrained(trained_dir)

('trained_urdu_model/tokenizer_config.json',
 'trained_urdu_model/special_tokens_map.json',
 'trained_urdu_model/tokenizer.json')

In [None]:
from datasets import Dataset

In [None]:
# Run a single Urdu sentence through the trainer's prediction loop.
custom_input = "خاتون کرسی سے گر کر جاں بحق"

# The model consumes token ids, so the one-row dataset is built directly from
# the encoded sentence (the unused raw-text dict was a dead assignment).
input_dict = {'input_ids': [tokenizer.encode(custom_input)]}
custom_dataset = Dataset.from_dict(input_dict)

predictions = trainer.predict(custom_dataset)
generated_outputs = predictions.predictions  # raw logits from the model

# Arg-max over axis 2 picks the highest-scoring entry for every position of
# the input (presumably logits are (batch, seq, vocab) -- TODO confirm).
# NOTE: this yields one-step predictions per input position; it is not
# autoregressive generation. Use `model.generate` to continue the text.
output_ids = torch.argmax(torch.tensor(generated_outputs), dim=2)
tokenizer.decode(output_ids[0])

Step,Training Loss
25001,5.5252
25002,4.8929
25003,4.7591
25004,5.2176
25005,4.6991
25006,4.99
25007,5.0783
25008,4.4111
25009,5.1926
25010,4.6025


'<pad><pad><pad><pad><pad><pad><pad>'

In [None]:
# Install the Hugging Face `datasets` package (needed for `from datasets import Dataset`).
%pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from datasets import Dataset
import torch

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Directory that holds the saved model and tokenizer files.
checkpoint_path = "."

# Load the tokenizer and the generative (causal LM) model from the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForCausalLM.from_pretrained(checkpoint_path)

# Prompt to continue.
custom_input = 'کرسی سے گر کر خاتون کی موت'

# Tokenize with the tokenizer's __call__ so the attention mask is returned
# together with the input ids; `generate` warns when the mask is missing.
encoded = tokenizer(custom_input, return_tensors="pt", return_attention_mask=True)
input_ids = encoded["input_ids"]

# `generate` also wants an explicit pad token id. Fall back to the EOS id when
# the tokenizer defines no dedicated padding token.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

# Generate a continuation of the prompt.
output_ids = model.generate(
    input_ids,
    attention_mask=encoded["attention_mask"],
    pad_token_id=pad_token_id,
    max_length=50,  # Maximum total length (prompt + generated tokens)
    num_beams=5,  # Beam search: keep the 5 highest-scoring candidates per step
    no_repeat_ngram_size=2,  # Never repeat the same 2-gram
    early_stopping=True,  # Stop once enough beams have finished
)

# Decode the best beam back to text, dropping special tokens.
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print the output
print("Generated Text:", generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Text: کرسی سے گر کر خاتون کی موت کے لیے میں ان کے گھر میں سے ایک میں  افراد ہلاک ہوئے تھے اور  رنز سے شکست حاصل کی گئی تھی کہ اس کی وجہ سے  لاکھ روپے سے بڑھ کر 
