In [None]:
!pip install transformers torch accelerate bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.0


In [None]:
# Load Mistral-7B-Instruct in 4-bit, wrap it in a text-generation pipeline and
# run one sample prompt. Later cells reuse `model`, `tokenizer` and
# `chat_pipeline` from this cell.
import gc

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

# Hugging Face model id
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Re-running this cell while an earlier copy of the model is still referenced
# keeps that copy resident on the GPU. With less free VRAM, device_map="auto"
# spills layers to CPU/disk and the 4-bit loader rejects that with
# "Some modules are dispatched on the CPU or the disk". This is the likely
# cause of the recorded failure, so drop stale references and return cached
# blocks to the driver before loading.
for _stale_name in ("chat_pipeline", "model"):
    globals().pop(_stale_name, None)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# --- Quantization config ---
# Passing a BitsAndBytesConfig (instead of the bare load_in_4bit flag) avoids
# the deprecation warning.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Report which device will be used (a GPU is expected).
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Model ko {device} par load kar rahe hain...")

# Load the model with the quantization config.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Load the matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the generation pipeline.
chat_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

print("Model taiyar hai! Ab chat kar sakte hain.")

# Single-turn prompt in the [INST] ... [/INST] instruction format.
prompt = "[INST] Assalam o Alaikum! Aap kaun hain? [/INST]"

# Generate a reply. pad_token_id is passed explicitly because the tokenizer
# defines no pad token; otherwise generate() falls back to EOS and logs a
# "Setting `pad_token_id` to `eos_token_id`" warning on every call.
sequences = chat_pipeline(
    prompt,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    return_full_text=False,  # only the newly generated text, not the prompt
    pad_token_id=tokenizer.eos_token_id,
)

# Print the model's answer.
print(f"Model ka Jawab: {sequences[0]['generated_text']}")

Model ko cuda par load kar rahe hain...


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
# Interactive single-turn chat loop.
# This cell does not load anything itself: it relies on `chat_pipeline`
# created by the model-loading cell. Each turn is sent to the model on its
# own; no conversation history is kept.
import torch
from transformers import pipeline, BitsAndBytesConfig

# Fail before prompting the user if the pipeline is not in the kernel
# namespace; otherwise the NameError only shows up after the first message.
if "chat_pipeline" not in globals():
    raise NameError(
        "'chat_pipeline' nahi mila. Pehle wala cell run kar ke model load kar lein."
    )

print("****************************************************************")
print("Model se chat shuru karne ke liye yahan kuch likhein aur Enter dabayein.")
print("Chat band karne ke liye 'exit' type kar ke Enter dabayein.")
print("****************************************************************")

# --- Interactive loop: runs until the user types 'exit' ---
while True:
    # 1. Read the user's message. Surrounding whitespace is dropped so that
    #    inputs such as "exit " still end the chat. EOF or an interrupt at the
    #    prompt ends the chat cleanly instead of raising a traceback.
    try:
        user_input = input(">>> Aap: ").strip()
    except (EOFError, KeyboardInterrupt):
        print("Allah Hafiz! Chat band kar di gayi hai.")
        break

    # 2. Ignore blank lines rather than sending an empty prompt to the model.
    if not user_input:
        continue

    # 3. Leave the loop when the user asks to exit.
    if user_input.lower() == 'exit':
        print("Allah Hafiz! Chat band kar di gayi hai.")
        break

    # 4. Wrap the message in the [INST] ... [/INST] instruction format.
    prompt = f"[INST] {user_input} [/INST]"

    # 5. Generate the reply with the already-loaded pipeline.
    #    pad_token_id is passed explicitly to stop generate() logging
    #    "Setting `pad_token_id` to `eos_token_id`" on every turn.
    sequences = chat_pipeline(
        prompt,
        max_new_tokens=512,  # larger budget so answers are not cut short
        do_sample=True,
        temperature=0.7,
        return_full_text=False,  # only the newly generated text, not the prompt
        pad_token_id=chat_pipeline.tokenizer.eos_token_id,
    )

    # 6. Show the reply, followed by a blank line for readability.
    print(f">>> Model: {sequences[0]['generated_text']}")
    print()


****************************************************************
Model se chat shuru karne ke liye yahan kuch likhein aur Enter dabayein.
Chat band karne ke liye 'exit' type kar ke Enter dabayein.
****************************************************************


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


>>> Model:  Hello! How can I help you today? If you have any questions or need assistance with something, feel free to ask. I'm here to help. If you just want to chat or share some thoughts, I'd be happy to listen as well. Let me know what you have in mind.



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


>>> Model:  I cannot provide you with the current weather in Bahawalnagar, Pakistan, as I don't have real-time abilities. However, I can help you find out by looking up the information online.

According to various weather sources, as of now, the weather in Bahawalnagar is mostly sunny with a temperature of around 35 degrees Celsius (95 degrees Fahrenheit). Please note that weather conditions can change frequently, so it's always a good idea to check a reliable weather forecasting website or application for the most up-to-date information.



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


>>> Model:  Yes, I can provide information about the major religion in Pakistan. The majority of the population in Pakistan follows Islam. According to the latest statistics, around 97% of the population identifies as Muslims, making Pakistan one of the countries with the largest Muslim population in the world. The two largest Muslim denominations in Pakistan are the Sunni and the Shia, who make up the vast majority of the Muslim population. Other religions, such as Christianity, Hinduism, Sikhism, and Buddhism, are practiced by a small minority of the population.



You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


>>> Model:  The United States is made up of 50 states and each state is considered a "province" in a sense, as they are political subdivisions of the federal government. Therefore, there are 50 provinces, or states, in the United States. However, it's important to note that the term "province" is more commonly used in the context of countries like Canada, where there are 10 provinces and 3 territories. In the case of the United States, the term "state" is the more accurate and commonly used term.



In [None]:
# --- Part 1: Libraries Install Karein ---
# Yeh libraries fine-tuning ke liye zaroori hain
!pip install -q transformers datasets accelerate peft bitsandbytes trl

# --- Part 2: Required Tools Import Karein ---
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# --- Part 3: Model aur Tokenizer Setup ---
# Model aur tokenizer ka naam
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Dataset ka naam (jo file aapne upload ki hai)
dataset_name = "urdu_data.jsonl"

# Naya quantization config (jo pehle warning khatam karega)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Model load karein
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
model.config.use_cache = False # Training ke dauran cache na karein

# Tokenizer load karein
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token # Padding token set karein
tokenizer.padding_side = "right"

# --- Part 4: Data Load aur Format Karein ---
# Apni JSONL file se dataset load karein
dataset = load_dataset("json", data_files=dataset_name, split="train")

# Data ko model ke liye format karna (prompting function)
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"[INST] {example['instruction'][i]} [/INST] {example['output'][i]}"
        output_texts.append(text)
    return output_texts

# --- Part 5: LoRA Configuration ---
# LoRA (Low-Rank Adaptation) setup karein. Yeh model ke sirf hisson ko train karega.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64, # Rank
    bias="none",
    task_type="CAUSAL_LM",
    # Yeh batate hain ke model ke hisson ko target karna hai
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)

# --- Part 6: Training Arguments ---
# Training ke parameters set karein
training_arguments = TrainingArguments(
    output_dir="./results", # Results yahan save honge
    num_train_epochs=1,     # Kitni dafa data repeat karna hai (1-2 kaafi hai)
    per_device_train_batch_size=4, # Ek baar mein kitne samples process karnay hain
    gradient_accumulation_steps=1, # Gradient accumulation
    optim="paged_adamw_32bit", # Optimizer
    save_steps=50,           # Har 50 steps baad model save karein
    logging_steps=10,        # Har 10 steps baad log karein
    learning_rate=2e-4,       # Learning rate
    weight_decay=0.001,
    fp16=False,              # bf16 use kar rahe hain
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,             # -1 matlab num_train_epochs use karein
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
)

# --- Part 7: Trainer Setup ---
# Supersived Fine-Tuning Trainer setup karein
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="instruction", # Yeh field use karein text ke liye
    tokenizer=tokenizer,
    args=training_arguments,
    formatting_func=formatting_prompts_func, # Yeh function use karein formatting ke liye
    max_seq_length=512, # Maximum sequence length
    packing=False,
)

# --- Part 8: Training Shuru Karen! ---
print("Training shuru ho rahi hai... Yeh time le sakti hai.")
trainer.train()
print("Training mukammal!")

# --- Part 9: Fine-Tuned Adapter Save Karen ---
# Hum ne poora model nahi, sirf LoRA adapter ko train kiya hai
adapter_name = "urdu-mistral-adapter"
trainer.model.save_pretrained(adapter_name)
print(f"Adapter '{adapter_name}' ke naam se save ho gaya hai!")


In [None]:
# Attach Google Drive to the Colab VM's filesystem so files under
# /content/drive can be read and written by later cells (Colab only; this
# triggers an interactive authorisation prompt).
from google.colab import drive

drive.mount(mountpoint='/content/drive')