In [1]:
!pip install transformers datasets accelerate


Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
     -------------------------------------- 487.4/487.4 kB 4.3 MB/s eta 0:00:00
Collecting accelerate
  Downloading accelerate-1.5.2-py3-none-any.whl (345 kB)
     -------------------------------------- 345.1/345.1 kB 7.1 MB/s eta 0:00:00
Collecting pyarrow>=15.0.0
  Downloading pyarrow-19.0.1-cp310-cp310-win_amd64.whl (25.3 MB)
     ---------------------------------------- 25.3/25.3 MB 4.2 MB/s eta 0:00:00
Collecting xxhash
  Downloading xxhash-3.5.0-cp310-cp310-win_amd64.whl (30 kB)
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.29.3-py3-none-any.whl (468 kB)
     -------------------------------------- 469.0/469.0 kB 4.2 MB/s eta 0:00:00
Collecting tqdm>=4.27
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
     ---------------------------------------- 78.5/78.5 kB 4.3 MB/s eta 0:00:00
Collecting aiohttp
  Downloading aiohttp-3.11.14-cp310-cp310-win_amd64.whl (442 kB)
     ----------

In [2]:
# Step 2: Import Necessary Libraries
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch

In [3]:
# Step 3: Load Dataset
data_file = "output.json"  # Path to your uploaded dataset (adjust if necessary)
dataset = load_dataset("json", data_files=data_file)

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# Split dataset into training and validation sets
dataset = dataset["train"].train_test_split(test_size=0.1)
train_data = dataset["train"]
val_data = dataset["test"]

In [5]:
# Step 4: Load Model and Tokenizer
model_name = "distilgpt2"  # Small and efficient model for CPU-based fine-tuning
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

In [6]:
# Add a padding token if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Using pad_token, but it is not set yet.


In [7]:
# Step 5: Preprocess Dataset
def preprocess_function(examples):
    inputs = [f"Context: {c}\nQuestion: {q}" for c, q in zip(examples["Context"], examples["Question"])]
    outputs = examples["Answer"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")["input_ids"]

    # Replace padding tokens in labels with -100
    labels = [
        [-100 if token == tokenizer.pad_token_id else token for token in label_seq]
        for label_seq in labels
    ]
    model_inputs["labels"] = labels
    return model_inputs


In [8]:
# Tokenize the datasets
train_dataset = train_data.map(preprocess_function, batched=True)
val_dataset = val_data.map(preprocess_function, batched=True)


Map:   0%|          | 0/8987 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

In [9]:
training_args = TrainingArguments(
    output_dir="./fine_tuned_distilgpt2",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=10,
    fp16=torch.cuda.is_available(),  # Use FP16 if a GPU is available
    report_to="none",
    no_cuda=not torch.cuda.is_available()  # Ensure no CUDA if running on CPU
)

In [10]:
# Define data collator
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=True, return_tensors="pt")


In [11]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [12]:
# Step 8: Fine-Tune the Model
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: Context, Question, Answer. If Context, Question, Answer are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8987
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 6741
  Number of trainable parameters = 81912576
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: Expected input batch_size (2044) to match target batch_size (508).

In [13]:
print("Sample input_ids:", train_dataset[0]["input_ids"])
print("Sample labels:", train_dataset[0]["labels"])
print("Input length:", len(train_dataset[0]["input_ids"]))
print("Label length:", len(train_dataset[0]["labels"]))


Sample input_ids: [21947, 25, 1654, 24631, 479, 44844, 1222, 281, 81, 13, 3691, 13, 1181, 286, 334, 13, 79, 11207, 1160, 400, 46593, 346, 33448, 198, 24361, 25, 644, 2597, 750, 262, 9709, 286, 279, 86, 16, 290, 279, 86, 17, 711, 287, 262, 17700, 2184, 338, 2551, 30, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256

In [14]:
def preprocess_function(examples):
    # Combine context and question into a single input
    inputs = [f"Context: {context}\nQuestion: {question}" for context, question in zip(examples["Context"], examples["Question"])]
    outputs = examples["Answer"]

    # Tokenize inputs and outputs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=512, truncation=True, padding="max_length")["input_ids"]

    # Replace padding tokens in labels with -100 to ignore them during loss calculation
    labels = [
        [-100 if token == tokenizer.pad_token_id else token for token in label_seq]
        for label_seq in labels
    ]

    # Ensure input_ids and labels have the same length
    model_inputs["labels"] = labels
    return model_inputs


In [15]:
train_dataset = train_data.map(preprocess_function, batched=True)
val_dataset = val_data.map(preprocess_function, batched=True)


Map:   0%|          | 0/8987 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

In [16]:
print("Sample input_ids:", train_dataset[0]["input_ids"])
print("Sample labels:", train_dataset[0]["labels"])
print("Input length:", len(train_dataset[0]["input_ids"]))
print("Label length:", len(train_dataset[0]["labels"]))


Sample input_ids: [21947, 25, 1654, 24631, 479, 44844, 1222, 281, 81, 13, 3691, 13, 1181, 286, 334, 13, 79, 11207, 1160, 400, 46593, 346, 33448, 198, 24361, 25, 644, 2597, 750, 262, 9709, 286, 279, 86, 16, 290, 279, 86, 17, 711, 287, 262, 17700, 2184, 338, 2551, 30, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256

In [17]:
from transformers import DataCollatorForSeq2Seq

# Define data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=True, return_tensors="pt")


In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator  # Add this to ensure proper padding
)


In [19]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: Context, Question, Answer. If Context, Question, Answer are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8987
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 6741
  Number of trainable parameters = 81912576


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [22]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
import torch

# Load dataset
data_file = "output.json"
dataset = load_dataset("json", data_files=data_file)
dataset = dataset["train"].train_test_split(test_size=0.1)
train_data = dataset["train"]
val_data = dataset["test"]

# Load tokenizer and model
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add padding token if needed
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Preprocessing function
def preprocess_function(examples):
    inputs = [f"Context: {c}\nQuestion: {q}" for c, q in zip(examples["Context"], examples["Question"])]
    outputs = examples["Answer"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")["input_ids"]

    # Replace padding tokens in labels with -100
    labels = [
        [-100 if token == tokenizer.pad_token_id else token for token in label_seq]
        for label_seq in labels
    ]
    model_inputs["labels"] = labels
    return model_inputs

# Apply preprocessing
train_dataset = train_data.map(preprocess_function, batched=True).with_format("torch")
val_dataset = val_data.map(preprocess_function, batched=True).with_format("torch")

# LoRA configuration (optional)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# Freeze non-critical layers (optional)
for param in model.base_model.parameters():
    param.requires_grad = False

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=True, return_tensors="pt")

# Training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_distilgpt2",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=1,  # Reduce batch size
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # Simulate larger batch size
    num_train_epochs=1,             # Start with 1 epoch for debugging
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
    fp16=torch.cuda.is_available(),
    no_cuda=not torch.cuda.is_available(),
    report_to="none"
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()

# Save model
model.save_pretrained("./fine_tuned_distilgpt2")
tokenizer.save_pretrained("./fine_tuned_distilgpt2")
print("Fine-tuning completed!")


ImportError: cannot import name 'Cache' from 'transformers' (C:\Users\user-pc\anaconda3\lib\site-packages\transformers\__init__.py)

In [21]:
!pip install peft


Collecting peft
  Downloading peft-0.15.0-py3-none-any.whl (410 kB)
     -------------------------------------- 410.8/410.8 kB 3.2 MB/s eta 0:00:00
Installing collected packages: peft
Successfully installed peft-0.15.0


In [23]:
!pip install --upgrade transformers


Collecting transformers
  Downloading transformers-4.50.0-py3-none-any.whl (10.2 MB)
     ---------------------------------------- 10.2/10.2 MB 4.1 MB/s eta 0:00:00
Collecting tokenizers<0.22,>=0.21
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl (2.4 MB)
     ---------------------------------------- 2.4/2.4 MB 7.7 MB/s eta 0:00:00
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.11.4
    Uninstalling tokenizers-0.11.4:
      Successfully uninstalled tokenizers-0.11.4
  Attempting uninstall: transformers
    Found existing installation: transformers 4.24.0
    Uninstalling transformers-4.24.0:
      Successfully uninstalled transformers-4.24.0
Successfully installed tokenizers-0.21.1 transformers-4.50.0


In [25]:
!pip install --upgrade peft transformers




In [26]:
import transformers
import peft

print("Transformers version:", transformers.__version__)
print("PEFT version:", peft.__version__)



ImportError: cannot import name 'Cache' from 'transformers' (C:\Users\user-pc\anaconda3\lib\site-packages\transformers\__init__.py)

In [27]:
pip uninstall -y transformers peft


Found existing installation: transformers 4.50.0
Uninstalling transformers-4.50.0:
  Successfully uninstalled transformers-4.50.0
Found existing installation: peft 0.15.0
Uninstalling peft-0.15.0:
  Successfully uninstalled peft-0.15.0
Note: you may need to restart the kernel to use updated packages.


In [28]:
pip install transformers peft


Collecting transformers
  Using cached transformers-4.50.0-py3-none-any.whl (10.2 MB)
Collecting peft
  Using cached peft-0.15.0-py3-none-any.whl (410 kB)
Installing collected packages: transformers, peft
Successfully installed peft-0.15.0 transformers-4.50.0
Note: you may need to restart the kernel to use updated packages.


In [35]:
import transformers
import peft

print("Transformers version:", transformers.__version__)
print("PEFT version:", peft.__version__)


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\user-pc\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3460, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\user-pc\AppData\Local\Temp\ipykernel_17400\2994132022.py", line 2, in <module>
    import peft
  File "C:\Users\user-pc\anaconda3\lib\site-packages\peft\__init__.py", line 22, in <module>
    from .auto import (
  File "C:\Users\user-pc\anaconda3\lib\site-packages\peft\auto.py", line 30, in <module>
    from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
ImportError: cannot import name 'MODEL_TYPE_TO_PEFT_MODEL_MAPPING' from 'peft.mapping' (C:\Users\user-pc\anaconda3\lib\site-packages\peft\mapping.py)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\user-pc\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2057, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  

In [31]:
!pip install git+https://github.com/huggingface/transformers.git
!pip install git+https://github.com/huggingface/peft.git


Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to c:\users\user-pc\appdata\local\temp\pip-req-build-xhppz3gk
  Resolved https://github.com/huggingface/transformers.git to commit 2b8a15cc3f1a0c94cf817a8fd8c87bca28737e09
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml): started
  Building wheel for transformers (pyproject.toml): still running...
  Building wheel for transformers (pyproject.toml): still running...
  Building wheel for transformers (pyproject.toml): finished with status 'done'
  Created wheel for transformers: filename=tra

  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git 'C:\Users\user-pc\AppData\Local\Temp\pip-req-build-xhppz3gk'


Collecting git+https://github.com/huggingface/peft.git

  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft.git 'C:\Users\user-pc\AppData\Local\Temp\pip-req-build-z7lhs54f'



  Cloning https://github.com/huggingface/peft.git to c:\users\user-pc\appdata\local\temp\pip-req-build-z7lhs54f
  Resolved https://github.com/huggingface/peft.git to commit e5e7b73fcf7f0d9fc370adbb016390454ef6eb09
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: peft
  Building wheel for peft (pyproject.toml): started
  Building wheel for peft (pyproject.toml): finished with status 'done'
  Created wheel for peft: filename=peft-0.15.1.dev0-py3-none-any.whl size=413551 sha256=225701765f6185c4a05ed32966a1e91113af1d8cc8a52e6bf29de63b0b041f19
  Stored in directory: C:\Users\user-pc\AppData\Local\Temp\pip-ephem-wheel-cache-fmlg4ru0\wheels\d7\c7\de\1368fac8590e1b103ddc2ec2a28

In [33]:
pip uninstall -y transformers peft


Found existing installation: transformers 4.51.0.dev0
Uninstalling transformers-4.51.0.dev0:
  Successfully uninstalled transformers-4.51.0.dev0
Found existing installation: peft 0.15.1.dev0
Uninstalling peft-0.15.1.dev0:
  Successfully uninstalled peft-0.15.1.dev0
Note: you may need to restart the kernel to use updated packages.


In [34]:
pip install transformers==4.31.0 peft==0.4.0


Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
     ---------------------------------------- 7.4/7.4 MB 4.4 MB/s eta 0:00:00
Collecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
     ---------------------------------------- 72.9/72.9 kB 3.9 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-win_amd64.whl (3.5 MB)
     ---------------------------------------- 3.5/3.5 MB 6.7 MB/s eta 0:00:00
Installing collected packages: tokenizers, transformers, peft
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.1
    Uninstalling tokenizers-0.21.1:
      Successfully uninstalled tokenizers-0.21.1
Successfully installed peft-0.4.0 tokenizers-0.13.3 transformers-4.31.0
Note: you may need to restart the kernel to use updated packages.


In [36]:
pip uninstall -y transformers peft


Found existing installation: transformers 4.31.0
Uninstalling transformers-4.31.0:
  Successfully uninstalled transformers-4.31.0
Found existing installation: peft 0.4.0
Uninstalling peft-0.4.0:
  Successfully uninstalled peft-0.4.0
Note: you may need to restart the kernel to use updated packages.


In [37]:
pip cache purge


Files removed: 162
Note: you may need to restart the kernel to use updated packages.


In [38]:
pip install transformers==4.31.0 peft==0.4.0


Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
     ---------------------------------------- 7.4/7.4 MB 4.9 MB/s eta 0:00:00
Collecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
     ---------------------------------------- 72.9/72.9 kB 3.9 MB/s eta 0:00:00
Installing collected packages: transformers, peft
Successfully installed peft-0.4.0 transformers-4.31.0
Note: you may need to restart the kernel to use updated packages.


In [39]:
import transformers
import peft

print("Transformers version:", transformers.__version__)
print("PEFT version:", peft.__version__)


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\user-pc\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3460, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\user-pc\AppData\Local\Temp\ipykernel_17400\2994132022.py", line 2, in <module>
    import peft
  File "C:\Users\user-pc\anaconda3\lib\site-packages\peft\__init__.py", line 22, in <module>
    from .auto import (
  File "C:\Users\user-pc\anaconda3\lib\site-packages\peft\auto.py", line 30, in <module>
    from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
ImportError: cannot import name 'MODEL_TYPE_TO_PEFT_MODEL_MAPPING' from 'peft.mapping' (C:\Users\user-pc\anaconda3\lib\site-packages\peft\mapping.py)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\user-pc\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2057, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  

In [40]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset

# Load dataset
data_file = "output.json"
dataset = load_dataset("json", data_files=data_file)
dataset = dataset["train"].train_test_split(test_size=0.1)
train_data = dataset["train"]
val_data = dataset["test"]

# Load tokenizer and model
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add padding token if needed
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Preprocessing function
def preprocess_function(examples):
    inputs = [f"Context: {c}\nQuestion: {q}" for c, q in zip(examples["Context"], examples["Question"])]
    outputs = examples["Answer"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")["input_ids"]

    # Replace padding tokens in labels with -100
    labels = [
        [-100 if token == tokenizer.pad_token_id else token for token in label_seq]
        for label_seq in labels
    ]
    model_inputs["labels"] = labels
    return model_inputs

# Apply preprocessing
train_dataset = train_data.map(preprocess_function, batched=True).with_format("torch")
val_dataset = val_data.map(preprocess_function, batched=True).with_format("torch")

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=True, return_tensors="pt")

# Training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_distilgpt2",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=1,  # Reduce batch size for memory constraints
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # Simulate larger batch size
    num_train_epochs=3,             # Number of epochs
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=10,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU is available
    no_cuda=not torch.cuda.is_available(),  # Use CPU if no GPU is available
    report_to="none"
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_distilgpt2")
tokenizer.save_pretrained("./fine_tuned_distilgpt2")
print("Fine-tuning completed!")


loading configuration file config.json from cache at C:\Users\user-pc/.cache\huggingface\hub\models--distilgpt2\snapshots\2290a62682d06624634c1f46a6ad5be0f47f38aa\config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_spec

Map:   0%|          | 0/8987 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

PyTorch: setting up devices
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: Context, Question, Answer. If Context, Question, Answer are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8987
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 8
  Total optimization steps = 3369
  Number of trainable parameters = 81912576
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: Expected input batch_size (511) to match target batch_size (127).

In [41]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
import torch

# Load dataset
data_file = "output.json"
dataset = load_dataset("json", data_files=data_file)
dataset = dataset["train"].train_test_split(test_size=0.1)
train_data = dataset["train"]
val_data = dataset["test"]

# Load tokenizer and model
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add padding token if needed
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Preprocessing function
def preprocess_function(examples):
    inputs = [f"Context: {c}\nQuestion: {q}" for c, q in zip(examples["Context"], examples["Question"])]
    outputs = examples["Answer"]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=64, truncation=True, padding="max_length")["input_ids"]
    labels = [[-100 if token == tokenizer.pad_token_id else token for token in label_seq] for label_seq in labels]
    model_inputs["labels"] = labels
    return model_inputs

# Apply preprocessing
train_dataset = train_data.map(preprocess_function, batched=True).with_format("torch")
val_dataset = val_data.map(preprocess_function, batched=True).with_format("torch")

# LoRA configuration (optional for low-compute)
lora_config = LoraConfig(
    r=8, lora_alpha=16, target_modules=["q_proj", "v_proj"], lora_dropout=0.1, task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# Freeze non-critical layers to save computation
for param in model.base_model.parameters():
    param.requires_grad = False

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=True, return_tensors="pt")

# Training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_distilgpt2",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    save_strategy="epoch",
    logging_steps=10,
    fp16=torch.cuda.is_available(),
    no_cuda=not torch.cuda.is_available(),
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_distilgpt2")
tokenizer.save_pretrained("./fine_tuned_distilgpt2")


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\user-pc\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3460, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\user-pc\AppData\Local\Temp\ipykernel_17400\1938489842.py", line 3, in <module>
    from peft import LoraConfig, get_peft_model
  File "C:\Users\user-pc\anaconda3\lib\site-packages\peft\__init__.py", line 22, in <module>
    from .auto import (
  File "C:\Users\user-pc\anaconda3\lib\site-packages\peft\auto.py", line 30, in <module>
    from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
ImportError: cannot import name 'MODEL_TYPE_TO_PEFT_MODEL_MAPPING' from 'peft.mapping' (C:\Users\user-pc\anaconda3\lib\site-packages\peft\mapping.py)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\user-pc\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2057, in showtraceback
    stb = self.Intera

In [42]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset
import torch

# Load your dataset
data_file = "output.json"
dataset = load_dataset("json", data_files=data_file)
dataset = dataset["train"].train_test_split(test_size=0.1)
train_data = dataset["train"]
val_data = dataset["test"]

# Load the tokenizer and model
model_name = "distilgpt2"  # Smaller model to fit on laptops
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add padding token if not set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Preprocess dataset
def preprocess_function(examples):
    # Create input-output pairs
    inputs = [f"Context: {c}\nQuestion: {q}" for c, q in zip(examples["Context"], examples["Question"])]
    outputs = examples["Answer"]
    
    # Tokenize inputs and outputs
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=64, truncation=True, padding="max_length")["input_ids"]
    
    # Replace padding token IDs with -100 (ignored during loss calculation)
    labels = [[-100 if token == tokenizer.pad_token_id else token for token in label_seq] for label_seq in labels]
    model_inputs["labels"] = labels
    return model_inputs

# Apply preprocessing to the datasets
train_dataset = train_data.map(preprocess_function, batched=True).with_format("torch")
val_dataset = val_data.map(preprocess_function, batched=True).with_format("torch")

# Data collator for batching
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=True, return_tensors="pt")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_distilgpt2",
    evaluation_strategy="epoch",
    learning_rate=2e-5,  # Lower learning rate for stability
    per_device_train_batch_size=1,  # Small batch size for memory constraints
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # Simulate a larger batch size
    num_train_epochs=3,  # Adjust based on dataset size
    save_strategy="epoch",
    logging_steps=10,
    fp16=torch.cuda.is_available(),  # Use mixed precision if CUDA is available
    no_cuda=not torch.cuda.is_available(),  # Use CPU if no GPU is available
    report_to="none"  # Disable W&B or other integrations
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_distilgpt2")
tokenizer.save_pretrained("./fine_tuned_distilgpt2")
print("Fine-tuning completed and model saved!")


loading configuration file config.json from cache at C:\Users\user-pc/.cache\huggingface\hub\models--distilgpt2\snapshots\2290a62682d06624634c1f46a6ad5be0f47f38aa\config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_spec

Map:   0%|          | 0/8987 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

PyTorch: setting up devices
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: Context, Question, Answer. If Context, Question, Answer are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8987
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 8
  Total optimization steps = 3369
  Number of trainable parameters = 81912576
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: Expected input batch_size (255) to match target batch_size (63).

In [43]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset
import torch


In [44]:
# Load the dataset
data_file = "output.json"  # Replace with the correct path to your dataset
dataset = load_dataset("json", data_files=data_file)

# Split the dataset
dataset = dataset["train"].train_test_split(test_size=0.1)
train_data = dataset["train"]
val_data = dataset["test"]


In [45]:
# Load tokenizer and model
model_name = "distilgpt2"  # You can use a different model if required
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set the padding token if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


loading configuration file config.json from cache at C:\Users\user-pc/.cache\huggingface\hub\models--distilgpt2\snapshots\2290a62682d06624634c1f46a6ad5be0f47f38aa\config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_spec

In [46]:
# Preprocessing function
def preprocess_function(examples):
    # Combine context and question for input
    inputs = [f"Context: {c}\nQuestion: {q}" for c, q in zip(examples["Context"], examples["Question"])]
    outputs = examples["Answer"]

    # Tokenize inputs and outputs
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=256, truncation=True, padding="max_length")["input_ids"]

    # Replace padding token ID in labels with -100 to ignore during loss computation
    labels = [[-100 if token == tokenizer.pad_token_id else token for token in label_seq] for label_seq in labels]
    model_inputs["labels"] = labels
    return model_inputs


In [47]:
# Apply preprocessing
train_dataset = train_data.map(preprocess_function, batched=True).with_format("torch")
val_dataset = val_data.map(preprocess_function, batched=True).with_format("torch")

# Print a sample for verification
print(train_dataset[0])


Map:   0%|          | 0/8987 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

{'Question': 'what was the key evidence against taijuddin presented by the prosecution?', 'Answer': 'the key evidence against taijuddin was the testimony of witnesses who stated that he pointed out the location of the victim, abdul wahab, to the mob.', 'Context': 'taijuddin vs state of assam & ors.: 1st december 2021', 'input_ids': tensor([21947,    25,   256,  1872,    73, 44008,  3691,  1181,   286,   840,
          321,  1222,   393,    82, 11207,   352,   301,   390,  3273, 33448,
          198, 24361,    25,   644,   373,   262,  1994,  2370,  1028,   256,
         1872,    73, 44008,  5545,   416,   262, 12580,    30, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256,

In [48]:
training_args = TrainingArguments(
    output_dir="./fine_tuned_distilgpt2",
    evaluation_strategy="epoch",
    learning_rate=2e-5,  # Adjust the learning rate if necessary
    per_device_train_batch_size=1,  # Reduce batch size for memory constraints
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,  # Simulate a larger batch size
    num_train_epochs=3,  # Adjust epochs as required
    save_strategy="epoch",
    logging_steps=10,
    fp16=torch.cuda.is_available(),  # Enable mixed precision if GPU is available
    no_cuda=not torch.cuda.is_available(),  # Use CPU if no GPU is available
    report_to="none"
)


PyTorch: setting up devices


In [49]:
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=True, return_tensors="pt")


In [50]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)


In [None]:
trainer.train()


The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: Context, Question, Answer. If Context, Question, Answer are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8987
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 1683
  Number of trainable parameters = 81912576
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
