In [None]:
!pip install -q transformers datasets peft accelerate evaluate


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import json
import numpy as np
import pandas as pd
import os
import random
from sklearn.preprocessing import LabelEncoder
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate
from peft import LoraConfig, get_peft_model, TaskType

SEED = 42
# Seed Python, NumPy and PyTorch with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Report the accelerator this session is running on.
_cuda_ready = torch.cuda.is_available()
print("CUDA available:", _cuda_ready)
if _cuda_ready:
    print("CUDA devices:", torch.cuda.device_count())
    try:
        _device_name = torch.cuda.get_device_name(0)
    except Exception as e:
        # Best effort only: a failed name lookup must not abort the run.
        print("Couldn't query device name:", e)
    else:
        print("Device 0 name:", _device_name)

# Dump driver / memory information; the exit status is intentionally ignored.
os.system("nvidia-smi")

def download_data_and_show_metadata(filename):
    """Read the UTF-8 JSON file at ``filename`` and return the decoded object.

    Despite its name, this function neither downloads anything nor prints
    metadata; it only parses a local file.
    """
    with open(filename, mode="r", encoding="utf-8") as handle:
        return json.loads(handle.read())

# Load the raw training conversations; the relative path is resolved against
# the current working directory.
raw_data = download_data_and_show_metadata("trainset.json")

def convert_conversation_to_samples(conversation: dict):
    """Flatten one conversation record into one sample per tutor response.

    Args:
        conversation: Mapping that may contain ``conversation_id``,
            ``conversation_history`` and ``tutor_responses`` (tutor name ->
            mapping with ``response`` and ``annotation``).

    Returns:
        A list of dicts with keys ``conversation_id``, ``tutor_name``,
        ``input_text`` (lower-cased history followed by the tutor response),
        ``label_mistake`` and ``label_guidance``. Labels that are absent from
        the annotation are ``None``.
    """
    # ``dict.get(key, default)`` only applies the default when the key is
    # absent; an explicit JSON ``null`` still comes back as ``None``, so every
    # optional field is coalesced with ``or`` before it is used.
    history = conversation.get("conversation_history") or ""
    tutor_responses = conversation.get("tutor_responses") or {}
    conversation_id = conversation.get("conversation_id")

    samples = []
    for tutor_name, resp_data in tutor_responses.items():
        resp_data = resp_data or {}
        response_text = resp_data.get("response") or ""
        annotation = resp_data.get("annotation") or {}
        samples.append(
            {
                "conversation_id": conversation_id,
                "tutor_name": tutor_name,
                "input_text": (history + "\nTutor Response: " + response_text).lower(),
                "label_mistake": annotation.get("Mistake_Identification"),
                "label_guidance": annotation.get("Providing_Guidance"),
            }
        )
    return samples

def convert_dataset(raw_data_list: list):
    """Return the samples of every conversation as one flat list.

    Samples keep the order of ``raw_data_list`` and, within a conversation,
    the order produced by ``convert_conversation_to_samples``.
    """
    return [
        sample
        for conversation in raw_data_list
        for sample in convert_conversation_to_samples(conversation)
    ]

# Flatten the raw conversations into one row per tutor response.
train_samples = convert_dataset(raw_data)
df = pd.DataFrame(train_samples)
# The target of this run is the guidance label, so rows must be filtered on
# ``label_guidance`` (the column that is encoded below), not on the unrelated
# ``label_mistake`` column; otherwise missing targets reach the label encoder.
df = df.dropna(subset=["input_text", "label_guidance"]).reset_index(drop=True)

# Map the string labels to contiguous integer ids.
label_encoder = LabelEncoder()
df["label_guidance_id"] = label_encoder.fit_transform(df["label_guidance"])
num_labels = int(len(label_encoder.classes_))
print("num_labels =", num_labels)
print("Label classes:", label_encoder.classes_)
print("Label ID:", df.head(5))

# Undersample every class down to the size of the rarest one.
min_class_size = int(df["label_guidance_id"].value_counts().min())
print("Balancing classes to size:", min_class_size)

# Iterating over the groups (instead of ``groupby(...).apply``) avoids relying
# on ``apply`` handing the grouping column to the callback, which pandas has
# deprecated. Each class is still sampled with a fresh ``random_state=SEED``
# and the groups are concatenated in sorted key order, so the selected rows
# and their order are the same as before.
balanced_df = pd.concat(
    [
        class_rows.sample(min_class_size, random_state=SEED)
        for _, class_rows in df.groupby("label_guidance_id")
    ],
    ignore_index=True,
)

print("Class distribution after balancing:")
print(balanced_df["label_guidance_id"].value_counts())

# To train on the unbalanced data instead, build the dataset from ``df``.
hf_dataset = Dataset.from_pandas(balanced_df, preserve_index=False)

# Hub id of the pretrained checkpoint that is fine-tuned below.
MODEL_NAME = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
# use_fast=False requests the slow (Python) tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

def preprocess_batch(batch):
    """Prefix each example with the task instruction and tokenize the batch.

    Args:
        batch: Mapping of column name to list of values, as supplied by a
            batched ``Dataset.map``. Reads ``input_text`` and
            ``label_guidance_id``; the mapping itself is left unmodified.

    Returns:
        The tokenizer output (truncated to 512 tokens, unpadded) with an
        added ``labels`` entry holding the integer class ids.
    """
    prompt = "Instruction: Determine if the tutor provides guidance in the following response. Options: Yes, No, To some extent. Response:"
    prompted_texts = [prompt + text for text in batch["input_text"]]
    # No padding here: the DataCollatorWithPadding handed to the Trainer pads
    # each batch to its own longest sequence. Padding every example to 512
    # tokens up front made that collator a no-op and forced every training and
    # evaluation step to process full-length inputs.
    tokenized = tokenizer(
        prompted_texts,
        truncation=True,
        max_length=512,
    )
    tokenized["labels"] = batch["label_guidance_id"]
    return tokenized

# Columns of the source frame that are dropped once the text is tokenized.
_source_columns = [
    "conversation_id",
    "tutor_name",
    "input_text",
    "label_mistake",
    "label_guidance",
    "label_guidance_id",
]
hf_dataset = hf_dataset.map(
    preprocess_batch,
    batched=True,
    remove_columns=_source_columns,
)

# Seeded 80/20 split into training and evaluation portions.
dataset = hf_dataset.train_test_split(test_size=0.2, seed=SEED)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

def cast_labels(example):
    """Replace ``example["labels"]`` with its ``int`` value and return ``example``.

    The mapping is updated in place; the same object is returned.
    """
    label_as_int = int(example["labels"])
    example["labels"] = label_as_int
    return example

# The only features the model and the loss need.
columns = ["input_ids", "attention_mask", "labels"]


def _strip_to_model_inputs(split):
    """Cast the labels of ``split`` to int and drop every column not in ``columns``."""
    split = split.map(cast_labels)
    surplus = [name for name in split.column_names if name not in columns]
    return split.remove_columns(surplus)


train_dataset = _strip_to_model_inputs(train_dataset)
eval_dataset = _strip_to_model_inputs(eval_dataset)

# Human-readable class names for the model config, so that saved configs and
# predictions report this task's labels rather than whatever label names the
# pretrained checkpoint was published with.
id2label = {idx: str(cls) for idx, cls in enumerate(label_encoder.classes_)}
label2id = {cls: idx for idx, cls in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    # If the number of classes found in the data differs from the size of the
    # checkpoint's existing classification head, re-initialise the head
    # instead of failing on a shape mismatch. No effect when the sizes match.
    ignore_mismatched_sizes=True,
    use_safetensors=True,
)

# Keep the special-token ids in the model config aligned with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.bos_token_id = tokenizer.bos_token_id


# Collect the qualified name of every nn.Linear layer in the model.
linear_modules = [
    module_name
    for module_name, layer in model.named_modules()
    if isinstance(layer, torch.nn.Linear)
]
# Print at most the first 200 names to keep the log readable.
for module_name in linear_modules[:200]:
    print(module_name)
print("Total linear count:", len(linear_modules))
print("---- end linear module list ----\n")

# Name fragments commonly used for projection layers; keep those that occur
# in at least one linear module name of this model.
candidate_substrings = ["query_proj", "key_proj", "value_proj", "dense", "o_proj", "v_proj", "out_proj", "projection"]
found_subs = sorted(
    {
        fragment
        for fragment in candidate_substrings
        if any(fragment in module_name for module_name in linear_modules)
    }
)
print("Detected candidate substrings present in model:", found_subs)

if found_subs:
    # Prefer the attention projections and dense layers, in this fixed order;
    # fall back to whatever was detected when none of them is present.
    preferred = [
        choice
        for choice in ("query_proj", "key_proj", "value_proj", "dense")
        if choice in found_subs
    ]
    target_modules = preferred or found_subs
else:
    print("WARNING: None of the common adapter substrings were found in linear module names.")
    print("Falling back to only training the classifier head. Consider adjusting target_modules manually.")
    target_modules = ["classifier"]

print("Using target_modules for LoRA:", target_modules)

# LoRA hyper-parameters for the adapters injected into ``target_modules``.
_lora_hparams = {
    "r": 32,
    "lora_alpha": 64,
    "lora_dropout": 0.1,
    "target_modules": target_modules,
}
lora_config = LoraConfig(task_type=TaskType.SEQ_CLS, **_lora_hparams)

# Wrap the model with the adapters and move it to the GPU when one exists.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
model = get_peft_model(model, lora_config).to(device)

print("\nLoRA modules added. Trainable params summary:")
model.print_trainable_parameters()

# Collator that pads the examples of each batch with the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric objects used by compute_metrics, loaded in a fixed order.
accuracy, precision, recall, f1 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        A dict merging the results of the accuracy, precision, recall and f1
        metric objects.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    shared = {"predictions": predicted, "references": references}

    metrics = {}
    metrics.update(accuracy.compute(**shared))
    try:
        metrics.update(precision.compute(**shared, average="macro", zero_division=0))
    except TypeError:
        # Retry without ``zero_division`` when that keyword is rejected.
        metrics.update(precision.compute(**shared, average="macro"))
    for macro_metric in (recall, f1):
        metrics.update(macro_metric.compute(**shared, average="macro"))
    return metrics

# Training configuration. Evaluation and checkpointing both run once per
# epoch, which is what load_best_model_at_end needs to pick the checkpoint
# with the best macro F1. The former ``eval_steps=200`` was dropped: it only
# applies to step-based evaluation and contradicted ``eval_strategy="epoch"``.
training_args = TrainingArguments(
    output_dir="label-guidance-model-lora",
    learning_rate=1.5e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Mixed precision only when a GPU is present.
    fp16=torch.cuda.is_available(),
    label_smoothing_factor=0.1,
    logging_steps=50,
    # Tie the trainer's own seeding to the notebook-wide SEED.
    seed=SEED,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Always start a fresh run. Resuming (resume_from_checkpoint=True) is only
# valid when the checkpoints in output_dir were written with the same LoRA
# configuration; loading checkpoints saved with a different rank fails with a
# state_dict size-mismatch RuntimeError.
trainer.train()

eval_results = trainer.evaluate(eval_dataset)
print("Final evaluation results:", eval_results)

# Persist the adapter weights and the tokenizer next to the training output.
_TOKENIZER_DIR = "label-guidance-model-lora/tokenizer"
PEFT_OUTPUT_DIR = "label-guidance-model-lora/peft_adapter"
model.save_pretrained(PEFT_OUTPUT_DIR)
tokenizer.save_pretrained(_TOKENIZER_DIR)
print(f"Saved PEFT adapter to {PEFT_OUTPUT_DIR} and tokenizer to {_TOKENIZER_DIR}")

# Best-effort export of the wrapped model; a failure is reported, not raised.
# NOTE(review): ``base_model`` is the PEFT wrapper around the transformer, so
# the weights written here presumably still contain the injected LoRA layers
# rather than a plain checkpoint - confirm this directory can be loaded as a
# standalone model, or export via ``merge_and_unload()`` if that is the goal.
try:
    trainer.model.base_model.save_pretrained("label-guidance-model-lora/base_model")
except Exception as e:
    print("Skipping saving base model (too large or unsupported):", e)
else:
    print("Saved base model.")


CUDA available: True
CUDA devices: 1
Device 0 name: Tesla T4
num_labels = 3
Label classes: ['No' 'To some extent' 'Yes']
Label ID:                             conversation_id   tutor_name  \
0  221-362eb11a-f190-42a6-b2a4-985fafdcfa9e       Sonnet   
1  221-362eb11a-f190-42a6-b2a4-985fafdcfa9e    Llama318B   
2  221-362eb11a-f190-42a6-b2a4-985fafdcfa9e  Llama31405B   
3  221-362eb11a-f190-42a6-b2a4-985fafdcfa9e         GPT4   
4  221-362eb11a-f190-42a6-b2a4-985fafdcfa9e      Mistral   

                                          input_text label_mistake  \
0  tutor: hi, could you please provide a step-by-...           Yes   
1  tutor: hi, could you please provide a step-by-...           Yes   
2  tutor: hi, could you please provide a step-by-...           Yes   
3  tutor: hi, could you please provide a step-by-...           Yes   
4  tutor: hi, could you please provide a step-by-...           Yes   

   label_guidance  label_guidance_id  
0             Yes                  2  
1  To som

  .apply(lambda x: x.sample(min_class_size, random_state=SEED))


Map:   0%|          | 0/1509 [00:00<?, ? examples/s]

Map:   0%|          | 0/1207 [00:00<?, ? examples/s]

Map:   0%|          | 0/302 [00:00<?, ? examples/s]

deberta.encoder.layer.0.attention.self.query_proj
deberta.encoder.layer.0.attention.self.key_proj
deberta.encoder.layer.0.attention.self.value_proj
deberta.encoder.layer.0.attention.output.dense
deberta.encoder.layer.0.intermediate.dense
deberta.encoder.layer.0.output.dense
deberta.encoder.layer.1.attention.self.query_proj
deberta.encoder.layer.1.attention.self.key_proj
deberta.encoder.layer.1.attention.self.value_proj
deberta.encoder.layer.1.attention.output.dense
deberta.encoder.layer.1.intermediate.dense
deberta.encoder.layer.1.output.dense
deberta.encoder.layer.2.attention.self.query_proj
deberta.encoder.layer.2.attention.self.key_proj
deberta.encoder.layer.2.attention.self.value_proj
deberta.encoder.layer.2.attention.output.dense
deberta.encoder.layer.2.intermediate.dense
deberta.encoder.layer.2.output.dense
deberta.encoder.layer.3.attention.self.query_proj
deberta.encoder.layer.3.attention.self.key_proj
deberta.encoder.layer.3.attention.self.value_proj
deberta.encoder.layer.3.att

RuntimeError: Error(s) in loading state_dict for PeftModelForSequenceClassification:
	size mismatch for base_model.model.deberta.encoder.layer.0.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.0.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.0.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.0.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.0.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.0.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.0.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.0.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.0.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.0.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.0.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.0.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.1.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.1.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.1.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.1.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.1.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.1.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.1.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.1.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.1.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.1.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.1.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.1.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.2.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.2.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.2.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.2.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.2.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.2.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.2.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.2.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.2.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.2.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.2.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.2.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.3.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.3.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.3.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.3.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.3.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.3.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.3.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.3.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.3.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.3.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.3.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.3.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.4.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.4.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.4.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.4.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.4.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.4.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.4.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.4.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.4.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.4.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.4.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.4.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.5.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.5.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.5.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.5.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.5.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.5.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.5.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.5.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.5.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.5.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.5.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.5.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.6.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.6.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.6.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.6.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.6.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.6.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.6.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.6.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.6.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.6.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.6.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.6.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.7.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.7.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.7.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.7.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.7.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.7.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.7.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.7.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.7.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.7.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.7.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.7.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.8.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.8.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.8.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.8.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.8.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.8.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.8.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.8.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.8.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.8.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.8.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.8.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.9.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.9.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.9.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.9.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.9.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.9.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.9.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.9.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.9.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.9.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.9.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.9.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.10.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.10.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.10.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.10.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.10.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.10.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.10.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.10.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.10.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.10.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.10.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.10.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.11.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.11.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.11.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.11.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.11.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.11.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.11.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.11.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.11.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.11.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.11.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.11.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.12.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.12.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.12.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.12.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.12.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.12.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.12.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.12.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.12.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.12.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.12.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.12.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.13.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.13.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.13.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.13.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.13.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.13.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.13.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.13.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.13.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.13.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.13.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.13.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.14.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.14.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.14.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.14.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.14.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.14.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.14.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.14.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.14.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.14.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.14.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.14.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.15.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.15.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.15.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.15.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.15.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.15.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.15.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.15.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.15.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.15.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.15.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.15.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.16.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.16.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.16.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.16.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.16.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.16.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.16.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.16.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.16.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.16.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.16.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.16.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.17.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.17.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.17.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.17.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.17.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.17.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.17.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.17.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.17.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.17.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.17.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.17.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.18.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.18.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.18.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.18.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.18.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.18.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.18.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.18.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.18.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.18.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.18.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.18.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.19.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.19.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.19.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.19.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.19.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.19.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.19.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.19.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.19.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.19.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.19.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.19.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.20.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.20.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.20.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.20.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.20.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.20.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.20.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.20.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.20.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.20.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.20.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.20.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.21.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.21.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.21.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.21.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.21.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.21.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.21.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.21.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.21.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.21.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.21.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.21.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.22.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.22.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.22.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.22.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.22.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.22.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.22.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.22.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.22.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.22.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.22.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.22.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.23.attention.self.query_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.23.attention.self.query_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.23.attention.self.key_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.23.attention.self.key_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.23.attention.self.value_proj.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.23.attention.self.value_proj.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.23.attention.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.23.attention.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.23.intermediate.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.deberta.encoder.layer.23.intermediate.dense.lora_B.default.weight: copying a param with shape torch.Size([4096, 16]) from checkpoint, the shape in current model is torch.Size([4096, 32]).
	size mismatch for base_model.model.deberta.encoder.layer.23.output.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 4096]) from checkpoint, the shape in current model is torch.Size([32, 4096]).
	size mismatch for base_model.model.deberta.encoder.layer.23.output.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).
	size mismatch for base_model.model.pooler.dense.lora_A.default.weight: copying a param with shape torch.Size([16, 1024]) from checkpoint, the shape in current model is torch.Size([32, 1024]).
	size mismatch for base_model.model.pooler.dense.lora_B.default.weight: copying a param with shape torch.Size([1024, 16]) from checkpoint, the shape in current model is torch.Size([1024, 32]).

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer
from peft import AutoPeftModelForSequenceClassification
import json
import gc

# Locations of the saved PEFT adapter and its tokenizer.
PEFT_OUTPUT_DIR = "label-guidance-model-lora/peft_adapter"
TOKENIZER_DIR = "label-guidance-model-lora/tokenizer"
# Not referenced in this cell; kept so other cells that use the name still work.
BASE_MODEL = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, use_fast=False)

# device_map="auto" already places the weights (GPU first, then CPU/disk
# offload into `offload_folder`). The model must therefore NOT be moved again
# with .to(): that call is redundant when everything fits on one device and
# can fail once modules have been offloaded or sharded.
model = AutoPeftModelForSequenceClassification.from_pretrained(
    PEFT_OUTPUT_DIR,
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="offload",
)
model.eval()

# Device the tokenized inputs are moved to before each forward pass.
device = "cuda" if torch.cuda.is_available() else "cpu"

def download_data_and_show_metadata(filename):
    """Read *filename* as UTF-8 text and return its parsed JSON content.

    Despite the name, nothing is downloaded or printed: the file is opened
    locally and the decoded object is returned as-is.
    """
    with open(filename, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

# Parse the test-set JSON file into Python objects.
raw_data = download_data_and_show_metadata("testset.json")

def convert_conversation_to_samples(conversation: dict):
    """Flatten one conversation into one sample per tutor response.

    Args:
        conversation: Mapping with optional keys ``conversation_id``,
            ``conversation_history`` (str) and ``tutor_responses``
            (tutor name -> mapping holding a ``response`` string).

    Returns:
        A list of dicts with keys ``conversation_id``, ``tutor_name``,
        ``conversation_history``, ``response`` and ``input_text``. No label
        fields are emitted. The list is empty when the conversation has no
        tutor responses.
    """
    conversation_id = conversation.get("conversation_id")
    history = conversation.get("conversation_history", "")
    tutor_responses = conversation.get("tutor_responses", {})

    samples = []
    for tutor_name, resp_data in tutor_responses.items():
        response_text = resp_data.get("response", "")
        samples.append({
            "conversation_id": conversation_id,
            "tutor_name": tutor_name,
            # Raw (original-case) fields are kept for the exported output.
            "conversation_history": history,
            "response": response_text,
            # The training-time converter at the top of this notebook
            # lower-cases the model input; do the same here so the text the
            # model sees at inference is built the same way.
            "input_text": (history + "\nTutor Response: " + response_text).lower(),
        })
    return samples

def convert_dataset(raw_data_list: list):
    """Return the samples of every conversation as one flat list, in input order."""
    return [
        sample
        for conversation in raw_data_list
        for sample in convert_conversation_to_samples(conversation)
    ]

# Flatten the raw conversations into one row per tutor response.
test_samples = convert_dataset(raw_data)
df = pd.DataFrame(test_samples)

predictions = []

# torch.cuda.amp.autocast() is deprecated; torch.autocast("cuda", ...) is its
# replacement. Mixed precision is only enabled when CUDA is actually present,
# which also avoids the "CUDA is not available" warning on CPU-only machines.
use_amp = torch.cuda.is_available()

with torch.no_grad():
    # One sample per forward pass; predictions stay aligned with df row order.
    for idx, text in enumerate(df["input_text"].tolist()):
        inputs = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt",
        ).to(device)

        with torch.autocast(device_type="cuda", enabled=use_amp):
            outputs = model(**inputs)
            logits = outputs.logits
            # Predicted class id = index of the largest logit.
            pred = torch.argmax(logits, dim=-1).cpu().item()

        predictions.append(pred)

        # Best-effort memory release between samples.
        del inputs, outputs, logits
        torch.cuda.empty_cache()
        gc.collect()

        if idx % 100 == 0:
            print(f"Processed {idx}/{len(df)} samples")

df["predicted_label_id"] = predictions
df.to_csv("SECOND__Model_GUIDANCE_Predictions_For_TESTSET2.csv", index=False)
print(df.head())


  with torch.cuda.amp.autocast():


Processed 0/1214 samples
Processed 100/1214 samples
Processed 200/1214 samples
Processed 300/1214 samples
Processed 400/1214 samples
Processed 500/1214 samples
Processed 600/1214 samples
Processed 700/1214 samples
Processed 800/1214 samples
Processed 900/1214 samples
Processed 1000/1214 samples
Processed 1100/1214 samples
Processed 1200/1214 samples
                             conversation_id   tutor_name  \
0  4181-2ef5457c-9ae2-4c67-9f32-3d6d367d8c82  Llama31405B   
1  4181-2ef5457c-9ae2-4c67-9f32-3d6d367d8c82       Expert   
2  4181-2ef5457c-9ae2-4c67-9f32-3d6d367d8c82         Phi3   
3  4181-2ef5457c-9ae2-4c67-9f32-3d6d367d8c82         GPT4   
4  4181-2ef5457c-9ae2-4c67-9f32-3d6d367d8c82    Llama318B   

                                conversation_history  \
0  Tutor:  Hi, could you please provide a step-by...   
1  Tutor:  Hi, could you please provide a step-by...   
2  Tutor:  Hi, could you please provide a step-by...   
3  Tutor:  Hi, could you please provide a step-by...   
4

In [None]:
import pandas as pd

# Reload the prediction table from the CSV file of the same name written by the inference cell.
data = pd.read_csv('./SECOND__Model_GUIDANCE_Predictions_For_TESTSET2.csv')
# Last expression of the cell: displays the final five rows.
data.tail(5)

Unnamed: 0,conversation_id,tutor_name,conversation_history,response,input_text,predicted_label_id
1209,3340-9012188c-550a-4710-9f7c-cceec4006831,Phi3,"Tutor: Hi, could you please provide a step-by...","To find the area of a rectangle, multiply its ...","Tutor: Hi, could you please provide a step-by...",1
1210,3340-9012188c-550a-4710-9f7c-cceec4006831,Llama31405B,"Tutor: Hi, could you please provide a step-by...",I can see that you also set up a proportion an...,"Tutor: Hi, could you please provide a step-by...",1
1211,3340-9012188c-550a-4710-9f7c-cceec4006831,GPT4,"Tutor: Hi, could you please provide a step-by...","Good job on setting up the proportion, but the...","Tutor: Hi, could you please provide a step-by...",1
1212,3340-9012188c-550a-4710-9f7c-cceec4006831,Mistral,"Tutor: Hi, could you please provide a step-by...","""Great start using a proportion, but consider ...","Tutor: Hi, could you please provide a step-by...",1
1213,3340-9012188c-550a-4710-9f7c-cceec4006831,Gemini,"Tutor: Hi, could you please provide a step-by...","That's a great start, but remember, the more p...","Tutor: Hi, could you please provide a step-by...",1


In [None]:
# Class id -> annotation string.
# NOTE(review): assumes these ids match the label encoding used at training time — confirm.
mp = {
    0: "No",
    1: "To some extent",
    2: "Yes"
}

# Series.map leaves NaN for any id that is not a key of `mp`.
data["predicted_label"] = data["predicted_label_id"].map(mp)
# Last expression of the cell: displays the first five rows.
data.head(5)

Unnamed: 0,conversation_id,tutor_name,conversation_history,response,input_text,predicted_label_id,predicted_label
0,4181-2ef5457c-9ae2-4c67-9f32-3d6d367d8c82,Llama31405B,"Tutor: Hi, could you please provide a step-by...",It looks like there's still a bit of confusion...,"Tutor: Hi, could you please provide a step-by...",2,Yes
1,4181-2ef5457c-9ae2-4c67-9f32-3d6d367d8c82,Expert,"Tutor: Hi, could you please provide a step-by...",But you assumed x to be number of girls that d...,"Tutor: Hi, could you please provide a step-by...",2,Yes
2,4181-2ef5457c-9ae2-4c67-9f32-3d6d367d8c82,Phi3,"Tutor: Hi, could you please provide a step-by...",Great job! Can you explain how you arrived at ...,"Tutor: Hi, could you please provide a step-by...",0,No
3,4181-2ef5457c-9ae2-4c67-9f32-3d6d367d8c82,GPT4,"Tutor: Hi, could you please provide a step-by...",Great job correcting your mistake! Now you've ...,"Tutor: Hi, could you please provide a step-by...",0,No
4,4181-2ef5457c-9ae2-4c67-9f32-3d6d367d8c82,Llama318B,"Tutor: Hi, could you please provide a step-by...","Your revised solution looks good, but there's ...","Tutor: Hi, could you please provide a step-by...",1,To some extent


In [None]:
import pandas as pd

def convert_to_json(df):
    """Regroup a flat prediction table into one record per conversation.

    Rows are grouped by ``conversation_id`` (groups come back in sorted key
    order). Each record carries the conversation history of the group's first
    row and a ``tutor_responses`` mapping from tutor name to its response text
    and predicted ``Providing_Guidance`` label. If a tutor name repeats within
    a conversation, the last row wins.

    Args:
        df: DataFrame with columns ``conversation_id``, ``tutor_name``,
            ``conversation_history``, ``response`` and ``predicted_label``.

    Returns:
        A list of JSON-serialisable dicts, one per conversation.
    """
    conversations = []
    for conversation_id, rows in df.groupby("conversation_id"):
        responses_by_tutor = {
            tutor: {
                "response": reply,
                "annotation": {"Providing_Guidance": label},
            }
            for tutor, reply, label in zip(
                rows["tutor_name"], rows["response"], rows["predicted_label"]
            )
        }
        conversations.append(
            {
                "conversation_id": conversation_id,
                # Every row of a group is expected to share the same history.
                "conversation_history": rows["conversation_history"].iloc[0],
                "tutor_responses": responses_by_tutor,
            }
        )
    return conversations


In [None]:
# Regroup the flat prediction table (`data`) into one record per conversation.
json_data = convert_to_json(data)

import json

# Pretty-print the first conversation as a sanity check
# (raises IndexError if json_data is empty).
print(json.dumps(json_data[0], indent=2))

# Save the converted records, not the DataFrame; ensure_ascii=False writes
# non-ASCII characters unescaped.
with open("providing_guidance_test_data_predictions.json", "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=4, ensure_ascii=False)

print("JSON saved successfully ✅")

{
  "conversation_id": "01-374a3eb6-95cf-4725-9e76-86a8972aa5cb",
  "conversation_history": "Tutor:  Hi, could you please provide a step-by-step solution for the question below? The question is: Julia was preparing for a dinner party at her house, where she intended to serve stew.  She noticed that she was out of plastic spoons, so she bought a new package of spoons.  Later, her husband also bought a package of 5 new spoons and gave them to Julia.  While Julia was making the stew, she used three of the spoons to sample her stew.  Later, when she went to set the table, she had a total of 12 spoons.  How many spoons were in the package that Julia bought? \n Student: Let's call the number of spoons Julia bought \"x\". \nHer husband bought 5 more spoons, so the total number of spoons is now x + 5. \nJulia used 3 spoons to sample her stew, so she had 12 - 3 = 9 spoons left. \nWe know that the total number of spoons is x + 5, so we can set up an equation: \n\nx + 5 = 9 \n\nSubtracting 5 from