In [11]:
import random
import numpy as np


## katchi code

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DebertaTokenizer, DebertaForSequenceClassification, Trainer, TrainingArguments
import torch

# load data
df = pd.read_csv('final.csv')  # replace with your csv file
df.info()

# Use the sample data on my local machine. Please change it when running on VM
texts = random.sample(df['info_sentence'].tolist(), 100)
labels = random.sample(df['meta'].tolist(), 100)


# Split into training and validation before converting to tensors
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

# load tokenizer and model
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

# tokenize data for training set
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512, return_tensors='pt')
train_encodings['labels'] = torch.tensor(train_labels)

# tokenize data for validation set
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512, return_tensors='pt')
val_encodings['labels'] = torch.tensor(val_labels)

# Convert encodings to a Dataset format
class CardInfoDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12489 entries, 0 to 12488
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   info_sentence  12489 non-null  object
 1   meta           12489 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 195.3+ KB


In [15]:
from sklearn.metrics import accuracy_score

train_dataset = CardInfoDataset(train_encodings)
val_dataset = CardInfoDataset(val_encodings)

#function to compute accuracy
def compute_accuracy(preds, labels):
    preds = np.argmax(preds, axis=1)
    return accuracy_score(labels, preds)


# define the training args
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
)

model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=len(set(labels)))

# create the trainer
trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset,
    compute_metrics=lambda pred: {"accuracy": compute_accuracy(pred.predictions, pred.label_ids)}             
)

# train the model
trainer.train()


# save the model
model.save_pretrained("./deberta_model")

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.d

  0%|          | 0/15 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'train_runtime': 98.0359, 'train_samples_per_second': 2.448, 'train_steps_per_second': 0.153, 'train_loss': 0.648587417602539, 'epoch': 3.0}


### Testing case

In [19]:
# ... existing code ...

# Input data for inference
input_data = ["Magician Girl", "Blue-Eyes White Dragon", "Saint Dragon - The God of Osiris"]

# Preprocess the input data
input_encodings = tokenizer(input_data, truncation=True, padding=True)

# Create a dataset for inference
inference_dataset = CardInfoDataset(input_encodings)

# Perform inference using the trained model
predictions = trainer.predict(test_dataset=inference_dataset)

# Get the predicted labels
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Print the predicted labels
for input_text, label in zip(input_data, predicted_labels):
    print(f"Input: {input_text}\tPredicted Label: {label}")


  0%|          | 0/1 [00:00<?, ?it/s]

Input: Magician Girl	Predicted Label: 0
Input: Blue-Eyes White Dragon	Predicted Label: 0
Input: Saint Dragon - The God of Osiris	Predicted Label: 0
