<a href="https://colab.research.google.com/github/Metachondria/LLM_Kaggle_Competition/blob/main/LLM_Competion_Kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
import re

from tqdm.auto import tqdm

# Prefer the first CUDA device when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
# Read the competition's train and test splits, in that order.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/llm-classification-finetuning/train.csv',
        '/kaggle/input/llm-classification-finetuning/test.csv',
    )
)

In [None]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0


In [None]:
# Summarise column dtypes and non-null counts of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57477 entries, 0 to 57476
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              57477 non-null  int64 
 1   model_a         57477 non-null  object
 2   model_b         57477 non-null  object
 3   prompt          57477 non-null  object
 4   response_a      57477 non-null  object
 5   response_b      57477 non-null  object
 6   winner_model_a  57477 non-null  int64 
 7   winner_model_b  57477 non-null  int64 
 8   winner_tie      57477 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 3.9+ MB


# Preprocessing

In [None]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Raw string so that `\s` reaches the regex engine as written instead of being
# parsed as an (invalid) Python string escape; compiled once for reuse.
_NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9\s]")


def clean_text(text):
    """Lower-case `text`, strip non-alphanumerics, and lemmatise what is left.

    Args:
        text: The string to clean.

    Returns:
        A single space-joined string of the lemmas of all tokens that spaCy
        flags as neither stop words nor punctuation.
    """
    text = text.lower()
    # Drop every character that is not a letter, a digit, or whitespace.
    text = _NON_ALNUM_RE.sub('', text)
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc if not token.is_stop and not token.is_punct)

In [None]:
# Run the same cleaning routine over every free-text column of the training frame.
for text_column in ('prompt', 'response_a', 'response_b'):
    train_data[text_column] = train_data[text_column].apply(clean_text)

In [None]:
# Write the cleaned training frame to CSV, leaving out the index column.
train_data.to_csv('processed_train_data.csv', index=False)

In [None]:
# Load the tokenizer that belongs to the bert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [None]:
# One-hot winner columns as an (n_rows, 3) integer tensor.
labels = torch.tensor(train_data[['winner_model_a', 'winner_model_b', 'winner_tie']].values)
labels = labels.long()

# Textual form of the separator token, used to join the three text fields.
sep_id = tokenizer.sep_token_id
sep = tokenizer.decode(sep_id)

# One input string per row: prompt, response A and response B joined by the separator.
texts = [
    s1 + sep + s2 + sep + s3
    for s1, s2, s3 in zip(train_data.prompt, train_data.response_a, train_data.response_b)
]

# Tokenize in batches instead of one call per row: the resulting tensors are
# the same (every row is padded/truncated to 512 independently), but the
# per-call Python overhead is paid once per chunk rather than once per row.
tokenize_chunk_size = 1024
input_ids = []
attention_mask = []
for chunk_start in tqdm(range(0, len(texts), tokenize_chunk_size)):
    tokens = tokenizer(
        texts[chunk_start:chunk_start + tokenize_chunk_size],
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    input_ids.append(tokens['input_ids'])
    attention_mask.append(tokens['attention_mask'])

attention_mask_pt = torch.cat(attention_mask, dim=0)
input_ids_pt = torch.cat(input_ids, dim=0)

In [None]:
attention_mask_pt = attention_mask_pt.long()
input_ids_pt = input_ids_pt.long()
# Derive the class index (0 = model A wins, 1 = model B wins, 2 = tie) straight
# from the dataframe rather than from the current value of `labels`, so that
# re-running this cell does not try to argmax an already 1-D tensor.
labels = torch.argmax(
    torch.tensor(train_data[['winner_model_a', 'winner_model_b', 'winner_tie']].values).long(),
    dim=1,
)

In [None]:
# Bundle token ids, attention masks and labels so that each item is a
# (input_ids, attention_mask, label) triple.
dataset = torch.utils.data.TensorDataset(input_ids_pt, attention_mask_pt, labels)

In [None]:
# 80/20 train/validation split.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Seeded generator so the same rows land in each split on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Sanity check: the two splits together cover the whole dataset.
(len(train_dataset) + len(val_dataset)) == len(dataset)

True

In [None]:
from torch.utils.data import DataLoader

batch_size = 32

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True
)

# Validation only aggregates metrics over the full split, so shuffling adds
# nothing; a fixed order keeps evaluation deterministic.
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False
)

# Model

In [None]:
class Classifier(nn.Module):
    """BERT encoder followed by a single linear classification head.

    Args:
        num_class: Number of output classes (size of the logits' last dimension).
    """

    def __init__(self, num_class):
        super().__init__()
        self.bert = AutoModel.from_pretrained('google-bert/bert-base-uncased',
                                              attention_probs_dropout_prob=0.1,
                                              hidden_dropout_prob=0.1
                                             )
        self.fc = nn.Linear(self.bert.config.hidden_size, num_class)

    def forward(self, input_ids, attention_mask, labels=None):
        """Encode the batch and classify it from the encoder's pooled output.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional class indices; when given, the cross-entropy
                loss against the logits is computed as well.

        Returns:
            A dict with key "logits" (output of the linear head) and key
            "loss" (cross-entropy loss, or None when `labels` is None).
        """
        outputs_bert = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        pooler_output = outputs_bert['pooler_output']
        logits = self.fc(pooler_output)

        # Bind `loss` up front so inference calls (labels=None) return None
        # instead of failing on an unbound local.
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return {"logits": logits, "loss": loss}


# Training Loop

In [None]:
# Build the three-class classifier and move its parameters to the selected device.
model = Classifier(num_class=3).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
from torch.optim.lr_scheduler import StepLR

# AdamW hyperparameters gathered in one place.
adamw_options = dict(lr=2e-5, eps=1e-8, weight_decay=0.01)
optimizer = torch.optim.AdamW(model.parameters(), **adamw_options)

# StepLR multiplies the learning rate by `gamma` after every `step_size`
# calls to `scheduler.step()`.
scheduler = StepLR(optimizer=optimizer, step_size=5, gamma=0.1)

In [None]:
def compute_acc(predictions, labels):
    """Return the accuracy score of `predictions` measured against `labels`."""
    return accuracy_score(y_true=labels, y_pred=predictions)

In [None]:
# Pull one batch from the training loader and inspect its label tensor
# (the third element of each batch).
first_batch = next(iter(train_loader))
label = first_batch[2]
label

tensor([1, 2, 1, 0, 1, 0, 2, 1, 2, 2, 1, 2, 2, 2, 0, 1, 2, 2, 2, 1, 1, 2, 2, 0,
        0, 1, 1, 1, 1, 0, 0, 1])

In [None]:
num_epochs = 3
for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    total_train_loss = 0
    all_predictions = []
    all_labels = []

    for batch in tqdm(train_loader, total=len(train_loader)):
        inpt_ids = batch[0].to(device)
        attent_mask = batch[1].to(device)
        label = batch[2].to(device)

        optimizer.zero_grad()

        output = model(input_ids=inpt_ids, attention_mask=attent_mask, labels=label)
        loss = output['loss']
        logits = output['logits']

        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        all_predictions.extend(preds)
        all_labels.extend(label.cpu().numpy())

    # Advance the LR schedule once per epoch. Stepping it after every batch
    # makes a step-based schedule count batches instead of epochs, which
    # shrinks the learning rate to ~0 within the first few dozen batches.
    scheduler.step()

    acc = compute_acc(all_predictions, all_labels)
    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"  Training loss: {avg_train_loss:.4f}")
    print(f"  Training accuracy: {acc:.4f}")

    # ---- Validation pass ----
    model.eval()
    total_val_loss = 0
    all_val_predictions = []
    all_val_labels = []

    with torch.no_grad():
        for batch in val_loader:
            # `val_`-prefixed names keep this loop from overwriting the
            # notebook-level `input_ids`, `attention_mask` and `labels`
            # objects created by earlier cells.
            val_input_ids = batch[0].to(device)
            val_attention_mask = batch[1].to(device)
            val_labels = batch[2].to(device)

            outputs = model(input_ids=val_input_ids, attention_mask=val_attention_mask, labels=val_labels)
            loss = outputs['loss']
            logits = outputs['logits']

            total_val_loss += loss.item()

            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            all_val_predictions.extend(preds)
            all_val_labels.extend(val_labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_acc = compute_acc(all_val_predictions, all_val_labels)
    print(f"  Validation loss: {avg_val_loss:.4f}")
    print(f"  Validation accuracy: {val_acc:.4f}")



  0%|          | 0/1437 [00:00<?, ?it/s]

Epoch 1/3
  Training loss: 1.1013
  Training accuracy: 0.3432
  Validation loss: 1.1026
  Validation accuracy: 0.3406


  0%|          | 0/1437 [00:00<?, ?it/s]

Epoch 2/3
  Training loss: 1.0891
  Training accuracy: 0.3703
  Validation loss: 1.0818
  Validation accuracy: 0.3788


  0%|          | 0/1437 [00:00<?, ?it/s]

Epoch 3/3
  Training loss: 1.0767
  Training accuracy: 0.3933


In [None]:
# Save only the model's state dict (its parameters and buffers) to disk.
torch.save(model.state_dict(), 'model_weights.pth')

In [None]:
# Shell escape: print the NVIDIA driver's view of the GPU(s).
!nvidia-smi