# File Configuration

In [None]:
# !pip install colabcode
!pip install transformers
!pip install datasets
!pip install sentencepiece

In [None]:
# from colabcode import ColabCode
# ColabCode(port=10000, authtoken=os.environ['COLABCODE_AUTHTOKEN'], mount_drive=True)  # token redacted: read it from the environment, never commit it; revoke the one that was exposed here

In [None]:
# Mount Google Drive at /content/gdrive so the project folder under MyDrive is reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Work from the project folder on Drive; the relative paths in CFG resolve against it.
%cd /content/gdrive/MyDrive/Kaggle\ Competition/US_Patent

/content/gdrive/MyDrive/Kaggle Competition/US_Patent


In [None]:
# Print the NVIDIA driver/GPU status for this runtime.
!nvidia-smi

# Imports

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import sentencepiece
from datasets import load_metric
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, AdamW
from transformers import BertTokenizerFast, BertTokenizer, BertForSequenceClassification
from transformers import DebertaV2ForSequenceClassification, DebertaV2Tokenizer
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Select the compute device: CUDA when a GPU is visible, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


# Model Config

In [None]:
class CFG:
  """Paths and hyperparameters shared by the cells of this notebook."""

  # Optimisation settings.
  epochs = 5
  batch_size = 32
  learning_rate = 1e-4
  weight_decay = 0.01
  num_fold = 5  # not referenced in the visible cells

  # Checkpoints: hub model used for the tokenizer and for training, and a local
  # directory from which a saved model is reloaded.
  # (alternative hub checkpoint tried earlier: 'distilbert-base-cased')
  model_path = 'microsoft/deberta-v3-base'
  local_model_path = 'Matthew_models/model_3'

  # CSV locations, relative to the working directory.
  train_path = 'data/stage2_data/train_df.csv'
  val_path = 'data/stage2_data/val_df.csv'
  test_path = 'data/stage2_data/test_df.csv'
  real_test_path = 'data/test_df.csv'

# Data & Dataset

In [None]:
# Load the processed training table and preview its first rows.
df = pd.read_csv('data/processed/train.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,anchor,target,context,score,context_text,concat,concat_vec,code,title,section,class,subclass,group,main_group
0,0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,Human Necessities,abatement # abatement of pollution # Human Nec...,"[101, 19557, 18532, 4765, 1001, 19557, 18532, ...",A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
1,1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,Human Necessities,abatement # act of abating # Human Necessities,"[101, 19557, 18532, 4765, 1001, 2552, 1997, 19...",A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
2,2,36d72442aefd8232,abatement,active catalyst,A47,0.25,Human Necessities,abatement # active catalyst # Human Necessities,"[101, 19557, 18532, 4765, 1001, 3161, 16771, 1...",A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
3,3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,Human Necessities,abatement # eliminating process # Human Necess...,"[101, 19557, 18532, 4765, 1001, 15349, 2832, 1...",A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
4,4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,Human Necessities,abatement # forest region # Human Necessities,"[101, 19557, 18532, 4765, 1001, 3224, 2555, 10...",A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,


In [None]:
class TrainDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text column and yields integer class labels.

    Args:
        df: DataFrame holding the text column and the score column.
        input: Name of the text column; its values are cast to ``str``.
        output: Name of the score column. Scores are multiplied by 4 and rounded
            to the nearest integer to form the label (e.g. 0.25 -> 1, 1.0 -> 4).
        tokenizer: Optional callable that encodes one text into a dict of
            features. When omitted, the notebook-level ``tokenizer`` is looked up
            at item-access time, so the dataset may be built before it exists.
    """

    def __init__(self, df, input='concat', output='score', tokenizer=None):
        self.inputs = df[input].values.astype(str)
        # Round before casting: a bare astype(int) truncates, so a score stored
        # as 0.7499999 would silently become class 2 instead of 3.
        self.labels = np.rint(df[output].values * 4).astype(int)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        # Fall back to the module-level tokenizer only when none was injected.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        return {
            **encode(self.inputs[item]),
            'labels': self.labels[item],
        }

In [None]:
# Shuffle once with a fixed seed, then cut 80% / 10% / 10% into train / val / test.
# Positional slicing replaces np.split(DataFrame, ...), which goes through the
# deprecated DataFrame.swapaxes and warns on recent pandas.
shuffled_df = df.sample(frac=1, random_state=42)
train_end, val_end = int(0.8 * len(df)), int(0.9 * len(df))
train_df = shuffled_df.iloc[:train_end]
val_df = shuffled_df.iloc[train_end:val_end]
test_df = shuffled_df.iloc[val_end:]

In [None]:
# Export of the in-memory split (disabled):
#   train_df.to_csv(CFG.train_path, index=False)
#   val_df.to_csv(CFG.val_path, index=False)
#   test_df.to_csv(CFG.test_path, index=False)

# Read the three split files from the paths configured in CFG; this replaces
# any train_df / val_df / test_df already in memory.
train_df, val_df, test_df = (
    pd.read_csv(split_path)
    for split_path in (CFG.train_path, CFG.val_path, CFG.test_path)
)

# Tokenizer

In [None]:
# Load the tokenizer for the checkpoint named in CFG.model_path.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Preprocessing

# Training

In [None]:
import numpy as np
def compute_metrics(eval_pred):
  """Score an evaluation pass by the Pearson correlation of predicted vs. true classes.

  Args:
    eval_pred: ``(predictions, labels)`` pair; ``predictions`` holds per-class
      scores with the classes on axis 1.

  Returns:
    ``{"pearsonr": r}`` with ``r`` a Python float; ``nan`` when the correlation
    is undefined (fewer than two samples, or either side is constant).
  """
  predictions, labels = eval_pred
  predicted_classes = np.argmax(predictions, axis=1)
  labels = np.asarray(labels)

  # Computed with NumPy directly: `datasets.load_metric` is deprecated and was
  # being re-loaded on every evaluation call.
  if labels.size < 2 or np.ptp(predicted_classes) == 0 or np.ptp(labels) == 0:
    return {"pearsonr": float("nan")}
  return {"pearsonr": float(np.corrcoef(predicted_classes, labels)[0, 1])}

In [None]:
def train(input_col='title', output_dir='models/model_2'):
  """Fine-tune CFG.model_path as a 5-label classifier and save the result.

  Args:
    input_col: Text column passed to TrainDataset for both splits. Defaults to
      'title', the value that was previously hard-coded.
      NOTE(review): the evaluation cells build TrainDataset with its default
      'concat' column — confirm which column is intended.
    output_dir: Directory used for the per-epoch checkpoints and for the final
      saved model (previously hard-coded in two places).

  Returns:
    The ``Trainer`` after training (not the bare model).
  """
  train_dataset = TrainDataset(train_df, input=input_col)
  val_dataset = TrainDataset(val_df, input=input_col)

  args = TrainingArguments(
      output_dir=output_dir,
      evaluation_strategy="epoch",
      save_strategy="epoch",
      learning_rate=CFG.learning_rate,
      per_device_train_batch_size=CFG.batch_size,
      per_device_eval_batch_size=CFG.batch_size,
      num_train_epochs=CFG.epochs,
      weight_decay=CFG.weight_decay,
      # compute_metrics returns the key "pearsonr"; it selects the best epoch.
      metric_for_best_model="pearsonr",
      load_best_model_at_end=True,
  )

  model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=5)

  trainer = Trainer(
      model,
      args,
      train_dataset=train_dataset,
      eval_dataset=val_dataset,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
  )

  trainer.train()
  trainer.save_model(output_dir)

  return trainer

In [None]:
# model = train()
# Training is disabled above; instead load a saved 5-label model from CFG.local_model_path.
model = AutoModelForSequenceClassification.from_pretrained(CFG.local_model_path, num_labels=5)


In [None]:
# Switch the loaded model to inference mode. nn.Module.eval() accepts no
# `eval_dataset` keyword (the former call raised TypeError); scoring a dataset
# is done through Trainer.evaluate instead. The trailing `;` hides the model repr.
model.eval();

In [None]:
# A model loaded with from_pretrained has no `.evaluate`; that method belongs to
# Trainer. Reuse `model` if it already is a Trainer (i.e. `model = train()` was
# run), otherwise wrap the bare model in an evaluation-only Trainer.
if isinstance(model, Trainer):
  evaluator = model
else:
  evaluator = Trainer(
      model=model,
      args=TrainingArguments(
          output_dir='models/eval',
          per_device_eval_batch_size=CFG.batch_size,
          report_to='none',
      ),
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
  )

# NOTE(review): this scores the training split; use val_df / test_df for a held-out estimate.
evaluator.evaluate(eval_dataset=TrainDataset(train_df))