In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

import torch
import numpy as np
import matplotlib.pyplot as plt
from balanced_loss import Loss
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset
from pathlib import Path
from .data.constants import LOCAL_MODELS_PATH, CHECKPOINTS_PATH

# --- Data ---------------------------------------------------------------
# TODO: set to the parquet file holding the dataset (left empty here).
dataset_path = ''
df = pd.read_parquet(dataset_path)

# --- Model / batching ---------------------------------------------------
# TODO: append the model directory name. Joining '' leaves the path unchanged,
# assuming LOCAL_MODELS_PATH is a pathlib.Path -- confirm.
BASE_MODEL_PATH = LOCAL_MODELS_PATH / ''
MAX_LEN = 512
batch_size = 8
TRAIN_BATCH_SIZE = batch_size
VALID_BATCH_SIZE = batch_size
TEST_BATCH_SIZE = batch_size

LIMIT_NUM_MODELS = 2

# --- Training -----------------------------------------------------------
SEED = 2
# Fall back to CPU when no CUDA device is visible.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
LEARNING_RATE = 2e-5
N_EPOCHS = 3

# --- Checkpoints --------------------------------------------------------
model_name = BASE_MODEL_PATH.name
CHECKPOINTS_DIR = CHECKPOINTS_PATH / model_name / 'checkpoints'
# Create the whole directory chain; do nothing if it already exists.
CHECKPOINTS_DIR.mkdir(exist_ok=True, parents=True)
# File-name template for saved checkpoints; fill with `.format(epoch=...)`.
MODEL_TO_SAVE_TEMPLE = 'model-{epoch}-epoch.pt'

In [None]:
# Hold out 20% of the rows for testing, then take 10% of the REMAINING
# training rows for validation. The second split must start from `df_train`,
# not `df`; otherwise the test rows leak into the train/validation sets.
df_train, df_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)
df_train, df_valid = train_test_split(df_train, test_size=0.1, shuffle=True, random_state=42)

# Give each split a fresh 0..n-1 index. Assignment (rather than inplace=True)
# avoids mutating frames derived from `df`.
df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        df: DataFrame holding the samples; rows are accessed positionally.
        tokenizer_path: Name or path passed to ``AutoTokenizer.from_pretrained``.
        input_column: Column containing the text to tokenize.
        output_column: Column containing the label; it is converted to a
            ``torch.long`` tensor, so values are assumed to be integer class
            ids -- TODO confirm against the dataset.
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, df, tokenizer_path, input_column, output_column, max_len):
        self.max_len = max_len
        self.df = df
        self.input_column = input_column
        self.output_column = output_column
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize the row at position ``index``.

        Returns:
            dict with the tokenizer outputs flattened to 1-D tensors, plus a
            ``labels`` entry holding the row's label as a ``torch.long`` tensor.
        """
        row = self.df.iloc[index]

        # Call the tokenizer directly (the documented replacement for
        # `encode_plus`). `truncation=True` is required so inputs longer than
        # `max_len` are cut instead of producing over-long tensors.
        inputs = self.tokenizer(
            row[self.input_column],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )
        # return_tensors='pt' yields a leading batch axis of size 1; drop it so
        # the DataLoader's collation adds the real batch dimension.
        inputs = {key: value.flatten() for key, value in inputs.items()}
        label = torch.tensor(row[self.output_column], dtype=torch.long)
        return dict(**inputs, labels=label)

# Wrap each split in a BERTDataset: 'Target' is the text column, 'Drug' the
# label column. Splits are built in train, valid, test order.
train_dataset, valid_dataset, test_dataset = (
    BERTDataset(split_frame, BASE_MODEL_PATH, 'Target', 'Drug', MAX_LEN)
    for split_frame in (df_train, df_valid, df_test)
)


In [1]:
# Training batches are reshuffled each epoch. torch.manual_seed(SEED) seeds the
# global RNG and returns its generator, so the shuffle order is reproducible.
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, generator=torch.manual_seed(SEED), num_workers=8, shuffle=True, pin_memory=True)
# Evaluation loaders keep dataset order: shuffling does not change metrics and
# would only make predictions harder to align with df_valid / df_test rows.
valid_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=8, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, num_workers=8, shuffle=False, pin_memory=True)

# Backward-compatible alias for the original misspelled name.
vallid_loader = valid_loader

NameError: name 'DataLoader' is not defined

In [None]:
# Load the pretrained backbone from the local model directory.
model = AutoModel.from_pretrained(BASE_MODEL_PATH)

# Make every backbone weight trainable (full fine-tuning). AutoModel returns
# the bare backbone, which has no `.bert` attribute; `base_model` resolves to
# the backbone whether or not the model is wrapped by a task head.
for param in model.base_model.parameters():
    param.requires_grad = True

model = model.to(DEVICE)