In [None]:
import pandas as pd
import re
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import accelerate
import torch.nn as nn
from torch.optim import Adam

In [None]:
from tqdm.auto import tqdm

In [None]:
# Read the training and validation splits (in that order) from their CSV files.
train_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'hpc_space/MBERT5/train_data_mid_5.csv',
        'hpc_space/MBERT5/val_data_mid_5.csv',
    )
)

In [None]:
# Shuffle all training rows (frac=1) with a fixed seed so the order is reproducible.
# NOTE(review): re-running this cell shuffles the already-shuffled frame again.
train_data = train_data.sample(frac=1, random_state=42)

In [None]:
def clean_txt(text):
    """Normalise raw text for chunking.

    Apostrophes are deleted outright (so "don't" becomes "dont"), then every
    run of non-word characters is collapsed into a single space.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    without_apostrophes = re.sub("'", "", text)
    return re.sub("(\\W)+", " ", without_apostrophes)

In [None]:
# Clean the text column of both splits in place, training split first.
for frame in (train_data, val_data):
    frame['text'] = frame['text'].apply(clean_txt)

In [None]:
def get_split(text1, chunk_size=500, overlap=50):
    """Split a text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Splitting
    stops as soon as a chunk reaches the end of the text, so no trailing
    chunk made only of already-covered words is produced.

    Args:
        text1: the text to split.
        chunk_size: maximum number of words per chunk.
        overlap: number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words joined by single spaces). Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: if ``overlap`` is negative or not smaller than
            ``chunk_size``.
    """
    # NOTE(review): chunks are sized in words, while the tokenizer cell
    # truncates at 512 tokens; a 500-word chunk may lose its tail there
    # -- confirm the default is intended.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text1.split()
    step = chunk_size - overlap
    l_total = []

    for start_idx in range(0, len(words), step):
        end_idx = start_idx + chunk_size
        l_total.append(" ".join(words[start_idx:end_idx]))
        # This chunk already covers the final word: anything further would
        # only repeat words from the overlap region.
        if end_idx >= len(words):
            break

    return l_total

In [None]:
# Attach the list of overlapping chunks to every document in both splits.
for frame in (train_data, val_data):
    frame['text_split'] = frame['text'].apply(get_split)

In [None]:
# Flatten the per-document chunk lists: one entry per chunk, each paired with
# its parent document's 'outcome' label and original row index.
train_l, label_l, index_l = [], [], []

for idx, row in train_data.iterrows():
    chunks = row['text_split']
    train_l.extend(chunks)
    label_l.extend([row['outcome']] * len(chunks))
    index_l.extend([idx] * len(chunks))

In [None]:
# Same flattening for the validation split: every chunk inherits the
# 'outcome' label and row index of the document it was cut from.
val_l, val_label_l, val_index_l = [], [], []

for idx, row in val_data.iterrows():
    chunks = row['text_split']
    val_l.extend(chunks)
    val_label_l.extend([row['outcome']] * len(chunks))
    val_index_l.extend([idx] * len(chunks))

In [None]:
# Chunk-level training table: one row per chunk with its inherited label.
train_columns = {'text': train_l, 'label': label_l}
train_df = pd.DataFrame.from_dict(train_columns)
train_df.head()

In [None]:
# Chunk-level validation table: one row per chunk with its inherited label.
val_columns = {'text': val_l, 'label': val_label_l}
val_df = pd.DataFrame.from_dict(val_columns)
val_df.head()

In [None]:
# Plain Python lists of the training chunks and their labels.
train_texts, train_labels = (
    train_df[column].tolist() for column in ('text', 'label')
)

In [None]:
# Plain Python lists of the validation chunks and their labels.
eval_texts, eval_labels = (
    val_df[column].tolist() for column in ('text', 'label')
)

In [None]:
import torchvision
import torchaudio

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login, run before the checkpoint is loaded.
# NOTE(review): presumably needed because the checkpoint is private and/or for
# the later push_to_hub calls -- confirm.
notebook_login()

In [None]:
# Tokenizer and classifier come from the same checkpoint; the classification
# head is configured for two labels.
checkpoint = "JoppeK/MBERT_copy"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
def tokenize_function(example):
    """Tokenize ``example["text"]`` to a fixed length of 512 tokens.

    Longer inputs are truncated and shorter ones padded to ``max_length``.
    Uses the module-level ``tokenizer``.

    Args:
        example: a mapping with a "text" entry, passed straight to the
            tokenizer.

    Returns:
        Whatever the tokenizer returns for that text.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(example["text"], **tokenizer_options)

In [None]:
from datasets import Dataset

In [None]:
# Convert the chunk-level training frame into a Dataset, resetting the index first.
raw_dataset_train = Dataset.from_pandas(train_df.reset_index(drop=True))
print(raw_dataset_train)

In [None]:
# Convert the chunk-level validation frame into a Dataset, resetting the index first.
raw_dataset_val = Dataset.from_pandas(val_df.reset_index(drop=True))
print(raw_dataset_val)

In [None]:
# Run tokenize_function over the training chunks in batches.
tokenized_dataset_train = raw_dataset_train.map(tokenize_function, batched=True)
tokenized_dataset_train

In [None]:
# Run tokenize_function over the validation chunks in batches.
tokenized_dataset_val = raw_dataset_val.map(tokenize_function, batched=True)

In [None]:
from transformers import DataCollatorWithPadding

# Collator handed to the DataLoaders as collate_fn to assemble batches.
# NOTE(review): tokenize_function already pads every example to
# max_length=512, so this collator likely has nothing left to pad -- confirm
# whether dynamic padding was intended instead.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Drop the raw text, rename the target column to "labels" (batches are later
# passed as model(**batch)), and have the dataset return torch tensors.
tokenized_dataset_train = (
    tokenized_dataset_train
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_dataset_train.set_format("torch")
tokenized_dataset_train.column_names

In [None]:
# Same clean-up for the validation set: no raw text, target column named
# "labels", torch tensors on access.
tokenized_dataset_val = (
    tokenized_dataset_val
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_dataset_val.set_format("torch")
tokenized_dataset_val.column_names

In [None]:
from torch.utils.data import DataLoader

# Settings shared by both loaders; only the training loader shuffles.
loader_kwargs = {"batch_size": 18, "collate_fn": data_collator}

train_dataloader = DataLoader(tokenized_dataset_train, shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_dataset_val, **loader_kwargs)

In [None]:
# Pull a single training batch and show the shape of each tensor in it.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

In [None]:
# Sanity-check one forward pass on the sample batch. No backward pass follows,
# so run it without autograd: otherwise `outputs` keeps the whole computation
# graph (and its activations) alive in the kernel until it is overwritten.
with torch.no_grad():
    outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

In [None]:
# Use PyTorch's AdamW: the copy formerly exported by `transformers` is
# deprecated in favour of it and is no longer importable from recent releases.
# eps and weight_decay are pinned to the values the transformers version used
# by default (1e-6 and 0.0), so the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0
)

In [None]:
from transformers import get_scheduler

# Linear learning-rate schedule without warmup over all optimisation steps
# (epochs x batches per epoch); the step count is printed for reference.
# NOTE(review): the training cell further down recomputes num_epochs and
# num_training_steps and rebuilds lr_scheduler after accelerator.prepare(),
# so the scheduler created here is overwritten before it is ever stepped.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [None]:
from tqdm.auto import tqdm
from transformers import get_scheduler
from accelerate import Accelerator

accelerator = Accelerator()

# Let accelerate place the loaders, model and optimizer on its device.
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

# The schedule length is computed after prepare(), from the prepared loader.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Per-class loss weights: errors on class 1 count twice as much as on class 0.
# Created once here instead of on every training step.
class_weights = torch.tensor(
    [1.0, 2.0], dtype=torch.float32, device=accelerator.device
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        outputs = model(**batch)
        labels = batch["labels"]
        # outputs.loss is already averaged over the batch, so multiplying it
        # by per-example weights only rescales the whole batch by its mean
        # weight. Compute the unreduced per-example cross-entropy from the
        # logits, weight each example by its class, then average.
        per_example_loss = nn.functional.cross_entropy(
            outputs.logits, labels, reduction="none"
        )
        loss = (per_example_loss * class_weights[labels]).mean()
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
from huggingface_hub import notebook_login

# Second interactive Hub login in this notebook, placed before the uploads.
# NOTE(review): likely redundant if the earlier login is still valid -- confirm.
notebook_login()

In [None]:
# Upload the fine-tuned weights to the Hub repository "MBERT_52".
# `model` went through accelerator.prepare(), which can hand back a wrapper
# (e.g. for distributed training) that has no push_to_hub method; unwrap it
# first so the call always reaches the underlying transformers model.
accelerator.unwrap_model(model).push_to_hub("MBERT_52")

In [None]:
# Upload the tokenizer to the same Hub repository name used for the model.
tokenizer.push_to_hub("MBERT_52")