In [None]:
# Shell escape: run the `nvidia-smi` command-line tool and show its output in the cell.
# NOTE(review): presumably a sanity check that a GPU is attached — the command
# is only available on machines with the NVIDIA driver installed.
!nvidia-smi

In [None]:
!pip install -q transformers[torch] tensorboardx simpletransformers
!pip install accelerate -U

In [None]:
import gc
import logging
import os
import random
from ast import literal_eval
from collections import Counter
from urllib import request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Import Data

In [None]:
# Google Drive file id of the tab-separated training data, and the URL built from it.
file_id = "1XOafk3wcP2RcTu1MHXoR_IJBIZseqTn8"
url = f"https://drive.google.com/uc?id={file_id}"

# Read the TSV and, in the same chain, keep only the columns whose name does
# not match '^Unnamed'.
train_model_df = (
    pd.read_csv(url, sep="\t")
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
)

# Split Train and Validation

In [None]:
# Hold out 20% of the rows for validation. Stratifying on `target_flag` keeps
# the label proportions the same in both splits, and the fixed `random_state`
# makes the split repeatable. Only the frame itself is split: the label column
# travels with it, so there is no need to pass it as a second array.
train_df, val_df = train_test_split(
    train_model_df,
    test_size=0.2,
    random_state=42,
    stratify=train_model_df["target_flag"],
)

# Report the mean of `target_flag` as a plain number; formatting the result of
# `DataFrame.agg` would print a multi-line Series repr instead.
print(f"train size: {len(train_df)}, target_rate: {train_df['target_flag'].mean():.4f}")
print(f"val size: {len(val_df)}, target_rate: {val_df['target_flag'].mean():.4f}")

In [None]:
# Restrict both splits to the same three columns, in this order.
selected_cols = ["keyword", "text", "target_flag"]
train_df, val_df = (split[selected_cols] for split in (train_df, val_df))

# Preview the first rows of the training split.
train_df.head()

# Functions

In [None]:
def get_device():
    """Pick the torch device to run on: CUDA first, then Apple MPS, then CPU.

    Returns:
        torch.device: ``cuda`` if CUDA is available, otherwise ``mps`` if the
        MPS backend exists and is available, otherwise ``cpu``.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")

    # Older PyTorch builds have no `torch.backends.mps` submodule at all, so
    # look it up defensively instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")

    return torch.device("cpu")

def plot_f1(train_f1_list, val_f1_list):
    """Plot the train and validation F1 histories on a single set of axes.

    Args:
        train_f1_list: Sequence of train F1 scores, one per recorded step.
        val_f1_list: Sequence of validation F1 scores, one per recorded step.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots()
    # The x position is simply the index of each score within its list.
    ax.plot(np.arange(len(train_f1_list)), train_f1_list, label="train")
    ax.plot(np.arange(len(val_f1_list)), val_f1_list, label="val")
    # Label the axes so the figure can be read without the surrounding code.
    ax.set_xlabel("epoch")
    ax.set_ylabel("F1 score")
    ax.set_title("F1 score per epoch")
    ax.legend()
    plt.show()

def augment_text(row, deletion_prob=0.0, swap_prob=0.7, pos=3):
    """Return a noisy copy of ``row['text']`` made by swapping and dropping tokens.

    The text is split on whitespace. Walking left to right, each position ``i``
    (while ``i + pos`` is still inside the list) swaps with the token ``pos``
    places ahead with probability ``swap_prob``. Afterwards every token is
    independently dropped with probability ``deletion_prob``. Randomness comes
    from the global ``random`` module.

    Args:
        row: Mapping-like object (for example a DataFrame row) with a string
            under the ``'text'`` key.
        deletion_prob: Probability of dropping each token; ``0.0`` never drops
            a token and ``1.0`` drops all of them.
        swap_prob: Probability of performing the swap at each position.
        pos: Distance between the two swapped tokens. Texts with ``pos`` or
            fewer tokens are never swapped.

    Returns:
        str: The remaining tokens joined by single spaces (possibly empty).
    """
    tokens = row['text'].split()

    # Swaps are applied in place, so a token moved forward can be moved again
    # when the walk reaches its new position.
    for i in range(len(tokens) - pos):
        if random.random() < swap_prob:
            tokens[i], tokens[i + pos] = tokens[i + pos], tokens[i]

    # random.random() lies in [0.0, 1.0). Keeping a token when the draw is
    # >= deletion_prob means 0.0 can never drop a token (a strict ">" would
    # drop it on a draw of exactly 0.0) and 1.0 always drops.
    kept_tokens = [token for token in tokens if random.random() >= deletion_prob]

    return ' '.join(kept_tokens)

def train_model(train_df, val_df, custom_args, cols=['text', 'target_flag'], epochs=3, is_save=True, is_swap=False, start_swap_epoch=0):
    """Fine-tune DistilBERT pass by pass, stopping once validation F1 stops improving.

    Each pass calls ``model.train_model`` once on a fresh copy of ``train_df``
    (optionally token-swapped with ``augment_text``), then evaluates on that
    copy and on ``val_df``. If the validation F1 beats the best seen so far the
    model is (optionally) saved; otherwise the loop stops immediately.

    Args:
        train_df: Training frame containing the columns named in ``cols``.
        val_df: Validation frame containing the columns named in ``cols``.
        custom_args: Dict of model arguments passed to ``ClassificationModel``.
            Its ``"learning_rate"`` and ``"weight_decay"`` values name the
            checkpoint directory.
        cols: Text column and label column, in that order.
        epochs: Maximum number of train/evaluate passes.
        is_save: Save the best model under ``/models/<lr>_<weight_decay>`` and
            reload it at the end. When False the returned model is the one
            left after the last pass, which is not necessarily the best one.
        is_swap: Apply ``augment_text`` to the training text before a pass.
        start_swap_epoch: First pass index (0-based) at which swapping starts.

    Returns:
        tuple: ``(best_model, train_f1, val_f1, train_f1_list, val_f1_list)``.
        ``train_f1`` and ``val_f1`` are the returned model's F1 on the
        un-augmented ``train_df`` and on ``val_df``; the two lists hold the
        per-pass scores.

    Raises:
        Exception: If ``config.json`` is missing after saving a checkpoint.
    """
    # Work on a private list so the shared default argument is never exposed.
    cols = list(cols)

    # Use the full string form of each value. Truncating to four characters
    # made e.g. 3e-05 and 3e-06 both become "3e-0" and share one directory.
    save_path = f'/models/{custom_args["learning_rate"]}_{custom_args["weight_decay"]}'

    # Only touch the filesystem when a checkpoint will actually be written.
    if is_save:
        os.makedirs(save_path, exist_ok=True)

    # Create a ClassificationModel with custom hyperparameters
    model = ClassificationModel(
        "distilbert",
        "distilbert-base-uncased",
        num_labels=2,
        args=custom_args
    )

    train_f1_list = []
    val_f1_list = []

    best_f1 = -1
    checkpoint_saved = False

    for i in range(epochs):

        _train_df = train_df.copy()
        if is_swap and i >= start_swap_epoch:
            # The swap distance grows with the pass index (pos = i + 1).
            _train_df["text"] = _train_df.apply(lambda row: augment_text(row, pos=i + 1), axis=1)

        model.train_model(_train_df[cols], eval_df=val_df[cols])

        # Only the metrics dict is needed; model outputs and wrong predictions are discarded.
        train_result, _, _ = model.eval_model(_train_df[cols])
        val_result, _, _ = model.eval_model(val_df[cols])
        train_f1_list.append(train_result["f1_score"])
        val_f1_list.append(val_result["f1_score"])

        if val_result["f1_score"] > best_f1:
            best_f1 = val_result["f1_score"]
            if is_save:
                model.model.save_pretrained(save_path)
                model.tokenizer.save_pretrained(save_path)
                model.config.save_pretrained(f'{save_path}/')
                if not os.path.isfile(os.path.join(save_path, 'config.json')):
                    raise Exception("Model not saved correctly. 'config.json' not found.")
                checkpoint_saved = True
        else:
            # No improvement over the best validation F1: stop without any patience.
            print(f"Early stop at: {i}")
            break

        print(f"Epoch {i} train: {train_result['f1_score']}, val: {val_result['f1_score']}")

    if is_save and checkpoint_saved:
        # Reload the best checkpoint with the same arguments as the original
        # model so evaluation settings (e.g. batch size) stay consistent.
        best_model = ClassificationModel(
            "distilbert",
            save_path,
            args=custom_args
        )
    else:
        # Nothing was saved (saving disabled, or no pass ran): fall back to the in-memory model.
        best_model = model

    # Score on the original, un-augmented training data. This does not depend
    # on a loop variable, so it also works when ``epochs`` is 0.
    train_result, _, _ = best_model.eval_model(train_df[cols])
    val_result, _, _ = best_model.eval_model(val_df[cols])

    return best_model, train_result["f1_score"], val_result["f1_score"], train_f1_list, val_f1_list

# Paraphrase by T5

In [None]:
# Generate one sampled paraphrase for every training text whose label is 1.0.
target_one_text = train_df.loc[train_df["target_flag"] == 1.0, "text"].tolist()

device = get_device()

tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").to(device)


paraphrase_list = []

# Iterate the list directly so tqdm knows the total and can draw a real progress bar.
for sentence in tqdm(target_one_text):

    # NOTE(review): the tokenizer may already append the end-of-sequence token
    # itself — confirm whether the explicit " </s>" is still wanted.
    text = "paraphrase: " + sentence + " </s>"

    # Encode one sentence at a time; a single sequence needs no padding, so the
    # deprecated `pad_to_max_length` flag is dropped.
    encoding = tokenizer(text, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    outputs = model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        max_length=256,
        do_sample=True,
        top_k=240,
        top_p=0.99,
        early_stopping=True,
        num_return_sequences=1
    )

    for output in outputs:
        line = tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        paraphrase_list.append(line)

    # Drop unreachable Python objects first, then release the cached memory of
    # whichever accelerator is actually in use (nothing to release on CPU).
    gc.collect()
    if device.type == "cuda":
        torch.cuda.empty_cache()
    elif device.type == "mps":
        torch.mps.empty_cache()

In [None]:
# Wrap the generated paraphrases in a frame; the scalar label 1.0 is broadcast
# to every row.
paraphrase_dict = {
    "text": paraphrase_list,
    "target_flag": 1.0,
}

paraphrase_df_1 = pd.DataFrame(paraphrase_dict)

# Stack the paraphrases on top of the original training rows. `ignore_index`
# renumbers the rows so the combined frame has no duplicate index labels.
# Columns that exist only in `train_df` are NaN for the paraphrase rows.
train_all_df = pd.concat([paraphrase_df_1, train_df], ignore_index=True)

# Train Model

In [None]:
# One batch size shared by training and evaluation.
batch_size = 32

# Model arguments handed to `train_model` as `custom_args`.
best_params = dict(
    learning_rate=3e-5,
    train_batch_size=batch_size,
    eval_batch_size=batch_size,
    weight_decay=0.01,
    optimizer="AdamW",
    num_train_epochs=1,
    dropout_rate=0.1,
    overwrite_output_dir=True,
)

# Text column first, label column second.
cols = ['text', 'target_flag']

# Release cached CUDA memory and collect garbage before training starts.
# NOTE(review): the name `model` is still bound to the T5 paraphraser from the
# earlier cell until the assignment below rebinds it — confirm whether that
# model should be freed first.
torch.cuda.empty_cache()
gc.collect()

# Train on the paraphrase-augmented data for up to 5 passes, saving the best
# checkpoint and enabling token swapping from pass index 1 onwards.
model, train_f1, val_f1, train_f1_list, val_f1_list = train_model(
    train_all_df, val_df, best_params,
    cols=cols, epochs=5, is_save=True, is_swap=True, start_swap_epoch=1,
)