In [None]:
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset, DatasetDict
from torch import device, cuda
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, DataCollatorWithPadding, TrainingArguments, Trainer, get_scheduler

# Set up the Longformer checkpoint, its tokenizer, the classification model
# and the locations of the two review corpora.
checkpoint = "allenai/longformer-base-4096"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Register the two class names at load time so that the model config (and
# therefore anything saved from it during training) carries readable labels
# rather than the generic LABEL_0 / LABEL_1 defaults. The ids follow the 0/1
# encoding applied to the "label" column in the data preparation below:
# 0 = deceptive / computer-generated, 1 = truthful / original.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    id2label={0: "deceptive", 1: "truthful"},
    label2id={"deceptive": 0, "truthful": 1},
)
data_files = {
    "deceptive": "./deceptive-opinion.csv",
    "fake_reviews": "./fake reviews dataset.csv",
}

# Combine the two review corpora into train / validation splits.
# Each corpus is shuffled and split 80/20 on its own; the matching splits are
# then concatenated (fake-reviews rows first, deceptive-opinion rows after).

# Shuffle before the positional split, the same way the fake-reviews corpus is
# shuffled below. Without this, the validation slice is simply the first 20%
# of the file as stored on disk; if the CSV is grouped by label (presumably it
# is -- TODO confirm against the file), that slice would hold a single class.
decep_df = (
    pd.read_csv(data_files["deceptive"])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
decep_df = decep_df.drop(columns=["hotel", "polarity", "source"])
decep_df = decep_df.rename(columns={"deceptive": "label"})
split_index = int(0.2 * len(decep_df))
decep_df_train = decep_df[split_index:]
decep_df_test = decep_df[:split_index]


# Shuffle the fake-reviews corpus with a fixed seed and keep the first
# 7,680 rows of the shuffled frame.
fake_reviews = (
    pd.read_csv(data_files["fake_reviews"])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)[:7680]
)
fake_reviews = fake_reviews.rename(columns={"text_": "text"})
fake_reviews = fake_reviews.drop(columns=["category", "rating"])
split_index = int(0.2 * len(fake_reviews))
fk_train = fake_reviews[split_index:]
fk_test = fake_reviews[:split_index]

interleaved_train = pd.concat([fk_train, decep_df_train], ignore_index=True)
interleaved_test = pd.concat([fk_test, decep_df_test], ignore_index=True)
# Map both corpora's string labels onto one binary scheme:
# 0 = computer-generated ("CG") / deceptive, 1 = original ("OR") / truthful.
label_map = {
    "OR": 1,
    "CG": 0,
    "deceptive": 0,
    "truthful": 1,
}
interleaved_train = interleaved_train.replace({"label": label_map})
interleaved_test = interleaved_test.replace({"label": label_map})
dataset = DatasetDict()
dataset["train"] = Dataset.from_pandas(interleaved_train)
dataset["validation"] = Dataset.from_pandas(interleaved_test)
# Turn the "label" column into a class-label feature.
dataset = dataset.class_encode_column("label")

# Convert the raw review text into token ids the model can consume.
# Truncation is enabled because the Longformer checkpoint used here accepts
# at most 4,096 tokens per sequence.
def _tokenize_batch(batch):
    """Tokenize the "text" field of a batch of examples, truncating long inputs."""
    return tokenizer(batch["text"], truncation=True)


dataset = dataset.map(_tokenize_batch, batched=True)
# The raw text is no longer needed once tokenized, and the target column is
# renamed to "labels".
dataset = dataset.remove_columns(["text"]).rename_column("label", "labels")
dataset.set_format("torch")

# The collator assembles individual examples into a batch and pads them.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

train_dataloader = DataLoader(
    dataset["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
validation_dataloader = DataLoader(
    dataset["validation"],
    batch_size=8,
    collate_fn=data_collator,
)



# Trainer configuration: names the output directory and schedules evaluation
# once per epoch. Training previously ran out of memory, so the per-GPU
# training batch was reduced and gradient accumulation was added.
num_of_epochs = 4
training_args = TrainingArguments(
    output_dir="longformer-fake-review-detector-combined-v2",
    num_train_epochs=num_of_epochs,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    evaluation_strategy="epoch",
    # Hub upload / per-epoch saving are currently disabled:
    #     hub_private_repo=True,
    #     save_strategy="epoch",
    #     push_to_hub=True,
)

# Stand-alone optimizer and linear learning-rate schedule (no warm-up).
# The step count is epochs x batches in `train_dataloader`.
# NOTE(review): the Trainer constructed later in this file is not given an
# `optimizers=` argument, so as far as this file shows these two objects are
# not used by it -- confirm whether other code relies on them.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

total_training_steps = num_of_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

# Metrics reported for every evaluation pass, in output order.
_METRIC_NAMES = ("accuracy", "f1", "precision", "recall")
# Loaded metric modules, filled lazily so each one is loaded only once
# instead of on every evaluation call.
_metric_cache = {}


def compute_metrics(validation_predictions):
    """Score a set of validation predictions.

    Args:
        validation_predictions: A ``(logits, labels)`` pair. The predicted
            class for each example is the arg-max of ``logits`` over the
            last axis.

    Returns:
        A dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``, each holding the value computed by the ``evaluate``
        metric of the same name.
    """
    logits, labels = validation_predictions
    predictions = np.argmax(logits, axis=-1)

    scores = {}
    for name in _METRIC_NAMES:
        if name not in _metric_cache:
            _metric_cache[name] = evaluate.load(name)
        result = _metric_cache[name].compute(
            predictions=predictions, references=labels
        )
        # Each metric returns a dict keyed by its own name.
        scores[name] = result[name]
    return scores

# Hand the model, data splits, collator and metric function to the Trainer,
# which runs the training loop and the per-epoch evaluation configured in
# `training_args`.
trainer = Trainer(
    model,
    training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Build the device through the `torch` module rather than by calling the
# imported `device` name: this assignment rebinds `device` to an instance, so
# calling `device(...)` raised a TypeError whenever the cell was run again.
# NOTE: nothing in this cell reads `device`; it is kept for any later code.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

trainer.train()

In [None]:
# Attach human-readable class names to the model config.
# The position of a name in the tuple is its class id.
_class_names = ("deceptive", "truthful")
model.config.id2label = dict(enumerate(_class_names))
model.config.label2id = {name: idx for idx, name in enumerate(_class_names)}