In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import json
import pandas as pd
from sklearn.metrics import accuracy_score

# Model and Tokenizer

In [2]:
tokenizer = AutoTokenizer.from_pretrained("roberta-large-openai-detector")
model = AutoModelForSequenceClassification.from_pretrained("roberta-large-openai-detector",from_flax=True)

  pt_model_dict[flax_key] = torch.from_numpy(flax_tensor)
All Flax model weights were used when initializing RobertaForSequenceClassification.

Some weights of RobertaForSequenceClassification were not initialized from the Flax model and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Data

In [3]:
data = []

with open("/kaggle/input/detection-of-ai-generated-texts/train.jsonl", "r") as file:
    for line in file:
        record = json.loads(line)
        data.append({"text": record["abstract"], "label": 0})
        data.append({"text": record["summary"], "label": 0})
        data.append({"text": record["generated"], "label": 1})

In [4]:
def preprocess_data(data):
    tokenized_data = tokenizer(data["text"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    tokenized_data["input_ids"] = tokenized_data["input_ids"].squeeze(0)
    tokenized_data["attention_mask"] = tokenized_data["attention_mask"].squeeze(0)
    return tokenized_data

preprocessed_dataset = [preprocess_data(item) for item in data]

In [5]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        item["labels"] = torch.tensor(self.labels[idx])
        return item

In [6]:
labels = [item["label"] for item in data]
dataset = CustomDataset(preprocessed_dataset, labels)

In [7]:
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Setting Training

In [8]:
# Define the accuracy function
def compute_accuracy(p):
    preds = p.predictions.argmax(-1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}

In [9]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_accuracy,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [10]:
trainer.train()



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33myongsuk[0m ([33mmaok[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Epoch,Training Loss,Validation Loss,Accuracy
1,0.1661,0.569916,0.678014
2,0.3097,0.222822,0.934752
3,0.2257,0.193091,0.926241




RuntimeError: [enforce fail at inline_container.cc:325] . unexpected pos 1032617344 vs 1032617232

In [None]:
trainer.evaluate()

# Prediction and Sunmission

In [None]:
test_data = []

with open("/kaggle/input/detection-of-ai-generated-texts/test.jsonl", "r") as file:
    for line in file:
        record = json.loads(line)
        test_data.append({"id":record["id"],
                     "abstract": record["abstract"], 
                     "summary1": record["summary1"],
                     "summary2": record["summary2"]})

In [None]:
def preprocess_test_data(data):
    return tokenizer(data["abstract"], data["summary1"], return_tensors="pt"), tokenizer(data["abstract"], data["summary2"], return_tensors="pt")

preprocessed_test_data = [preprocess_test_data(item) for item in test_data]

In [None]:
def predict_summary_class(model, input_data1, input_data2):
    device = model.device
    
    # Move input tensors to the same device as the model
    input_data1 = {key: tensor.to(device) for key, tensor in input_data1.items()}
    input_data2 = {key: tensor.to(device) for key, tensor in input_data2.items()}
    
    with torch.no_grad():
        outputs1 = model(**input_data1)
        outputs2 = model(**input_data2)
    class1 = torch.argmax(outputs1.logits, dim=-1).item()
    class2 = torch.argmax(outputs2.logits, dim=-1).item()
    
    return 0 if class1 < class2 else 1

predictions = [{"id": item["id"], "class": predict_summary_class(model, input_data1, input_data2)} for item, (input_data1, input_data2) in zip(test_data, preprocessed_test_data)]

In [None]:
df_predictions = pd.DataFrame(predictions)

In [None]:
submit = pd.DataFrame({'id':df_predictions["id"],
                      'answer':df_predictions["class"]})

submit.to_csv('submission03-rob-lg.csv', index=False)

--- Logging error ---
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/logging/__init__.py", line 1029, in emit
    self.flush()
  File "/opt/conda/lib/python3.7/logging/__init__.py", line 1009, in flush
    self.stream.flush()
OSError: [Errno 28] No space left on device
Call stack:
  File "/opt/conda/lib/python3.7/threading.py", line 890, in _bootstrap
    self._bootstrap_inner()
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 49, in run
    self._run()
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 100, in _run
    self._process(record)
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/internal/internal.py", line 329, in _process
    self._sm.send(record)
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/internal/sender.py", line 343, in send
    send_handler(record)
  File "/