In [11]:
import evaluate
import transformers
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DistilBertForSequenceClassification,
)

In [12]:
"""
Download dataset SubtaskB.jsonl from
https://github.com/mbzuai-nlp/M4GT-Bench.
"""
DATA_PATH = "C:/Users/Admin/Desktop/cse847_proj/SubtaskB.jsonl"
# Fixed seed so the train/valid/test partition is identical on every run;
# without it each kernel restart evaluates on a different test set.
SPLIT_SEED = 777

# Load the JSON-lines file and keep only the columns used by this notebook.
df = pd.read_json(DATA_PATH, lines=True)
df = df[['text', 'label', 'model']]

# 80/10/10 split: carve off 20% as a hold-out, then halve the hold-out
# into validation and test.
full_dataset = Dataset.from_pandas(df)
train_vs_holdout = full_dataset.train_test_split(test_size=0.20, seed=SPLIT_SEED)
valid_vs_test = train_vs_holdout['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
dataset = DatasetDict({
    'train': train_vs_holdout['train'],
    'valid': valid_vs_test['train'],
    'test': valid_vs_test['test'],
})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'model'],
        num_rows: 98248
    })
    valid: Dataset({
        features: ['text', 'label', 'model'],
        num_rows: 12281
    })
    test: Dataset({
        features: ['text', 'label', 'model'],
        num_rows: 12282
    })
})


In [13]:
# Distinct class ids that occur in the label column.
print(df["label"].unique())
# Preview of the text/label pairs, rendered as the cell's output.
df.loc[:, ["text", "label"]]

[2 1 0 3 5 4 6]


Unnamed: 0,text,label
0,We consider a system of many polymers in solut...,2
1,We present a catalog of 66 YSOs in the Serpens...,2
2,Spectroscopic Observations of the Intermediate...,2
3,We present a new class of stochastic Lie group...,2
4,ALMA as the ideal probe of the solar chromosph...,2
...,...,...
122806,Title: The Unsung Heroes: Seagoing Cowboys and...,0
122807,Title: The Benefits of Autonomy: Student-led P...,0
122808,"The Electoral College system, established by t...",0
122809,"In the ever-evolving landscape of education, c...",0


In [14]:
"""
Initialize tokenizer and model.
"""
# Checkpoint name shared by the tokenizer and the classifier.
model_id = "distilbert-base-uncased"

# Tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# DistilBERT sequence classifier with one output per label id;
# the dataset's label column holds the seven values 0-6.
model = DistilBertForSequenceClassification.from_pretrained(model_id, num_labels=7)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
"""
Tokenize dataset.
"""
def tokenize(X):
    """Tokenize a batch of examples with the notebook's `tokenizer`.

    Args:
        X: Batch passed in by `dataset.map(..., batched=True)`; only
            `X["text"]` is read.

    Returns:
        The tokenizer output for the batch, padded to the tokenizer's
        maximum length and truncated where longer. `map` adds its fields
        (`input_ids`, `attention_mask`) as new dataset columns.
    """
    # No `return_tensors` here: `map` writes the result into Arrow columns
    # either way, so building torch tensors per batch is wasted work.
    return tokenizer(
        X["text"],
        padding="max_length",
        truncation=True,
    )

# Run `tokenize` batch-wise over every split, then show the resulting schema.
tokenized_datasets = dataset.map(function=tokenize, batched=True)
print(tokenized_datasets)

Map:   0%|          | 0/98248 [00:00<?, ? examples/s]

Map:   0%|          | 0/12281 [00:00<?, ? examples/s]

Map:   0%|          | 0/12282 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'model', 'input_ids', 'attention_mask'],
        num_rows: 98248
    })
    valid: Dataset({
        features: ['text', 'label', 'model', 'input_ids', 'attention_mask'],
        num_rows: 12281
    })
    test: Dataset({
        features: ['text', 'label', 'model', 'input_ids', 'attention_mask'],
        num_rows: 12282
    })
})


In [16]:
"""
Create dataset splits.
"""
seed = 777          # shuffle seed shared by all three subsets
n_samples = 10_000  # rows requested for the training subset
n_test = 1000       # rows requested for each of the validation and test subsets


def _subsample(split, size, shuffle_seed):
    """Return a seeded random subset of `split` holding at most `size` rows.

    Args:
        split: Dataset split supporting `len`, `shuffle` and `select`.
        size: Requested number of rows.
        shuffle_seed: Seed forwarded to `split.shuffle`.

    Returns:
        The shuffled split restricted to its first `min(size, len(split))` rows.
    """
    # Clamp so a split smaller than the requested size yields the whole
    # (shuffled) split instead of raising on out-of-range indices.
    size = min(size, len(split))
    return split.shuffle(seed=shuffle_seed).select(range(size))


train_dataset = _subsample(tokenized_datasets["train"], n_samples, seed)
valid_dataset = _subsample(tokenized_datasets["valid"], n_test, seed)
test_dataset = _subsample(tokenized_datasets["test"], n_test, seed)

In [17]:
"""
Create Trainer.
"""
# Accuracy metric obtained through `evaluate.load`; used by compute_metrics.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into (logits, labels).

    Returns:
        The result of `metric.compute` for the arg-max class of each row
        of logits, compared against the labels.
    """
    scores, references = eval_pred
    # The highest-scoring position along the last axis is the predicted class.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

# Training configuration: only these three settings are overridden,
# everything else is left at the TrainingArguments defaults.
training_args = TrainingArguments(
    output_dir="C:/Users/Admin/Desktop/cse847_proj/",
    save_total_limit=2,
    eval_strategy="epoch",
)

# Wire the model, the sub-sampled train/validation sets and the accuracy
# callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=valid_dataset,
    train_dataset=train_dataset,
)

In [None]:
"""
Train model.
"""
# Fine-tune `model` on `train_dataset` using the Trainer built above.
# NOTE(review): with eval_strategy="epoch" the validation subset is expected
# to be scored once per epoch — confirm against the logged table.
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
"""
Evaluate trained model.
"""
# Score the trained model on the test subset instead of the Trainer's
# default evaluation set.
trainer.evaluate(eval_dataset=test_dataset)

In [None]:
"""
Summarize model.
"""
# Print the string representation of `model` to inspect its architecture.
print(model)