# Load model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer


# Checkpoint identifier; the same value is reused further down when the
# sequence-classification model is instantiated.
model_path = "distilbert/distilbert-base-cased"
# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Dataset 

In [None]:
from article_classifier.dataset import load_arxiv_dataset

# Load the arXiv articles via the project helper; later cells read the
# "titles", "summaries" and "terms" columns from this object.
dataset = load_arxiv_dataset()

### Preliminary data analysis

In [None]:
from collections import Counter


# Tally how many articles carry each arXiv category. Every entry of
# dataset["terms"] is itself a collection of category strings, so the
# generator flattens the two levels before counting.
category_counts = Counter(
    category
    for term in dataset["terms"]
    for category in term
)

In [None]:
# Show the 20 most frequent categories to decide which ones are worth modelling.
category_counts.most_common(20)

[('cs.CV', 30413),
 ('cs.LG', 29067),
 ('stat.ML', 15578),
 ('cs.AI', 7944),
 ('eess.IV', 2484),
 ('cs.RO', 1896),
 ('cs.CL', 1620),
 ('cs.NE', 1296),
 ('cs.CR', 717),
 ('cs.SI', 678),
 ('math.OC', 666),
 ('eess.SP', 621),
 ('cs.GR', 583),
 ('cs.MM', 523),
 ('cs.SY', 444),
 ('cs.IR', 442),
 ('cs.MA', 375),
 ('cs.HC', 359),
 ('eess.SY', 345),
 ('stat.AP', 294)]

1. Build trainable category labels.
2. Split into train and test sets.
3. Preprocess the text into token IDs.

In [None]:
from article_classifier.dataset import id2label, label2id, create_prompt


In [None]:
def preprocess_add_simple_categories(example):
    """Attach a multi-hot ``labels`` vector of coarse categories to one example.

    Each coarse label is switched on when at least one of its arXiv
    categories is present in ``example["terms"]``; "ML" covers both
    ``stat.ML`` and ``cs.LG``.

    Args:
        example: A single dataset row containing a ``"terms"`` entry.

    Returns:
        The same ``example`` object with a ``"labels"`` list of floats
        (length ``len(id2label)``) added.
    """
    # Coarse label -> arXiv categories that activate it.
    coarse_to_arxiv = (
        ("CV", ("cs.CV",)),
        ("AI", ("cs.AI",)),
        ("ML", ("stat.ML", "cs.LG")),
        ("NE", ("cs.NE",)),
        ("CL", ("cs.CL",)),
    )

    labels = [0.] * len(id2label)
    terms = example["terms"]
    for coarse_name, arxiv_names in coarse_to_arxiv:
        is_present = any(name in terms for name in arxiv_names)
        labels[label2id[coarse_name]] = float(is_present)

    example["labels"] = labels
    return example

# Add the "labels" column to every row. NOTE(review): this rebinds `dataset`,
# so re-running the cell maps the already-labelled dataset again.
dataset = dataset.map(preprocess_add_simple_categories)

In [None]:
# Convert to a DataFrame purely for inspection of the first rows and their labels.
df = dataset.to_pandas()
df.head()

Unnamed: 0,titles,summaries,terms,labels
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"[cs.CV, cs.LG]","[1.0, 0.0, 1.0, 0.0, 0.0]"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"[cs.CV, cs.AI, cs.LG]","[1.0, 1.0, 1.0, 0.0, 0.0]"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","[cs.CV, cs.AI]","[1.0, 1.0, 0.0, 0.0, 0.0]"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,[cs.CV],"[1.0, 0.0, 0.0, 0.0, 0.0]"
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","[cs.CV, cs.LG]","[1.0, 0.0, 1.0, 0.0, 0.0]"


In [None]:
# Print the first article in full (title, blank line, abstract) to see what
# the raw text looks like.
first_article = df.iloc[0]
print(first_article.titles, "", first_article.summaries, sep="\n")

Survey on Semantic Stereo Matching / Semantic Depth Estimation

Stereo matching is one of the widely used techniques for inferring depth from
stereo images owing to its robustness and speed. It has become one of the major
topics of research since it finds its applications in autonomous driving,
robotic navigation, 3D reconstruction, and many other fields. Finding pixel
correspondences in non-textured, occluded and reflective areas is the major
challenge in stereo matching. Recent developments have shown that semantic cues
from image segmentation can be used to improve the results of stereo matching.
Many deep neural network architectures have been proposed to leverage the
advantages of semantic segmentation in stereo matching. This paper aims to give
a comparison among the state of art networks both in terms of accuracy and in
terms of speed which are of higher importance in real-time applications.


## Dataset preparation

In [None]:
def preprocess_function(examples, drop_abstract_every=5):
    """Tokenize a batch of articles, blanking the abstract of every N-th one.

    Training on a mix of full prompts and title-only prompts is meant to
    keep the model usable when only a title is available.

    The previous version compared a module-level counter that was never
    incremented, so the ``% 5 == 0`` test was always true and *every*
    abstract was dropped. The position inside the batch is used instead,
    which keeps the function stateless and safe to re-run.

    Args:
        examples: A batch with parallel ``"titles"`` and ``"summaries"``
            sequences.
        drop_abstract_every: Blank the abstract of every
            ``drop_abstract_every``-th example in the batch (positions
            0, N, 2N, ...). A falsy value (``0``/``None``) keeps all abstracts.

    Returns:
        Whatever ``tokenizer(text, truncation=True)`` returns for the
        generated prompts.
    """
    text = []
    title_abstract_pairs = zip(examples["titles"], examples["summaries"])
    for position, (title, abstract) in enumerate(title_abstract_pairs):
        # Only a fraction of the examples loses its abstract.
        if drop_abstract_every and position % drop_abstract_every == 0:
            abstract = ""
        text.append(create_prompt(title, abstract))
    return tokenizer(text, truncation=True)


In [None]:
# Tokenize the whole dataset; batched=True hands preprocess_function a batch
# of rows (dict of lists) per call instead of a single row.
tokenized_arxiv = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/51774 [00:00<?, ? examples/s]

In [None]:
# Inspect the resulting columns and row count.
tokenized_arxiv

Dataset({
    features: ['titles', 'summaries', 'terms', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 51774
})

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
split_dataset = tokenized_arxiv.train_test_split(test_size=0.2, seed=42)

print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['titles', 'summaries', 'terms', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 41419
    })
    test: Dataset({
        features: ['titles', 'summaries', 'terms', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 10355
    })
})


In [None]:
# Sanity check on the Python type of a single label value — presumably to
# confirm labels are floats, as a multi-label loss expects (TODO confirm).
type(split_dataset["train"]["labels"][:5][0][0])

float

In [None]:
from transformers import DataCollatorWithPadding

# Collator handed to the Trainer below; it pads examples per batch, so the
# tokenizer call above only truncates and does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


## Pipeline preparation

In [None]:
import evaluate

# Bundle of binary classification metrics evaluated together.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])


def compute_metrics(eval_pred):
    """Compute accuracy/F1/precision/recall over all (example, label) pairs.

    Raw scores are thresholded at 0 and both predictions and references are
    flattened to one dimension, so each label of each example counts as one
    independent binary decision.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of array-likes supporting
            comparison, ``astype`` and ``reshape``.

    Returns:
        The dictionary produced by ``clf_metrics.compute``.
    """
    raw_scores, targets = eval_pred
    flat_predictions = (raw_scores > 0).astype(int).reshape(-1)
    flat_references = targets.astype(int).reshape(-1)
    return clf_metrics.compute(
        predictions=flat_predictions,
        references=flat_references,
    )


# Training

In [None]:

# Instantiate the classifier from the base checkpoint with one output per
# coarse category; the label maps are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    # Multi-label setup: an article can belong to several categories at once.
    problem_type="multi_label_classification",
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import os

training_args = TrainingArguments(
    # Expand "~" explicitly: the raw string would otherwise be used as a
    # literal path, while the save/reload cells build this same location
    # with os.path.expanduser. Expanding here keeps all of them pointing at
    # one directory.
    output_dir=os.path.expanduser("~/.cache/huggingface/checkpoints/distilbert-arxiv2"),
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="steps",
    save_strategy="epoch",
    report_to="all",
    run_name="DistillBertFinetuning_With Skipped Abstracts",
    logging_steps=20,
    # Also evaluate once before the first optimisation step (baseline row).
    eval_on_start=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    # Evaluate on a small fixed slice (20 batches of 16) so the frequent
    # step-wise evaluations stay cheap.
    eval_dataset=split_dataset["test"].select(range(20 * 16)),
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:

# Run fine-tuning. The recorded output below ends in a KeyboardInterrupt,
# i.e. the run was stopped manually before completing.
trainer.train()


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.67125,0.778125,0.672811,0.570312,0.820225
20,0.544600,0.456104,0.778125,0.672811,0.570312,0.820225
40,0.415200,0.433664,0.771875,0.562874,0.602564,0.52809
60,0.405500,0.390882,0.816875,0.698249,0.644487,0.761798
80,0.381500,0.355134,0.856875,0.722424,0.784211,0.669663
100,0.333400,0.336038,0.8675,0.733668,0.831909,0.65618
120,0.317300,0.313339,0.86875,0.740741,0.821918,0.674157
140,0.297600,0.313638,0.874375,0.750929,0.837017,0.680899
160,0.340000,0.305131,0.881875,0.764045,0.859551,0.68764
180,0.314800,0.294592,0.88375,0.769231,0.858726,0.696629


KeyboardInterrupt: 

In [None]:
import wandb
# Close the active Weights & Biases run explicitly (training was interrupted above).
wandb.finish()

wandb: ERROR Unable to save notebook session history.


0,1
eval/accuracy,▇▁▁▁▃▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██▇▇███▇█▇█▇▇█
eval/f1,▇▄▄▁▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇▇▇▇▇█▇█▇█▇▇█
eval/loss,▂█▄▄▃▃▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/model_preparation_time,▁
eval/precision,▆▁▁▂▃▅▆▆▆▇▇▇▇▇▇██▇▇██████▇██▇██▇█▇█▇
eval/recall,▆██▁▇▄▄▅▅▅▅▅▅▅▄▄▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▆▅▅▅
eval/runtime,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/samples_per_second,▄▁▇▇▇█▆▆▇▆█▇█▇▆▅▅▆▇▇█▇█▇▇▇▆█▇▇▇█▅▆▅▇
eval/steps_per_second,█▁▃▃▄▄▃▃▃▃▄▃▄▃▃▂▃▃▃▄▄▄▄▄▃▄▃▄▃▃▃▄▃▃▃▃
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███

0,1
eval/accuracy,0.89438
eval/f1,0.79006
eval/loss,0.26952
eval/model_preparation_time,0.0029
eval/precision,0.88333
eval/recall,0.71461
eval/runtime,0.5643
eval/samples_per_second,567.042
eval/steps_per_second,35.44
train/epoch,0.26265


In [None]:
import huggingface_hub


# Interactive login widget; needed before the push_to_hub calls below.
huggingface_hub.notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Save model and tokenizer locally so they can be reloaded without retraining.
# Alternative location that was tried: "./distilbert-arxiv-checkpoint"
import os


# "~" is expanded explicitly to obtain an absolute path in the user's home directory.
local_dir = os.path.expanduser("~/.cache/huggingface/checkpoints/distilbert-arxiv2")
model.save_pretrained(local_dir)
tokenizer.save_pretrained(local_dir)

# Push model and tokenizer to the Hugging Face Hub under the same repository name.
model.push_to_hub("distilbert-arxiv-checkpoint")
tokenizer.push_to_hub("distilbert-arxiv-checkpoint")

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/Hacker1337/distilbert-arxiv-checkpoint/commit/e9a5368700574158e6deeb2a93db39b6b97b1971', commit_message='Upload tokenizer', commit_description='', oid='e9a5368700574158e6deeb2a93db39b6b97b1971', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Hacker1337/distilbert-arxiv-checkpoint', endpoint='https://huggingface.co', repo_type='model', repo_id='Hacker1337/distilbert-arxiv-checkpoint'), pr_revision=None, pr_num=None)

In [None]:
import os


# Reload the saved model and tokenizer from disk; this rebinds `model` and
# `tokenizer`, so the evaluation below uses the saved artefacts.
local_dir = os.path.expanduser("~/.cache/huggingface/checkpoints/distilbert-arxiv2")
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model = AutoModelForSequenceClassification.from_pretrained(local_dir)
tokenizer = AutoTokenizer.from_pretrained(local_dir)

### Testing on entries with titles only

In [None]:
def preprocess_titles_function(examples):
    """Tokenize a batch using titles only.

    Every prompt is built with an empty abstract, regardless of what the
    ``"summaries"`` column contains.

    Args:
        examples: A batch with parallel ``"titles"`` and ``"summaries"``
            sequences.

    Returns:
        Whatever ``tokenizer(text, truncation=True)`` returns for the
        title-only prompts.
    """
    text = []
    for title, _ignored_abstract in zip(examples["titles"], examples["summaries"]):
        text.append(create_prompt(title, ""))
    return tokenizer(text, truncation=True)


In [None]:
# Re-split the untokenized dataset with the same test_size and seed as the
# training split — presumably selecting the same test rows (TODO confirm) —
# and tokenize that test part with title-only prompts.
tokenized_titles_only_test_arxiv = dataset.train_test_split(test_size=0.2, seed=42)["test"].map(preprocess_titles_function, batched=True)

In [None]:
# Evaluation-only configuration.
training_args = TrainingArguments(
    per_device_eval_batch_size=16,
    run_name="DistillBertFinetuning_2",
)

trainer = Trainer(
    model=model,
    # Pass the arguments explicitly: they were previously built but never
    # handed to the Trainer, so the batch size and run name above were
    # silently ignored in favour of the defaults.
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate the reloaded model on the full title-only test set and print the metrics.
eval_results = trainer.evaluate(tokenized_titles_only_test_arxiv)
print(eval_results)

{'eval_loss': 0.2729688584804535, 'eval_model_preparation_time': 0.0032, 'eval_accuracy': 0.8944664413326895, 'eval_f1': 0.7889695658890777, 'eval_precision': 0.8737382378100941, 'eval_recall': 0.7191944796507534, 'eval_runtime': 19.5662, 'eval_samples_per_second': 529.23, 'eval_steps_per_second': 66.186}


# Publishing the model to the Hugging Face Hub