# Load model

In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


# Hub id of the base checkpoint; reused further down when the classification model is instantiated.
model_path = "distilbert/distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Dataset 

In [2]:
from article_classifier.dataset import load_arxiv_dataset

# Project helper; the result is used below as a Hugging Face `Dataset`
# (`.map`, `.to_pandas`, `.train_test_split`).
dataset = load_arxiv_dataset()

### Preliminary data analysis

In [3]:
from collections import Counter


# Tally how often each category string occurs across every row's "terms" list.
category_counts = Counter(
    category
    for paper_terms in dataset["terms"]
    for category in paper_terms
)

In [4]:
# The 20 most frequent categories together with their counts.
category_counts.most_common(20)

[('cs.CV', 30413),
 ('cs.LG', 29067),
 ('stat.ML', 15578),
 ('cs.AI', 7944),
 ('eess.IV', 2484),
 ('cs.RO', 1896),
 ('cs.CL', 1620),
 ('cs.NE', 1296),
 ('cs.CR', 717),
 ('cs.SI', 678),
 ('math.OC', 666),
 ('eess.SP', 621),
 ('cs.GR', 583),
 ('cs.MM', 523),
 ('cs.SY', 444),
 ('cs.IR', 442),
 ('cs.MA', 375),
 ('cs.HC', 359),
 ('eess.SY', 345),
 ('stat.AP', 294)]

1. Build trainable categories.
2. Split into train and test sets.
3. Preprocess the text into token ids.

In [5]:
from article_classifier.dataset import labels, id2label, label2id, categorie2human


In [6]:
def preprocess_add_simple_categories(example):
    """Attach a multi-hot ``labels`` vector derived from ``example["terms"]``.

    The vector has one float slot per entry of ``id2label``. A slot is set to
    1.0 when any of the categories mapped to that label appears in the
    example's terms; every other slot stays 0.0.

    Args:
        example: A single dataset row; must provide a ``"terms"`` entry.

    Returns:
        The same ``example`` object with a new ``"labels"`` entry.
    """
    terms = example["terms"]
    # Coarse label -> source categories that switch it on.
    label_sources = {
        "CV": ("cs.CV",),
        "AI": ("cs.AI",),
        "ML": ("stat.ML", "cs.LG"),
        "NE": ("cs.NE",),
        "CL": ("cs.CL",),
    }
    multi_hot = [0.] * len(id2label)
    for label_name, sources in label_sources.items():
        multi_hot[label2id[label_name]] = float(any(src in terms for src in sources))
    example["labels"] = multi_hot
    return example

# Applied row by row (no batched=True); adds the "labels" column shown in the preview below.
dataset = dataset.map(preprocess_add_simple_categories)

In [7]:
# pandas copy used only for inspection; the pipeline below keeps working on `dataset`.
df = dataset.to_pandas()
df.head()

Unnamed: 0,titles,summaries,terms,labels
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"[cs.CV, cs.LG]","[1.0, 0.0, 1.0, 0.0, 0.0]"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"[cs.CV, cs.AI, cs.LG]","[1.0, 1.0, 1.0, 0.0, 0.0]"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","[cs.CV, cs.AI]","[1.0, 1.0, 0.0, 0.0, 0.0]"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,[cs.CV],"[1.0, 0.0, 0.0, 0.0, 0.0]"
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","[cs.CV, cs.LG]","[1.0, 0.0, 1.0, 0.0, 0.0]"


In [None]:
# Show the first paper's title and abstract, separated by one empty line.
first_paper = df.iloc[0]
print(first_paper.titles, first_paper.summaries, sep="\n\n")

Survey on Semantic Stereo Matching / Semantic Depth Estimation



'Stereo matching is one of the widely used techniques for inferring depth from\nstereo images owing to its robustness and speed. It has become one of the major\ntopics of research since it finds its applications in autonomous driving,\nrobotic navigation, 3D reconstruction, and many other fields. Finding pixel\ncorrespondences in non-textured, occluded and reflective areas is the major\nchallenge in stereo matching. Recent developments have shown that semantic cues\nfrom image segmentation can be used to improve the results of stereo matching.\nMany deep neural network architectures have been proposed to leverage the\nadvantages of semantic segmentation in stereo matching. This paper aims to give\na comparison among the state of art networks both in terms of accuracy and in\nterms of speed which are of higher importance in real-time applications.'

## Dataset preparation

In [8]:
def preprocess_function(examples):
    """Tokenize a batch of papers as a combined title + abstract text.

    Each title/abstract pair is joined into one string with ``# title:`` and
    ``# abstract:`` markers, then passed to the module-level ``tokenizer``
    with truncation enabled and no padding argument.

    Args:
        examples: A batch providing parallel ``"titles"`` and ``"summaries"``
            sequences.

    Returns:
        Whatever ``tokenizer`` returns for the list of combined texts.
    """
    prompts = []
    for title, abstract in zip(examples["titles"], examples["summaries"]):
        prompts.append("# title:\n" + title + "\n# abstract:\n" + abstract)
    return tokenizer(prompts, truncation=True)


In [9]:
# batched=True: preprocess_function receives columns of values (it zips "titles" with "summaries").
tokenized_arxiv = dataset.map(preprocess_function, batched=True)


In [10]:
# Inspect the columns: tokenization added `input_ids` and `attention_mask`.
tokenized_arxiv

Dataset({
    features: ['titles', 'summaries', 'terms', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 51774
})

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_dataset = tokenized_arxiv.train_test_split(test_size=0.2, seed=42)

print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['titles', 'summaries', 'terms', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 41419
    })
    test: Dataset({
        features: ['titles', 'summaries', 'terms', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 10355
    })
})


In [12]:
# Sanity check that label values come back as Python floats — presumably because the
# multi-label loss expects float targets (TODO confirm).
type(split_dataset["train"]["labels"][:5][0][0])

float

In [13]:
from transformers import DataCollatorWithPadding

# preprocess_function tokenizes without padding, so padding is left to the collator at batch time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


## Pipeline preparation

In [14]:
import evaluate
import numpy as np

# Bundle the four metrics so a single compute() call in compute_metrics reports all of them.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred):
    """Score thresholded predictions against the multi-hot references.

    Both arrays are flattened before scoring, so every (example, label) cell
    is evaluated as one independent binary decision.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)`` arrays.

    Returns:
        The result of ``clf_metrics.compute`` on the flattened integer arrays.
    """
    scores, references = eval_pred
    # A raw score above 0 counts as a positive prediction
    # (presumably these are logits, where 0 corresponds to probability 0.5).
    binary_preds = (scores > 0).astype(int)
    return clf_metrics.compute(
        predictions=binary_preds.reshape(-1),
        references=references.astype(int).reshape(-1),
    )


# Training

In [15]:

# Pretrained encoder plus a fresh classification head: the load warning below lists the
# classifier / pre_classifier weights as newly initialized.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    # Targets are multi-hot vectors (several labels can be 1.0 for one paper).
    problem_type="multi_label_classification",
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
training_args = TrainingArguments(
    output_dir="~/.cache/huggingface/checkpoints/distilbert-arxiv",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # No eval_steps is given; the logged run below evaluates every 20 steps,
    # i.e. at the same cadence as logging_steps.
    eval_strategy="steps",
    save_strategy="epoch",
    report_to="all",
    run_name="DistillBertFinetuning_1",
    logging_steps=20,
    # Also evaluate once before any training step (the step-0 row in the log).
    eval_on_start=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    # Evaluate on only the first 20 * 16 = 320 test rows (20 batches at the eval
    # batch size of 16), presumably to keep the frequent evaluations cheap.
    eval_dataset=split_dataset["test"].select(range(20 * 16)),
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [17]:

# Start fine-tuning. The recorded run below ends in a KeyboardInterrupt, i.e. it was stopped by hand.
trainer.train()


wandb: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
wandb: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: No netrc file found, creating one.
wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\amirf\_netrc
wandb: Currently logged in as: amirfvb to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.709293,0.27875,0.435421,0.278299,1.0
20,0.562300,0.460515,0.778125,0.672811,0.570312,0.820225
40,0.402100,0.386841,0.865,0.747664,0.778589,0.719101
60,0.352400,0.3173,0.89125,0.781955,0.883853,0.701124
80,0.303200,0.291802,0.89625,0.790404,0.902017,0.703371
100,0.270000,0.273445,0.898125,0.794969,0.902857,0.710112
120,0.268600,0.289325,0.88625,0.767857,0.887906,0.676404
140,0.251900,0.276573,0.903125,0.805031,0.914286,0.719101
160,0.288400,0.25582,0.905625,0.806658,0.9375,0.707865
180,0.251400,0.25059,0.90625,0.810127,0.927536,0.719101


KeyboardInterrupt: 

In [18]:
import wandb
# Finish the W&B run that training logged to; the run history and summary are printed below.
wandb.finish()

wandb: Network error (ConnectionError), entering retry loop.
wandb: Network error (ConnectionError), entering retry loop.


0,1
eval/accuracy,▁▆▇█████████████████████████████████████
eval/f1,▁▅▆▇▇▇▇▇▇▇▇▇▇█▇█▇▇▇▇█▇▇█▇▇███████▇██████
eval/loss,█▅▃▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/precision,▁▄▇██████▇█▇████████▇█▇██████████▇██████
eval/recall,█▂▂▂▁▂▂▂▃▂▃▂▃▂▂▂▂▂▂▂▂▃▃▂▂▂▂▃▂▃▃▃▃▃▃▂▃▃▃▂
eval/runtime,▂▁▂▃▃▂▂▃▁▂▂▂▁▃▃▃▃▂▁▁▁▃▄▅█▆▂▃▃▃▃▂▄▃▂▂▃▂▄▅
eval/samples_per_second,▇█▇▆▆▇▇▆█▆▇▆▇█▆▇▆▇██▅▁▁▃▇▆▆▅▇▅▅▇▄▆▆▆▇▇▅▄
eval/steps_per_second,▇█▇▇▆▇▇▆█▆▇▆▆▇█▆▆▆█▆▇███▅▁▁▇▆▆▆▅▇▄▆▆▄▆▅▄
train/epoch,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇█

0,1
eval/accuracy,0.91187
eval/f1,0.82657
eval/loss,0.21511
eval/precision,0.91304
eval/recall,0.75506
eval/runtime,2.6911
eval/samples_per_second,118.912
eval/steps_per_second,7.432
train/epoch,0.48667
train/global_step,1260.0


In [28]:
# Save model and tokenizer locally
# local_dir = "./distilbert-arxiv-checkpoint"
import os


# Same path string as TrainingArguments.output_dir, with "~" expanded explicitly here.
local_dir = os.path.expanduser("~/.cache/huggingface/checkpoints/distilbert-arxiv")
# NOTE(review): only the model is saved; the tokenizer save below is commented out — confirm that is intended.
model.save_pretrained(local_dir)
# tokenizer.save_pretrained(local_dir)

# # Push model and tokenizer to Hugging Face Hub
# model.push_to_hub("distilbert-arxiv-checkpoint")
# # tokenizer.push_to_hub("distilbert-arxiv-checkpoint")