In [1]:
from datasets import load_dataset

In [2]:
from transformers import LongformerTokenizer
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

In [3]:
dataset  = load_dataset('hyperpartisan_news_detection', 'bypublisher')

Reusing dataset hyperpartisan_news_detection (/home/matteo/.cache/huggingface/datasets/hyperpartisan_news_detection/bypublisher/1.0.0/60aa536d5067f21aacb9ab08b94548649fd241c1e3cf6bb643d0a4a1b20bcf25)


In [4]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'title', 'hyperpartisan', 'url', 'published_at', 'bias'],
        num_rows: 600000
    })
    validation: Dataset({
        features: ['text', 'title', 'hyperpartisan', 'url', 'published_at', 'bias'],
        num_rows: 600000
    })
})


In [5]:
type(dataset)

datasets.dataset_dict.DatasetDict

In [6]:
print("Size of train dataset: ", dataset['train'].shape)
dataset['train']

Size of train dataset:  (600000, 6)


Dataset({
    features: ['text', 'title', 'hyperpartisan', 'url', 'published_at', 'bias'],
    num_rows: 600000
})

In [7]:
position = 1
print(" Example of text: ", dataset['train'][position]['text'])
print(" Length of text: ", len(dataset['train'][position]['text'].split(" ")))

print(" Class of above text: ",  dataset['train'][position]['hyperpartisan'])

 Example of text:  <p>A Florida university will honor Trayvon Martin with a posthumous Bachelor of Science Degree in Aviation at a May 13 commencement ceremony. The degree will be accepted by his parents, Tracy Martin and Sabrina Fulton.</p> 

<p>Seventeen-year-old Martin was killed in February of 2012 by former neighborhood watch captain George Zimmerman, who is Hispanic, in self-defense. After a long, racially-charged public debate and court trial, Zimmerman was completely exonerated in July of 2013.</p> 

<p>Florida Memorial University officially <a href="https://www.facebook.com/FLMemorialUniv/" type="external">announced</a>the posthumous degree via Facebook on Wednesday.</p> 

<p>"The University will confer upon TRAYVON MARTIN a posthumous degree in Aeronautical Science with a concentration in Flight Education, in honor of the steps he took during his young life toward becoming a pilot," says the Facebook post. "This particular Bachelor&#8217;s degree is designated for those study

In [8]:
import re
import bleach
def clean_text(text, label):
    text = bleach.clean(text,strip=True)
    text = text.replace('<p>', '')
    text = text.replace('</p>', '')
    text = text.replace('\n', '')
    text = text.replace('&amp;#160;', '')
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    if str(label) == 'True':
        new_label = 1
    else:
        new_label = 0
    return text, new_label

In [9]:
import torch

def convert_to_features(example):
    # Tokenize contexts and questions (as pairs of inputs)
    text_, label_ = clean_text(example['text'], example['hyperpartisan'])
    encodings = tokenizer.encode_plus(text_, pad_to_max_length=True, max_length=2048,
                                           add_special_tokens=True,
                                            return_token_type_ids=False,
                                            return_attention_mask=True,
                                            padding='max_length', truncation=True,
                                           )
    labels = torch.tensor(label_,dtype=torch.long)


    encodings.update({'label': labels,
                      'attention_mask': encodings['attention_mask']})
    return encodings

In [10]:
## Take subset of data
train_size = 40
val_size = 32
import numpy as np
train_indices = np.random.randint(0, len(dataset['train']), train_size)
val_indices = np.random.randint(0, len(dataset['validation']), val_size)
train_dataset = dataset['train'].select(train_indices)
val_dataset = dataset['validation'].select(val_indices)

In [11]:
train_dataset =  train_dataset.map(convert_to_features, load_from_cache_file=False)
val_dataset =  val_dataset.map(convert_to_features, load_from_cache_file=False)

HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

In [12]:
columns = ['input_ids', 'attention_mask', 'targets']
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


In [13]:
type(train_dataset )
torch.save(train_dataset, 'train_data.pt')
torch.save(val_dataset, 'valid_data.pt')
train_dataset = torch.load('train_data.pt')
val_dataset = torch.load('valid_data.pt')

In [14]:
print(len(train_dataset[10]['input_ids']))
print(train_dataset[10])

2048
{'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]), 'input_ids': tensor([    0,   211, 29765,  ...,     1,     1,     1]), 'label': tensor(1)}


In [15]:
#decode
tokenizer.decode(train_dataset[10]['input_ids'])

'<s> Dittoheads and <a href="">Fox News</a> watchers are understandably wary of any <a href="">public healthcare option</a>, given the <a href="">misinformation</a> shoved down their throats on a daily basis. (President Obama tries to dispel some of the myths <a href="http://www.youtube.com/watch?v=2U0vY-_SvLM&amp;amp;feature=fvw">here</a>.) Part of people&#8217;s fear, as explained by The New Yorker&#8216;s <a href="http://www.newyorker.com/talk/financial/2009/08/31/090831ta_talk_surowiecki">James Surowiecki</a> in the latest issue, can be explained by our innate tendency to assign an irrationally high value to something already in our possession&#8212;like our often crappy and expensive health insurance plans. But people really need to reflect on this stuff and not let fear and misinformation win out. Consider: What if you have a health problem that&#8217;s covered through your employer, but you want to switch jobs? Will your new employer&#8217;s insurance plan accept you? What if yo

In [16]:
from transformers import LongformerForSequenceClassification
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096', gradient_checkpointing=True,
                                                           attention_window = 512)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', '

In [17]:
from transformers import Trainer, TrainingArguments

In [18]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [19]:
# define accuracy metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # argmax(pred.predictions, axis=1)
    #pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [20]:
token_len = 1024

In [21]:
# define the training arguments
training_args = TrainingArguments(
    output_dir = "./output1",
    num_train_epochs = 25,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1,
    per_device_eval_batch_size= 1,
    learning_rate=2e-5,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    evaluation_strategy = "epoch",
    disable_tqdm = False,
    load_best_model_at_end=True,
    warmup_steps=200,
    logging_steps = 4,
    fp16 = True,
    logging_dir="./logs1",
    dataloader_num_workers = 0,
    run_name = 'longformer-classification-hyperpartisan'
)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [23]:
trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:

history_val = trainer.evaluate()

In [None]:
train_dataset
