In [1]:
from datasets import load_dataset

In [2]:
from transformers import LongformerTokenizer
# Load the pretrained tokenizer for the 'allenai/longformer-base-4096' checkpoint;
# the tokenization helpers defined later in this notebook use this global.
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

In [3]:
# Load the 'bypublisher' configuration of the hyperpartisan news detection dataset.
dataset  = load_dataset('hyperpartisan_news_detection', 'bypublisher')

Reusing dataset hyperpartisan_news_detection (/home/matteo/.cache/huggingface/datasets/hyperpartisan_news_detection/bypublisher/1.0.0/60aa536d5067f21aacb9ab08b94548649fd241c1e3cf6bb643d0a4a1b20bcf25)


In [4]:
# Show the available splits with their feature names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'title', 'hyperpartisan', 'url', 'published_at', 'bias'],
        num_rows: 600000
    })
    validation: Dataset({
        features: ['text', 'title', 'hyperpartisan', 'url', 'published_at', 'bias'],
        num_rows: 600000
    })
})


In [10]:
# NOTE(review): this is not the last expression of the cell, so its value is
# never displayed; it has no effect here.
type(dataset)


import re
import bleach
def clean_text_(text, label):
    """Strip markup from an article and convert its label to an integer.

    Args:
        text: Raw article text (an HTML string).
        label: Label whose string form is 'True' for the positive class.

    Returns:
        Tuple ``(text, new_label)``: the cleaned text, and 1 when
        ``str(label) == 'True'`` or 0 otherwise.
    """
    text = bleach.clean(text, strip=True)
    text = text.replace('<p>', '')
    text = text.replace('</p>', '')
    text = text.replace('&amp;#160;', '')
    # Remove URL lines while line breaks still exist. With re.MULTILINE, '^'
    # anchors at each line start; if newlines were stripped first, the pattern
    # could only match at the very start of the text and '.*' would then
    # swallow the whole article.
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = text.replace('\n', '')
    new_label = 1 if str(label) == 'True' else 0
    return text, new_label

In [14]:
def clean_text(text):
    """Strip markup, URL lines, and line breaks from a single article.

    Args:
        text: Raw article text (an HTML string).

    Returns:
        The cleaned text as one line (all '\\n' characters removed).
    """
    text = bleach.clean(text, strip=True)
    text = text.replace('<p>', '')
    text = text.replace('</p>', '')
    text = text.replace('&amp;#160;', '')
    # Remove URL lines while line breaks still exist. With re.MULTILINE, '^'
    # anchors at each line start; if newlines were stripped first, the pattern
    # could only match at the very start of the text and '.*' would then
    # swallow the whole article.
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = text.replace('\n', '')
    return text

def conv_label(label):
    """Return 1 when the string form of ``label`` is exactly 'True', else 0."""
    is_positive = str(label) == 'True'
    return int(is_positive)

In [12]:
import torch

def convert_to_features_(example):
    """Clean, label, and tokenize a single dataset example.

    Args:
        example: Mapping with a 'text' entry (raw article string) and a
            'hyperpartisan' entry (the label).

    Returns:
        The tokenizer encoding of the cleaned text (padded/truncated to
        ``max_length=2048``) with an added 'targets' entry holding the
        integer label as a ``torch.long`` tensor.
    """
    # clean_text_ is the two-argument helper that returns (text, label);
    # clean_text only accepts the text and returns a single value.
    text_, target_ = clean_text_(example['text'], example['hyperpartisan'])
    encodings = tokenizer.encode_plus(
        text_,
        max_length=2048,
        add_special_tokens=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        # padding='max_length' already pads to max_length, so the deprecated
        # pad_to_max_length flag is not passed as well.
        padding='max_length',
        truncation=True,
    )
    targets = torch.tensor(target_, dtype=torch.long)

    # attention_mask is already part of the encoding; only the label is added.
    encodings.update({'targets': targets})
    return encodings

In [16]:
def convert_to_features(data):
    """Clean and tokenize one example or a batch of examples.

    Works for both per-example and batched mapping: when ``data['text']`` is
    a list/tuple, every element is cleaned individually before tokenizing.

    Args:
        data: Mapping with a 'text' entry (a string, or a list of strings
            for a batch) and optionally a 'hyperpartisan' label entry of
            matching shape.

    Returns:
        The tokenizer encoding (padded/truncated to ``max_length=1024``).
        When 'hyperpartisan' is present, a 'targets' entry with the 0/1
        label(s) produced by ``conv_label`` is added.
    """
    raw_text = data['text']
    is_batch = isinstance(raw_text, (list, tuple))

    # clean_text handles exactly one string, so batches are cleaned item by item.
    if is_batch:
        cleaned = [clean_text(item) for item in raw_text]
    else:
        cleaned = clean_text(raw_text)

    encodings = tokenizer(cleaned, padding='max_length', truncation=True, max_length=1024)

    # Provide integer labels under 'targets' alongside the model inputs.
    if 'hyperpartisan' in data:
        labels = data['hyperpartisan']
        if is_batch:
            encodings['targets'] = [conv_label(item) for item in labels]
        else:
            encodings['targets'] = conv_label(labels)
    return encodings

In [7]:
## Take a small random subset of the data
import numpy as np

train_size = 60
val_size = 26
SEED = 42  # fixed so the same rows are sampled on every run

# Sample row indices without replacement so no article is selected twice
# (np.random.randint could return duplicate indices).
rng = np.random.default_rng(SEED)
train_indices = rng.choice(len(dataset['train']), size=train_size, replace=False)
val_indices = rng.choice(len(dataset['validation']), size=val_size, replace=False)
train_dataset = dataset['train'].select(train_indices)
val_dataset = dataset['validation'].select(val_indices)


In [8]:
# Check the type of the object returned by .select().
type(train_dataset)

datasets.arrow_dataset.Dataset

In [22]:
# Peek at a single raw article instead of dumping the whole text column
# into the notebook output.
train_dataset[:1]['text']

['<p /> \n\n<p>U.S. stocks slipped Thursday as oil prices fell and investors sold safe haven assets they had accumulated after the U.K.\'s decision to leave the European Union.</p> \n\n<p>Continue Reading Below</p> \n\n<p>Investors shed assets that are often perceived as less-risky, such as high-dividend stocks and precious metals. But at midday, a decline in the price of oil led markets lower.</p> \n\n<p>The Dow Jones Industrial Average dropped 23 points, or 0.1%, to 17896 and the S&amp;amp;P 500 dropped 0.1%. The tech-heavy Nasdaq Composite rose 0.4%.</p> \n\n<p>U.S.-traded crude oil for August delivery fell 4.8% to $45.14 a barrel as data showed inventories fell less than expected and investors worried that global demand was softening. Stocks traded slightly higher Thursday morning before switching direction around noon, after the oil inventory numbers were released.</p> \n\n<p>"Crude oil took everything lower," said Ilya Feygin, managing director at brokerage firm WallachBeth. "Peo

In [None]:
# Clean and tokenize every example. Both splits use the same per-example
# mapping, so each call receives a single article string in data['text'],
# and both skip the on-disk cache so the current function is always applied.
train_dataset = train_dataset.map(convert_to_features, load_from_cache_file=False)
val_dataset = val_dataset.map(convert_to_features, load_from_cache_file=False)

In [None]:
# Restrict both splits to the listed columns and have them returned as torch tensors.
# NOTE(review): presumably every listed column must already exist in the
# datasets — confirm the preceding map step actually produces 'targets'.
columns = ['input_ids', 'attention_mask', 'targets']
train_dataset.set_format(type='torch', columns=columns)
val_dataset.set_format(type='torch', columns=columns)