In [64]:
import csv
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

In [19]:
# Read the local claims CSV with the generic "csv" loader. The displayed result
# is a DatasetDict whose only split is "train".
dataset = load_dataset("csv", data_files="data/claimsKGmini.csv")
dataset
    
    


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'date', 'truthRating', 'ratingName', 'author', 'headline', 'named_entities_claim', 'named_entities_article', 'keywords', 'source', 'sourceURL', 'link', 'language'],
        num_rows: 797
    })
})

In [24]:
# Keep a handle on the "train" split and display one row (index 7) to see the
# available columns and what a typical record looks like.
train_data = dataset["train"]
train_data[7]

{'id': 'http://data.gesis.org/claimskg/claim_review/34867b12-14f7-5b5d-a92b-0955d8c1456a',
 'text': 'Earth has not warmed for the last 17 years.',
 'date': '2014-03-11',
 'truthRating': 2,
 'ratingName': 'MIXTURE',
 'author': 'Unknown',
 'headline': "Climate change skeptic Patrick Moore says Earth has 'not warmed for the last 17 years'",
 'named_entities_claim': '2013 State of the Union,Aleutians,Amchitka Island,American nuclear weapons,Bob Beckel,Boulder,Climate Change,El Niño,Fox News,Global surface temperatures,Goddard Institute,Goddard Institute for Space Studies,Greenpeace,Greenpeace Canada,Hannity,Intergovernmental Panel on Climate Change,Kevin Trenberth,March 11,NASA,NOAA,National Center for Atmospheric Research,Patrick Moore,Phyllis Cormack,PolitiFact,President Barack Obama,PunditFact,Sean Hannity,United Nations,University of Maine,carbon emissions,cherry-picked,cherry-picking,climate change,climate change research,cold weather,combat climate change,degrees Celsius,global surfa

In [28]:
# Load the tokenizer that belongs to the climatebert/environmental-claims
# checkpoint; the tokenizer files are fetched when they are not available locally.
tokenizer = AutoTokenizer.from_pretrained("climatebert/environmental-claims")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [35]:
# Sanity check: show the first claim next to the sub-word pieces the tokenizer
# splits it into.
sample_text = train_data[0]["text"]
print("Original text:", sample_text)

sample_tokens = tokenizer.tokenize(sample_text)
print("Tokenised:", sample_tokens)

Original text: NASA admitted that climate change occurs naturally as a result of changes in Earth's solar orbit and not anthropogenic factors.
Tokenised: ['NASA', 'Ġadmitted', 'Ġthat', 'Ġclimate', 'Ġchange', 'Ġoccurs', 'Ġnaturally', 'Ġas', 'Ġa', 'Ġresult', 'Ġof', 'Ġchanges', 'Ġin', 'ĠEarth', "'s", 'Ġ', 'solar', 'Ġorbit', 'Ġand', 'Ġnot', 'Ġanthrop', 'ogenic', 'Ġ', 'factors', '.']


In [74]:
def custom_tokenize(examples, max_length=100):
    """Tokenise the "text" column of a batch, padding/truncating to a fixed length.

    Intended to be passed to ``Dataset.map(..., batched=True)``. A different
    length can be supplied without editing this function, e.g.
    ``train_data.map(custom_tokenize, batched=True, fn_kwargs={"max_length": 128})``.

    Args:
        examples: A batch of rows; only its ``"text"`` entry is read.
        max_length: Number of tokens every sequence is padded or truncated to.
            The default of 100 leaves headroom over the longest claim in
            claimsKGmini.csv (65 words, measured at 76 tokens). Re-measure
            the maximum whenever more data is added.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(examples["text"], max_length=max_length, padding="max_length", truncation=True)

# Run the tokenizer over the whole training split, one batch at a time.
tokenized_training_dataset = train_data.map(custom_tokenize, batched=True)

# Preview a single row. "input_ids" holds the token ids and "attention_mask"
# marks real tokens with 1 and padding with 0. Every row is padded out to the
# max_length used by custom_tokenize, so short claims end in a long run of padding.
preview_row = tokenized_training_dataset[19]
for column in ("text", "input_ids", "attention_mask"):
    print(preview_row[column])

Map:   0%|          | 0/797 [00:00<?, ? examples/s]

The Obama administration’s own Environmental Protection Agency has said its Clean Power Plan will have a marginal impact on climate change.
[0, 133, 1284, 42220, 50416, 282, 50267, 29, 308, 6982, 5922, 3131, 34, 26, 63, 10326, 3029, 5427, 40, 33, 10, 14612, 913, 15, 2147, 464, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
# Find the length, in tokens, of the longest claim in the training split.
#
# The raw text is tokenised here with padding and truncation switched off, so
# the answer does not depend on the max_length/padding settings used by
# custom_tokenize and no other cell has to be edited before running this one.
unpadded_encodings = tokenizer(list(train_data["text"]), padding=False, truncation=False)
lengths = [len(tokens) for tokens in unpadded_encodings["input_ids"]]

# Use the builtin max() instead of rebinding the name `max` to an int, which
# would break every later max(...) call in this kernel. default=0 keeps the
# result defined for an empty dataset.
# NOTE: if an older version of this cell already ran in this kernel, `max` is
# still bound to an int; restart the kernel (or run `del max`) first.
max_token_length = max(lengths, default=0)
max_token_length

76