In [2]:
from model import Model
from readers import read_ozrock, convert_to_dataset, get_label_list
import torch
from datasets import Dataset, DatasetDict
from transformers import RobertaForTokenClassification, RobertaTokenizerFast, TrainingArguments

In [3]:
# Select the compute device used by the rest of the notebook.
# `device` is always defined (falling back to the CPU), and the CUDA
# index/name queries only run when a GPU is actually present, since they
# raise on machines without CUDA.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
if cuda_available:
    print("Switching to GPU...")
    device = torch.device('cuda')
    print("Current device index:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print("CUDA not available, falling back to CPU.")
    device = torch.device('cpu')

CUDA available: True
Switching to GPU...
Current device index: 0
Device name: NVIDIA GeForce RTX 4070 Ti


In [None]:
# Locations of the two OzRock text files handed to the project reader.
# NOTE(review): judging by the file names, the first is the auto-labelled
# set and the second the evaluation set — confirm against readers.read_ozrock.
auto_labelled_path = './OzRock/AutoLabelledSet.txt'
evaluation_path = './OzRock/EvaluationSet.txt'

# The reader returns the datasets plus the label vocabulary and both
# directions of the label <-> id mapping.
datasets, label_list, label2id, id2label = read_ozrock(auto_labelled_path, evaluation_path)

In [6]:
# Checkpoint name shared by the tokenizer and the token-classification model.
model_name = 'roberta-large'

# NOTE(review): add_prefix_space=True is presumably required because the
# dataset is tokenized from pre-split words — confirm in Model.tokenize_datasets.
tokenizer = RobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=True)

# Size the classification head from the label vocabulary and attach both
# label mappings to the model.
model = RobertaForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    label2id=label2id,
    id2label=id2label,
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Trainer hyper-parameters, collected in one place so they are easy to tune.
trainer_config = dict(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Evaluate and checkpoint on a step schedule rather than per epoch.
    eval_strategy="steps",
    save_strategy="steps",
    logging_steps=100,
    save_total_limit=1,
    # Optimisation settings.
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    # Reload the best checkpoint, ranked by the "f1" metric, after training.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)
training_args = TrainingArguments(**trainer_config)

# Wrap the tokenizer and model in the project-local Model helper (model.py).
model_instance = Model(
    device=device,
    model_name=model_name,
    tokenizer=tokenizer,
    model=model,
    label_list=label_list,
)
model_instance.set_training_args(training_args)

# Tokenize the datasets through the wrapper; the result feeds training and evaluation.
tokenized_datasets = model_instance.tokenize_datasets(datasets)

Tokenizing datasets...


Map:   0%|          | 0/31942 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [8]:
# Fine-tune on the tokenized datasets.
model_instance.train(tokenized_datasets)

# Persist the model and its tokenizer side by side in one directory.
save_dir = "./models/" + model_name + "-model"
model_instance.trainer.save_model(save_dir)
model_instance.tokenizer.save_pretrained(save_dir)

  trainer = Trainer(


Starting training...


Step,Training Loss,Validation Loss,Precision,Recall,F1
100,0.4074,0.277269,0.473277,0.570566,0.517387
200,0.1858,0.219719,0.628039,0.647813,0.637773
300,0.1354,0.201651,0.698138,0.686253,0.692144
400,0.1243,0.239374,0.638175,0.709317,0.671868
500,0.1142,0.202014,0.719644,0.75508,0.736936
600,0.094,0.221036,0.732916,0.728354,0.730628
700,0.0807,0.222236,0.716712,0.772469,0.743547
800,0.0748,0.220785,0.73393,0.764964,0.749126
900,0.0803,0.21161,0.702851,0.771554,0.735602
1000,0.0612,0.204609,0.718767,0.742449,0.730416


('./models/roberta-large-model\\tokenizer_config.json',
 './models/roberta-large-model\\special_tokens_map.json',
 './models/roberta-large-model\\vocab.json',
 './models/roberta-large-model\\merges.txt',
 './models/roberta-large-model\\added_tokens.json',
 './models/roberta-large-model\\tokenizer.json')

In [9]:
# Score the trained model on the "eval" split; the returned metrics dict is
# the cell's displayed output.
eval_split = tokenized_datasets["eval"]
model_instance.trainer.evaluate(eval_split)

{'eval_loss': 0.227824866771698,
 'eval_precision': 0.7506575486586007,
 'eval_recall': 0.7836353651839648,
 'eval_f1': 0.7667920472864052,
 'eval_runtime': 9.9803,
 'eval_samples_per_second': 200.395,
 'eval_steps_per_second': 12.525,
 'epoch': 1.0}

In [11]:
# Run the fine-tuned model on a single raw-text sample and collect one
# predicted label id per sub-word token.
sample = "The Bogada Bore project sits at the northern end of the north-northwest trending Yandal Greenstone belt in the north-eastern Goldfields region of the Archaean Yilgarn Craton. The regional geology of the Yandal belt and the greenstone stratigraphy in the Jundee Mining area is documented by Kohler & Phillips (2003), Wyche & Farrell (2000) and Vearncombe et al (2000) (Figure 3)."

# Switch to inference mode explicitly so dropout is disabled regardless of
# what state an earlier cell left the model in.
model.eval()

inputs = tokenizer(
    sample,
    return_tensors="pt",
    truncation=True,
    padding=True,
    is_split_into_words=False  # Only use True if you're passing tokenized input
)

# Move every input tensor onto the same device as the model.
inputs = {k: v.to(model.device) for k, v in inputs.items()}
with torch.no_grad():
    outputs = model(**inputs)

# Take the first (only) sequence of the batch by index rather than
# .squeeze(), which would also collapse a length-1 token dimension and
# turn the result into a bare int instead of a list.
predictions = outputs.logits.argmax(dim=-1)[0].tolist()



In [16]:
# Recover the sub-word tokens of the (single) encoded sequence.
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# Translate each predicted class index into its label string.
predicted_labels = [id2label[label_id] for label_id in predictions]

# Print one "token  label" row per sub-word token, token left-aligned to 15 chars.
for token, label in zip(tokens, predicted_labels):
    print(f"{token:<15} {label}")


<s>             O
ĠThe            O
ĠBog            O
ada             O
ĠBore           O
Ġproject        O
Ġsits           O
Ġat             O
Ġthe            O
Ġnorthern       O
Ġend            O
Ġof             O
Ġthe            O
Ġnorth          O
-               O
north           O
west            O
Ġtrending       O
ĠY              O
andal           O
ĠGreen          B-STRAT
stone           I-STRAT
Ġbelt           I-STRAT
Ġin             O
Ġthe            O
Ġnorth          O
-               O
e               O
astern          O
ĠGold           I-LOCATION
fields          I-LOCATION
Ġregion         O
Ġof             O
Ġthe            O
ĠArchae         B-TIMESCALE
an              B-TIMESCALE
ĠY              B-LOCATION
il              I-LOCATION
g               I-LOCATION
arn             I-LOCATION
ĠCr             I-LOCATION
aton            I-LOCATION
.               O
ĠThe            O
Ġregional       O
Ġge             O
ology           O
Ġof             O
Ġthe            O
ĠY      

In [13]:
# Display the id -> label mapping used to decode the predicted class indices.
id2label

{0: 'B-LOCATION',
 1: 'B-MINERAL',
 2: 'B-ORE_DEPOSIT',
 3: 'B-ROCK',
 4: 'B-STRAT',
 5: 'B-TIMESCALE',
 6: 'I-LOCATION',
 7: 'I-MINERAL',
 8: 'I-ORE_DEPOSIT',
 9: 'I-ROCK',
 10: 'I-STRAT',
 11: 'I-TIMESCALE',
 12: 'O'}