In [19]:
from itertools import cycle
import random
names = ["Mekael", "Brendan", "Eve", "Jackson"]
random.shuffle(names)
circular_names = cycle(names)
next(circular_names)
circular_pairs = [(names[i], next(circular_names)) for i in range(len(names))]
print(circular_pairs)


[('Brendan', 'Jackson'), ('Jackson', 'Mekael'), ('Mekael', 'Eve'), ('Eve', 'Brendan')]


In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [242]:
torch.cuda.empty_cache()

In [None]:
# Dataset Configuration and Split
dataset = load_dataset("glue","stsb")
trainDataset = dataset["train"]
validationDataset = dataset["validation"]
testDataset = dataset["test"]

In [None]:
trainDataset[0]

In [215]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels = 1)
model = model.to(device)

text = "Didn't get here being careful"
encoded_input = tokenizer(text, return_tensors='pt').to(device)
output = model(**encoded_input)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenize Datasets Demo

In [None]:
def tokenizeFunction(data):
    return tokenizer(data['sentence1'],data['sentence2'], truncation=True, padding='max_length')

In [None]:
tokenizedTrainDataset = trainDataset.map(tokenizeFunction, batched = True)
tokenizedValidationDataset = validationDataset.map(tokenizeFunction, batched = True)
tokenizedTestDataset = testDataset.map(tokenizeFunction, batched = True)

In [None]:
tokenizedTrainDataset = tokenizedTrainDataset.map(lambda batch: {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask'], 'labels': batch['label']}, remove_columns=['sentence1', 'sentence2', 'label', 'idx'])
tokenizedValidationDataset = tokenizedValidationDataset.map(lambda batch: {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask'], 'labels': batch['label']}, remove_columns=['sentence1', 'sentence2', 'label', 'idx'])
tokenizedTestDataset = tokenizedTestDataset.map(lambda batch: {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask'], 'labels': batch['label']}, remove_columns=['sentence1', 'sentence2', 'label', 'idx'])

In [217]:
tokenizedTrainDataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5749
})

In [218]:
tokenizedValidationDataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1500
})

In [227]:
import json
import os

checkpoint_dir = './results/checkpoint-7000'  # Replace with your checkpoint directory

trainer_state_path = os.path.join(checkpoint_dir, 'trainer_state.json')

with open(trainer_state_path, 'r') as f:
    trainer_state = json.load(f)

last_epoch = trainer_state['epoch']


In [228]:
# Configure Training Arguments
trainingArgs = TrainingArguments(
    output_dir= "./results",
    num_train_epochs= last_epoch + 6,
    per_device_eval_batch_size= 8,
    per_device_train_batch_size= 8,
    warmup_steps=100,
    weight_decay=0.01,
    evaluation_strategy= "epoch",
    logging_dir = "./logs",
    logging_steps=100,
    do_train = True,
    do_eval = True,
    learning_rate = 5e-5,
)

# Configure Trainer
trainer = Trainer(
    model=model,
    
    args=trainingArgs,
    train_dataset=tokenizedTrainDataset,
    eval_dataset= tokenizedValidationDataset,
    compute_metrics=lambda eval_preds: {"mse": ((eval_preds.predictions - eval_preds.label_ids) ** 2).mean()}
)

In [229]:
trainer.train()
trainer.evaluate()

  0%|          | 0/7814 [00:00<?, ?it/s]

{'loss': 0.2706, 'learning_rate': 5e-05, 'epoch': 0.14}
{'loss': 0.4034, 'learning_rate': 4.935182784547576e-05, 'epoch': 0.28}
{'loss': 0.349, 'learning_rate': 4.8703655690951524e-05, 'epoch': 0.42}
{'loss': 0.2987, 'learning_rate': 4.8055483536427274e-05, 'epoch': 0.56}
{'loss': 0.3105, 'learning_rate': 4.740731138190304e-05, 'epoch': 0.7}
{'loss': 0.3103, 'learning_rate': 4.6759139227378796e-05, 'epoch': 0.83}
{'loss': 0.2309, 'learning_rate': 4.611096707285455e-05, 'epoch': 0.97}


  0%|          | 0/188 [00:00<?, ?it/s]

{'eval_loss': 0.665998101234436, 'eval_mse': 4.747801303863525, 'eval_runtime': 14.614, 'eval_samples_per_second': 102.641, 'eval_steps_per_second': 12.864, 'epoch': 1.0}
{'loss': 0.2547, 'learning_rate': 4.546279491833031e-05, 'epoch': 1.11}
{'loss': 0.3169, 'learning_rate': 4.481462276380607e-05, 'epoch': 1.25}
{'loss': 0.236, 'learning_rate': 4.416645060928183e-05, 'epoch': 1.39}
{'loss': 0.2766, 'learning_rate': 4.351827845475759e-05, 'epoch': 1.53}
{'loss': 0.2689, 'learning_rate': 4.287010630023334e-05, 'epoch': 1.67}
{'loss': 0.2696, 'learning_rate': 4.2221934145709104e-05, 'epoch': 1.81}
{'loss': 0.2856, 'learning_rate': 4.157376199118486e-05, 'epoch': 1.95}


  0%|          | 0/188 [00:00<?, ?it/s]

{'eval_loss': 0.5439280867576599, 'eval_mse': 4.394655227661133, 'eval_runtime': 19.6004, 'eval_samples_per_second': 76.529, 'eval_steps_per_second': 9.592, 'epoch': 2.0}
{'loss': 0.234, 'learning_rate': 4.092558983666062e-05, 'epoch': 2.09}
{'loss': 0.2145, 'learning_rate': 4.0277417682136376e-05, 'epoch': 2.23}
{'loss': 0.2034, 'learning_rate': 3.9629245527612133e-05, 'epoch': 2.36}
{'loss': 0.2223, 'learning_rate': 3.89810733730879e-05, 'epoch': 2.5}
{'loss': 0.2042, 'learning_rate': 3.8332901218563655e-05, 'epoch': 2.64}
{'loss': 0.2487, 'learning_rate': 3.768472906403941e-05, 'epoch': 2.78}
{'loss': 0.2753, 'learning_rate': 3.703655690951517e-05, 'epoch': 2.92}


  0%|          | 0/188 [00:00<?, ?it/s]

{'eval_loss': 0.5000523924827576, 'eval_mse': 4.338993549346924, 'eval_runtime': 541.55, 'eval_samples_per_second': 2.77, 'eval_steps_per_second': 0.347, 'epoch': 3.0}
{'loss': 0.2404, 'learning_rate': 3.638838475499093e-05, 'epoch': 3.06}
{'loss': 0.199, 'learning_rate': 3.574021260046669e-05, 'epoch': 3.2}
{'loss': 0.2057, 'learning_rate': 3.509204044594244e-05, 'epoch': 3.34}
{'loss': 0.2292, 'learning_rate': 3.44438682914182e-05, 'epoch': 3.48}
{'loss': 0.2141, 'learning_rate': 3.379569613689396e-05, 'epoch': 3.62}
{'loss': 0.2163, 'learning_rate': 3.314752398236972e-05, 'epoch': 3.76}


KeyboardInterrupt: 

Inference

In [None]:
model_path = "./results/checkpoint-7000"
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained(model_path, num_labels = 1)


In [230]:
a = ["Counter: This node relates to counting or tallying various items or values, making it useful for inventory management or statistical queries.", 'Up Button: A sub-node of "Counter," it signifies an action of incrementing or moving upwards in numerical count.', 'Down Button: Another sub-node of "Counter," it denotes an action of decrementing or moving downwards in numerical count.']

inp = "Give me more"
print(inp)

for x in a:
    inputs = tokenizer(x,inp,return_tensors='pt').to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    prediction = logits.sigmoid()

    print(f'{prediction[0].item()} || {x}')

Give me more
0.7907435297966003 || Counter: This node relates to counting or tallying various items or values, making it useful for inventory management or statistical queries.
0.6874837279319763 || Up Button: A sub-node of "Counter," it signifies an action of incrementing or moving upwards in numerical count.
0.7884219884872437 || Down Button: Another sub-node of "Counter," it denotes an action of decrementing or moving downwards in numerical count.


In [None]:
results = trainer.predict(test_dataset=tokenizedTestDataset)

# Get predictions
test_predictions = trainer.predict(tokenizedTestDataset)
# Extract the predictions and labels from the result object
predictions = test_predictions.predictions
labels = test_predictions.label_ids

In [None]:
for x in test_predictions.predictions:
    print(x)

In [231]:
from scipy.stats import pearsonr

# Predictions
predictions = trainer.predict(tokenizedValidationDataset)
predicted_scores = predictions.predictions.flatten()

# True labels
true_scores = predictions.label_ids

# Pearson Correlation
corr, _ = pearsonr(true_scores, predicted_scores)

print(f'Pearson Correlation: {corr}')


  0%|          | 0/188 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(true_scores, predicted_scores)
print(f'Mean Squared Error: {mse}')


Train even more, let's overfit why not. Overfitting = Low Test Loss = Happy Investors

In [None]:
# Configure Training Arguments
trainingArgs = TrainingArguments(
    output_dir= "./results",
    num_train_epochs= 4,
    per_device_eval_batch_size= 4,
    per_device_train_batch_size= 4,
    evaluation_strategy= "epoch",
    logging_dir = "./logs",
    do_train = True,
    do_eval = True,
    learning_rate = 2e-5,
    warmup_ratio=0.1,
)

# Configure Trainer
trainer = Trainer(
    model=model,
    args=trainingArgs,
    train_dataset=tokenizedTrainDataset,
    eval_dataset= tokenizedValidationDataset,
    compute_metrics=lambda eval_preds: {"mse": ((eval_preds.predictions - eval_preds.label_ids) ** 2).mean()}
)

In [None]:
trainer.train()
trainer.evaluate()

In [245]:
from scipy.stats import pearsonr

# Predictions
predictions = trainer.predict(tokenizedValidationDataset)
predicted_scores = predictions.predictions.flatten()

# True labels
true_scores = predictions.label_ids

# Pearson Correlation
corr, _ = pearsonr(true_scores, predicted_scores)

print(f'Pearson Correlation: {corr}')




  0%|          | 0/188 [00:00<?, ?it/s]

Pearson Correlation: 0.884903959934462


In [246]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(true_scores, predicted_scores)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 0.5120729804039001
