In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel, AdamW
import torch.nn as nn
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from scipy.stats import expon, loguniform, uniform
import os
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_data_from_csv(file_path):
    dataframe = pd.read_csv(file_path)
    return dataframe

cwd = os.getcwd()
file_path = os.path.join(cwd, r"data\initial_training_data\new_data.csv")
dataset = load_data_from_csv(file_path)
dataset

Unnamed: 0,text,AwT score,SoE score,ID
0,Nutritional status and gene polymorphisms of o...,0.9,0.2,PMC9569987
1,Thrombophilic gene polymorphism is known to be...,0.9,0.5,PMC6045916
2,Background Whether adiponectin (ADIPOQ) polymo...,0.95,1.0,PMC6278103
3,"Polycystic ovary syndrome (PCOS) is a common, ...",0.95,1.0,PMC4557132
4,Objective: Endometriosis has been considered a...,0.95,0.55,21429654
5,Objective To present the development of the fi...,0.95,0.55,PMC7169920
6,Introduction: The aim of the study was to eval...,0.95,0.6,28819944
7,Background: Although the precise pathophysiolo...,0.95,1.0,18277167
8,Background Key reactions in folate-mediated si...,0.95,0.4,PMC8792379
9,Epidemiological studies have suggested that th...,0.95,0.6,PMID: 25102261


In [3]:
def randomize_scores(scores, max_deviation= 0.05):
    randomized_scores = scores * (1 + np.random.uniform(-max_deviation,max_deviation,size=scores.shape))
    return np.clip(randomized_scores,0,1)

train_y = dataset[["AwT score", "SoE score"]].values

In [4]:
def penalty_function_AwT(AwT, alpha=2, epsilon=1e-6):
    return torch.exp(-(AwT + epsilon)**alpha)

def reward_function_AwT(AwT, beta=2):
    return (torch.exp(AwT - 0.5))**beta

def penalty_function_SoE(SoE, gamma=2, epsilon=1e-6):
    return torch.exp(-(SoE + epsilon)**gamma)

def reward_function_SoE(SoE, delta=2):
    return (torch.exp(SoE - 0.5))**delta

def calculate_final_score(SoE, AwT, w_SoE=0.4, w_AwT=0.6, alpha=2, beta=2, gamma=2, delta=2):
    base_score = w_SoE * SoE + w_AwT * AwT

    AwT_score = torch.where(AwT < 0.5, base_score * penalty_function_AwT(AwT, alpha), base_score * reward_function_AwT(AwT, beta))
    final_score = torch.where(SoE < 0.5, AwT_score * penalty_function_SoE(SoE, gamma), AwT_score * reward_function_SoE(SoE, delta))
    
    return final_score

def custom_scorer(y_true, y_pred):
    scores = []
    for true, pred in zip(y_true, y_pred):
        score = calculate_final_score(true[1], true[0])  # Assuming y_true contains [AwT, SoE]
        scores.append(score)
    return mean_squared_error(y_true, scores)

def evaluate_hyperparameters(alpha, beta, gamma, delta):
    def model_evaluation(SoE, AwT):
        return calculate_final_score(SoE, AwT, alpha=alpha, beta=beta, gamma=gamma, delta=delta)
    return model_evaluation

In [5]:
# Load your dataset
# dataset = pd.read_csv("path_to_your_dataset.csv")
# Assuming 'text' column contains abstracts and 'score' column contains labels

def custom_loss(predictions, targets, w_SoE=0.4, w_AwT=0.6, alpha=2, beta=2, gamma=2, delta=2):
    mse_loss = F.mse_loss(predictions, targets)
    
    AwT = targets[:, 0]
    SoE = targets[:, 1]
    final_scores = calculate_final_score(SoE, AwT, w_SoE, w_AwT, alpha, beta, gamma, delta)
    
    reward_punishment_term = torch.tensor(final_scores, dtype=torch.float32, device=predictions.device)
    
    # Integrate the reward/punishment term with the MSE loss
    total_loss = mse_loss - reward_punishment_term.mean()
    
    return total_loss

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
scores_to_randomize = dataset[["AwT score", "SoE score"]].values
randomized_scores = randomize_scores(scores_to_randomize)
dataset[["AwT score", "SoE score"]] = randomized_scores
train_data, test_data = train_test_split(dataset, test_size=0.4, random_state=42)

# Initializing tokenizer
tokenizer = BertTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

# Function to prepare DataLoader
def prepare_dataloader(data, batch_size=6, test=False):
    inputs = tokenizer(data["text"].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
    labels = torch.tensor(data[["AwT score", "SoE score"]].values).float()
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    return dataloader

train_dataloader = prepare_dataloader(train_data,batch_size=15)

class BertForRegression(nn.Module):
    def __init__(self, model_name, hidden_size=768):
        super(BertForRegression, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.regressor = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask, token_type_ids=None, return_embeddings=False):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        if return_embeddings:
            return pooled_output
        return self.regressor(pooled_output)
    
def save_model(model, path):
    torch.save(model.state_dict(), path)

def train_model(train_dataloader, device, epochs = 4, model_index = 0):
    model = BertForRegression("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5) # test value  # torch.optim.AdamW
    #criterion = nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_labels = [item.to(device) for item in batch] #co to jest b_input_ids
            optimizer.zero_grad()
            outputs = model(b_input_ids,b_input_mask) #co to jest????
            #loss = criterion(outputs.squeeze(),b_labels)
            loss = custom_loss(outputs, b_labels)  # Use the custom loss function #outputs = predictions , b_labels = targets 
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        avg_train_loss = total_loss / len(train_dataloader)
        model_index += 1
        print(f"Epoch {epoch + 1}, Loss: {avg_train_loss}")
        save_model(model, f"trained_model_{model_index}.pt")
    
    return model

num_models = 2
models = [train_model(train_dataloader, device) for _ in range(num_models)]



def evaluate_models(models, test_dataloader, device):
    all_predictions = []
    true_scores = []
    for model in models:
        model.eval()
        predictions = []
        with torch.no_grad():
            for batch in test_dataloader:
                b_input_ids, b_input_mask, b_labels = [item.to(device) for item in batch]
                outputs = model(b_input_ids, b_input_mask)
                predictions.extend(outputs.squeeze().cpu().numpy())
                true_scores.extend(b_labels.cpu().numpy())
        all_predictions.append(predictions)
    avg_predictions = np.mean(all_predictions, axis=0)
    true_scores = np.array(true_scores)
    # Calculate custom scores
    SoE_scores = true_scores[:, 1]
    AwT_scores = true_scores[:, 0]
    final_scores = []
    for i in range(len(avg_predictions)):
        final_score = calculate_final_score(avg_predictions[i][1], avg_predictions[i][0])
        final_scores.append(final_score)
    
    final_scores = np.array(final_scores)
    return avg_predictions, all_predictions, final_scores

test_dataloader = prepare_dataloader(test_data, batch_size=15, test=True) # Use your actual test data here


  reward_punishment_term = torch.tensor(final_scores, dtype=torch.float32, device=predictions.device)


Epoch 1, Loss: -2.6905455589294434
Epoch 2, Loss: -3.08303165435791
Epoch 3, Loss: -3.2622413635253906
Epoch 4, Loss: -3.287731885910034
Epoch 1, Loss: -2.8857524394989014
Epoch 2, Loss: -3.187039852142334
Epoch 3, Loss: -3.268559455871582
Epoch 4, Loss: -3.2718114852905273


In [8]:
def evaluate_models(models, test_dataloader, device):
    all_predictions = []
    true_scores = []

    for model in models:
        model.eval()
        predictions = []
        with torch.no_grad():
            for batch in test_dataloader:
                b_input_ids, b_input_mask, b_labels = [item.to(device) for item in batch]
                outputs = model(b_input_ids, b_input_mask)
                predictions.extend(outputs.cpu().numpy())
                true_scores.extend(b_labels.cpu().numpy())
        all_predictions.append(predictions)

    avg_predictions = np.mean(all_predictions, axis=0)
    true_scores = np.array(true_scores)

    # Calculate custom scores
    SoE_scores = true_scores[:, 1]
    AwT_scores = true_scores[:, 0]
    
    final_scores = calculate_final_score(
        torch.tensor(SoE_scores, device=device),
        torch.tensor(AwT_scores, device=device),
        w_SoE=0.4, w_AwT=0.6, alpha=2, beta=2, gamma=2, delta=2
    ).cpu().numpy()

    return avg_predictions, all_predictions, final_scores

In [6]:
param_distributions = {
    'alpha' : [expon(scale=1.0), uniform(0.1,1.9)],
    'beta' : [loguniform(1e-3,1e1),uniform(0.1,1.9)],
    'gamma' : [expon(scale=1.0),uniform(0.1,1.9)],
    'delta' : [uniform(0.1,1.9), expon(scale=1.0)]
}

random_search = RandomizedSearchCV(
    estimator=evaluate_hyperparameters,
    param_distributions=param_distributions,
    n_iter=100,
    scoring=make_scorer(custom_scorer, greater_is_better=True),
    cv=5,
    random_state=42
)

x_train = dataset[["AwT score", "SoE score"]].values
y_train = dataset[["AwT score", "SoE score"]].values

random_search.fit(x_train, y_train)
best_params = random_search.best_params_
print(f"Best parameters: {best_params}")

KeyError: "None of [Index(['AwT score', 'SoE score'], dtype='object')] are in the [columns]"

In [9]:
avg_predictions, all_predictions, final_score = evaluate_models(models, test_dataloader, device)
# Assuming true_scores is already defined correctly
true_scores = dataset.drop(test_data.index)
train_scores_true = dataset.drop(true_scores.index)
train_scores_true = train_scores_true.iloc[:,1:3]
print(train_scores_true)
#Check the shape of avg_predictions
print(f"Shape of avg_predictions: {avg_predictions.shape}")
print(f"Shape of true_scores: {train_scores_true.shape}")

mse = ((avg_predictions - train_scores_true) ** 2).mean(axis=0)
print(f"Average MSE: {mse}")





    AwT score  SoE score
0    0.905837   0.204037
5    0.975222   0.538041
8    0.917598   0.415331
9    0.928707   0.580388
11   0.961822   1.000000
13   0.737319   0.541878
Shape of avg_predictions: (6, 2)
Shape of true_scores: (6, 2)
Average MSE: AwT score    0.032119
SoE score    0.125733
dtype: float64


array([[0.98249406, 0.8352511 ],
       [1.0636257 , 0.81183314],
       [1.0792396 , 0.7447859 ],
       [1.0759197 , 0.7246385 ],
       [1.063615  , 0.7327546 ],
       [1.0849732 , 0.825143  ]], dtype=float32)