In [2]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel, AdamW
import torch.nn as nn
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from scipy.stats import expon, loguniform, uniform
import os

In [5]:
def load_data_from_csv(file_path):
    dataframe = pd.read_csv(file_path)
    return dataframe

cwd = os.getcwd()
file_path = os.path.join(cwd, r"data\initial_training_data\new_data.csv")
dataset = load_data_from_csv(file_path)
dataset


Unnamed: 0,text,AwT score,SoE score,ID
0,Nutritional status and gene polymorphisms of o...,0.9,0.2,PMC9569987
1,Thrombophilic gene polymorphism is known to be...,0.9,0.5,PMC6045916
2,Background Whether adiponectin (ADIPOQ) polymo...,0.95,1.0,PMC6278103
3,"Polycystic ovary syndrome (PCOS) is a common, ...",0.95,1.0,PMC4557132
4,Objective: Endometriosis has been considered a...,0.95,0.55,21429654
5,Objective To present the development of the fi...,0.95,0.55,PMC7169920
6,Introduction: The aim of the study was to eval...,0.95,0.6,28819944
7,Background: Although the precise pathophysiolo...,0.95,1.0,18277167
8,Background Key reactions in folate-mediated si...,0.95,0.4,PMC8792379
9,Epidemiological studies have suggested that th...,0.95,0.6,PMID: 25102261


In [13]:
def randomize_scores(scores, max_deviation= 0.05):
    randomized_scores = scores * (1 + np.random.uniform(-max_deviation,max_deviation,size=scores.shape))
    return np.clip(randomized_scores,0,1)

train_x = dataset["text"].tolist()
train_y = dataset[[" AwT score", " SoE score"]].values

[[0.9  0.2 ]
 [0.9  0.5 ]
 [0.95 1.  ]
 [0.95 1.  ]
 [0.95 0.55]
 [0.95 0.55]
 [0.95 0.6 ]
 [0.95 1.  ]
 [0.95 0.4 ]
 [0.95 0.6 ]
 [0.95 0.55]
 [0.95 1.  ]
 [0.75 0.4 ]
 [0.75 0.55]
 [0.85 0.5 ]]
[[0.9173911  0.20410323]
 [0.89328658 0.50586134]
 [0.96121359 1.        ]
 [0.99524756 1.        ]
 [0.93993131 0.56981727]
 [0.91885347 0.56277266]
 [0.93643383 0.60526664]
 [0.92387992 1.        ]
 [0.98821685 0.3994998 ]
 [0.94048028 0.60594862]
 [0.9102192  0.52372851]
 [0.97351655 0.99271368]
 [0.72334145 0.41483661]
 [0.77450697 0.54330418]
 [0.86813289 0.50324535]]


In [121]:


#This will be a separate module
def penalty_function_AwT(AwT, alpha = 1, epsilon = 1e-6):
    if AwT < 0.5:
        return 1 / (AwT+epsilon)**alpha
    else:
        return 1

def reward_function_AwT(AwT, beta = 1):
    if AwT > 0.5:
        return (np.exp(AwT - 0.5))**beta
    else:
        return 1

def penalty_function_SoE(SoE, gamma=0.5, epsilon=1e-6):
    if SoE < 0.5:
        return 1 / (SoE + epsilon)**gamma
    else:
        return 1
    
def reward_function_SoE(SoE, delta=0.5):
    if SoE > 0.5:
        return (np.exp(SoE - 0.5))**delta
    else:
        return 1

def calculate_final_score(SoE, AwT, w_SoE=0.4, w_AwT=0.6, alpha=1, beta=1, gamma=0.5, delta=0.5):
    base_score = w_SoE * SoE + w_AwT * AwT
    
    if AwT < 0.5:
        AwT_score = base_score * penalty_function_AwT(AwT, alpha)
    else:
        AwT_score = base_score * reward_function_AwT(AwT, beta)
    
    if SoE < 0.5:
        final_score = AwT_score * penalty_function_SoE(SoE, gamma)
    else:
        final_score = AwT_score * reward_function_SoE(SoE, delta)
    
    return final_score

def custom_scorer(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)

def evaluate_hyperparameters(alpha, beta, gamma, delta):
    def model_evaluation(SoE, AwT):
        return calculate_final_score(SoE, AwT, alpha=alpha, beta=beta, gamma=gamma, delta=delta)
    return model_evaluation


print(calculate_final_score(SoE=0.1,AwT=0.1))
print(calculate_final_score(SoE=1,AwT=1))
print("not what we wanted")

3.1622302265963973
2.117000016612675


In [119]:
def penalty_function_AwT(AwT, alpha=2, epsilon=1e-6):
    return np.exp(-(AwT + epsilon)**alpha)

def reward_function_AwT(AwT, beta=2):
    return (np.exp(AwT - 0.5))**beta

def penalty_function_SoE(SoE, gamma=2, epsilon=1e-6):
    return np.exp(-(SoE + epsilon)**gamma)

def reward_function_SoE(SoE, delta=2):
    return (np.exp(SoE - 0.5))**delta

def calculate_final_score(SoE, AwT, w_SoE=0.4, w_AwT=0.6, alpha=2, beta=2, gamma=2, delta=2):
    base_score = w_SoE * SoE + w_AwT * AwT
    
    if AwT < 0.5:
        AwT_score = base_score * penalty_function_AwT(AwT, alpha)
    else:
        AwT_score = base_score * reward_function_AwT(AwT, beta)
    
    if SoE < 0.5:
        final_score = AwT_score * penalty_function_SoE(SoE, gamma)
    else:
        final_score = AwT_score * reward_function_SoE(SoE, delta)
    
    return final_score

score00 = calculate_final_score(SoE=0,AwT=0)
score01 = calculate_final_score(SoE=0.1,AwT=0.1)
score02 = calculate_final_score(SoE=0.2,AwT=0.2)
score03 = calculate_final_score(SoE=0.3,AwT=0.3)
score04 = calculate_final_score(SoE=0.4,AwT=0.4)
score049 = calculate_final_score(SoE=0.49,AwT=0.49)
score05 = calculate_final_score(SoE=0.5,AwT=0.5)
score051 = calculate_final_score(SoE=0.51,AwT=0.51)
score06 = calculate_final_score(SoE=0.2,AwT=0.2)
score07 = calculate_final_score(SoE=0.3,AwT=0.3)
score08 = calculate_final_score(SoE=0.4,AwT=0.4)
score09 = calculate_final_score(SoE=0.5,AwT=0.5)
score1 = calculate_final_score(SoE=1,AwT=1)

print(score00)
print(score04 - score03)
print(score03 - score02)
print(score02 - score01)
print("the closer to 0 the stronger penalty")
print(score049, score05, score051, 'huge drop') 
print(score08 - score07)
print(score09 - score08)
print("the closer to 1 the stronger reward")
print(score1)


0.0
0.039878387368098844
0.06595764114738317
0.08660329345586115
the closer to 0 the stronger penalty
/n 0.3031426331155356 0.5 0.5308134948381182 /n
0.039878387368098844
0.20954084990611643
the closer to 1 the stronger reward
7.389056098930652


In [62]:
# Load your dataset
# dataset = pd.read_csv("path_to_your_dataset.csv")
# Assuming 'text' column contains abstracts and 'score' column contains labels

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



# Initializing tokenizer
tokenizer = BertTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

# Function to prepare DataLoader
def prepare_dataloader(data, batch_size=6, test=False):
    inputs = tokenizer(data["text"].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
    if test:
        randomized_train_y = randomize_scores(scores=train_y)
        labels = torch.tensor(randomized_train_y).float()
    labels = torch.tensor(data[[" AwT score", " SoE score"]].values).float()
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader

train_dataloader = prepare_dataloader(dataset)




class BertForRegression(nn.Module):
    def __init__(self, model_name, hidden_size=768):
        super(BertForRegression, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.regressor = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask, token_type_ids=None, return_embeddings=False):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        if return_embeddings:
            return pooled_output
        return self.regressor(pooled_output)
    


def train_model(train_dataloader, device, epochs = 8):
    model = BertForRegression("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5) # test value  # torch.optim.AdamW
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_labels = [item.to(device) for item in batch]
            optimizer.zero_grad()
            outputs = model(b_input_ids,b_input_mask)
            loss = criterion(outputs.squeeze(),b_labels)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        avg_train_loss = total_loss / len(train_dataloader)

        print(f"Epoch {epoch + 1}, Loss: {avg_train_loss}")

    return model

num_models = 2
models = [train_model(train_dataloader, device) for _ in range(num_models)]

def evaluate_models(models, test_dataloader, device):
    all_predictions = []
    for model in models:
        model.eval()
        predictions = []
        with torch.no_grad():
            for batch in test_dataloader:
                b_input_ids, b_input_mask, _ = [item.to(device) for item in batch]
                outputs = model(b_input_ids, b_input_mask)
                predictions.extend(outputs.squeeze().cpu().numpy())
        all_predictions.append(predictions)
    
    avg_predictions = np.mean(all_predictions, axis=0)
    return avg_predictions

test_dataloader = prepare_dataloader(dataset, batch_size=1, test=True) # Use your actual test data here

avg_predictions = evaluate_models(models, test_dataloader, device)
true_scores = dataset[[" AwT score", "SoE score"]].values
mse = ((avg_predictions - true_scores) ** 2).mean(axis=0)
print(f"Average MSE: {mse}")


param_distributions = {
    'alpha' : [expon(scale=1.0), uniform(0.1,1.9)],
    'beta' : [loguniform(1e-3,1e1),uniform(0.1,1.9)],
    'gamma' : [expon(scale=1.0),uniform(0.1,1.9)],
    'delta' : [uniform(0.1,1.9), expon(scale=1.0)]
}

random_search=RandomizedSearchCV(
    estimator=evaluate_hyperparameters(),
    param_distributions=param_distributions,
    n_iter=100,
    scoring=make_scorer(custom_scorer, greater_is_better= False),
    cv=5,
    random_state=42
)





KeyboardInterrupt: 

In [68]:
# Example data
# X_train, y_train should be your training data
# x_train: array of pairs (SoE, AwT)
# y_train: corresponding true scores

#random_search.fit(x_train,y_train)


[0.9252603  0.77776223 0.96740234 0.68590736 1.0221708  0.77937067
 1.0072647  0.7213042  0.9108105  0.8359132  0.8974997  0.66318786
 0.99135613 0.8875499  0.9612308  0.44953763 1.0182903  0.69183224
 1.0016801  0.57063335 0.9987792  0.78549373 1.0043609  0.83751607
 0.81779003 0.69042003 0.9553925  0.7575235  1.0299163  0.6484913 ]
[[0.9  0.2 ]
 [0.9  0.5 ]
 [0.95 1.  ]
 [0.95 1.  ]
 [0.95 0.55]
 [0.95 0.55]
 [0.95 0.6 ]
 [0.95 1.  ]
 [0.95 0.4 ]
 [0.95 0.6 ]
 [0.95 0.55]
 [0.95 1.  ]
 [0.75 0.4 ]
 [0.75 0.55]
 [0.85 0.5 ]]


In [63]:

# Assuming true_scores is already defined correctly
true_scores = dataset[[" AwT score", " SoE score"]].values

# Check the shape of avg_predictions
print(f"Shape of avg_predictions: {avg_predictions.shape}")
print(f"Shape of true_scores: {true_scores.shape}")

# If avg_predictions is not 2D, reshape it accordingly
if avg_predictions.ndim == 1:
    avg_predictions = avg_predictions.reshape(-1, 2)

# Ensure avg_predictions has the same number of samples as true_scores
if avg_predictions.shape[0] != true_scores.shape[0]:
    raise ValueError("Number of samples in predictions and true scores do not match.")

mse = ((avg_predictions - true_scores) ** 2).mean(axis=0)
print(f"Average MSE: {mse}")

NameError: name 'avg_predictions' is not defined