In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel, AdamW
import torch.nn as nn
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from scipy.stats import expon, loguniform, uniform
import os
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_data_from_csv(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns:
        The ``pandas.DataFrame`` parsed from the file.
    """
    return pd.read_csv(file_path)

# Build the CSV path from separate components so this cell works on both
# Windows and POSIX; a literal backslash is not a path separator on POSIX.
cwd = os.getcwd()
file_path = os.path.join(cwd, "data", "initial_training_data", "test.csv")
dataset = load_data_from_csv(file_path)
dataset

Unnamed: 0,text,AwT_score,SoE_score
0,Nutritional status and gene polymorphisms of o...,0.168196,0.655265
1,Thrombophilic gene polymorphism is known to be...,0.290110,0.833650
2,Whether adiponectin (ADIPOQ) polymorphisms are...,0.600797,0.998080
3,"Polycystic ovary syndrome (PCOS) is a common, ...",0.007601,0.353349
4,Endometriosis has been considered an epigeneti...,0.796838,0.387923
...,...,...,...
60,Histone deacetylase (HDAC) serves as a critica...,0.100000,0.050000
61,As one of the most prevalent chronic inflammat...,0.150000,0.050000
62,Human hemoglobin of G-Makassar and hemoglobin ...,0.500000,0.100000
63,"For many diseases, and cancer in particular, e...",0.100000,0.050000


In [4]:
def randomize_scores(scores, max_deviation= 0.05, rng=None):
    """Apply multiplicative uniform noise to scores and clip the result to [0, 1].

    Each score is multiplied by ``1 + u`` where ``u`` is drawn uniformly from
    ``[-max_deviation, max_deviation)``. Because the noise is multiplicative,
    a score of exactly 0 stays 0.

    Args:
        scores: Array-like of scores. Plain sequences are converted to a float
            ndarray; objects that already expose ``.shape`` are used as-is.
        max_deviation: Largest relative perturbation applied to a score.
        rng: Optional ``numpy.random.Generator`` (or ``RandomState``) used to
            draw the noise. When ``None``, NumPy's global RNG is used, so
            ``np.random.seed`` still controls the result.

    Returns:
        The perturbed scores, clipped to the closed interval [0, 1].
    """
    # Plain lists/tuples have no ``.shape``; convert them so they can be used too.
    if not hasattr(scores, "shape"):
        scores = np.asarray(scores, dtype=float)
    sampler = np.random if rng is None else rng
    noise = sampler.uniform(-max_deviation, max_deviation, size=scores.shape)
    randomized_scores = scores * (1 + noise)
    return np.clip(randomized_scores, 0, 1)

# Two-column NumPy array holding the AwT and SoE targets, one row per dataset row.
train_y = dataset[['AwT_score', 'SoE_score']].to_numpy()


In [None]:
# def penalty_function_AwT(AwT, alpha=2, epsilon=1e-6):
#     return torch.exp(-(AwT + epsilon)**alpha)

# def reward_function_AwT(AwT, beta=2):
#     return (torch.exp(AwT - 0.5))**beta

# def penalty_function_SoE(SoE, gamma=2, epsilon=1e-6):
#     return torch.exp(-(SoE + epsilon)**gamma)

# def reward_function_SoE(SoE, delta=2):
#     return (torch.exp(SoE - 0.5))**delta

# def calculate_final_score(SoE, AwT, w_SoE=0.4, w_AwT=0.6, alpha=2, beta=2, gamma=2, delta=2):
#     base_score = w_SoE * SoE + w_AwT * AwT

#     AwT_score = torch.where(AwT < 0.5, base_score * penalty_function_AwT(AwT, alpha), base_score * reward_function_AwT(AwT, beta))
#     final_score = torch.where(SoE < 0.5, AwT_score * penalty_function_SoE(SoE, gamma), AwT_score * reward_function_SoE(SoE, delta))
    
#     return final_score

# def custom_scorer(y_true, y_pred):
#     scores = []
#     for true, pred in zip(y_true, y_pred):
#         score = calculate_final_score(true[1], true[0])  # Assuming y_true contains [AwT, SoE]
#         scores.append(score)
#     return mean_squared_error(y_true, scores)

# def evaluate_hyperparameters(alpha, beta, gamma, delta):
#     def model_evaluation(SoE, AwT):
#         return calculate_final_score(SoE, AwT, alpha=alpha, beta=beta, gamma=gamma, delta=delta)
#     return model_evaluation

In [5]:
# Load your dataset
# dataset = pd.read_csv("path_to_your_dataset.csv")
# Assumes the 'text' column contains the abstracts and the 'AwT_score' / 'SoE_score' columns contain the regression targets

# def custom_loss(predictions, targets, w_SoE=0.4, w_AwT=0.6, alpha=2, beta=2, gamma=2, delta=2):
#     mse_loss = F.mse_loss(predictions, targets)
    
#     AwT = targets[:, 0]
#     SoE = targets[:, 1]
#     final_scores = calculate_final_score(SoE, AwT, w_SoE, w_AwT, alpha, beta, gamma, delta)
    
#     reward_punishment_term = torch.tensor(final_scores, dtype=torch.float32, device=predictions.device)
    
#     # Integrate the reward/punishment term with the MSE loss
#     total_loss = mse_loss - reward_punishment_term.mean()
    
#     return total_loss

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed NumPy's global RNG so the label noise is reproducible across kernel
# restarts; the train/test split below is already seeded via random_state.
np.random.seed(42)
scores_to_randomize = dataset[["AwT_score", "SoE_score"]].values
randomized_scores = randomize_scores(scores_to_randomize)

# NOTE: this overwrites the score columns of `dataset` in place, so re-running
# the cell without reloading the CSV perturbs already-perturbed scores.
# NOTE(review): the noise is applied before the split, so the held-out labels
# are perturbed as well - confirm this is intended.
dataset[["AwT_score", "SoE_score"]] = randomized_scores
train_data, test_data = train_test_split(dataset, test_size=0.4, random_state=42)

# Tokenizer matching the PubMedBERT checkpoint that the regression models load.
tokenizer = BertTokenizer.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
)

# Function to prepare DataLoader
def prepare_dataloader(data, batch_size=4, test=False):
    """Tokenize the abstracts in ``data`` and wrap them in a DataLoader.

    Args:
        data: DataFrame with a ``text`` column plus ``AwT_score`` and
            ``SoE_score`` target columns.
        batch_size: Number of rows per batch.
        test: When True the batches keep the row order of ``data``; when False
            the loader shuffles.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, attention_mask, labels)``
        batches, with ``labels`` as float32.
    """
    # Uses the module-level `tokenizer`; sequences are padded to a common
    # length and truncated at 512 tokens.
    encoded = tokenizer(
        data["text"].tolist(),
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    targets = torch.tensor(data[["AwT_score", "SoE_score"]].values).float()
    tensors = TensorDataset(encoded['input_ids'], encoded['attention_mask'], targets)
    shuffle_batches = not test
    return DataLoader(tensors, batch_size=batch_size, shuffle=shuffle_batches)

# Training batches are shuffled; evaluation batches keep the row order of test_data.
train_dataloader = prepare_dataloader(
    train_data,
    batch_size=15,
)
test_dataloader = prepare_dataloader(
    test_data,
    batch_size=15,
    test=True,
)

class BertForRegression(nn.Module):
    """BERT encoder followed by a linear head that emits two regression values."""

    def __init__(self, model_name, hidden_size=768):
        """Load the pretrained encoder and create the regression head.

        Args:
            model_name: Checkpoint name or path passed to
                ``BertModel.from_pretrained``.
            hidden_size: Input width of the linear head; it has to match the
                width of the encoder's pooled output.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.regressor = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask, token_type_ids=None, return_embeddings=False):
        """Run the encoder and return predictions or pooled embeddings.

        Args:
            input_ids: Token id tensor for the batch.
            attention_mask: Mask tensor matching ``input_ids``.
            token_type_ids: Optional segment ids. They are forwarded to the
                encoder; previously this argument was accepted but ignored.
            return_embeddings: When True, return the encoder's pooled output
                instead of the regression output.

        Returns:
            The pooled encoder output when ``return_embeddings`` is True,
            otherwise the output of the two-unit linear head.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        pooled_output = outputs.pooler_output
        if return_embeddings:
            return pooled_output
        return self.regressor(pooled_output)
    
def save_model(model, path):
    """Write the model's ``state_dict`` to ``path`` using ``torch.save``.

    Args:
        model: Module whose parameters and buffers are serialized.
        path: Destination file for the checkpoint.
    """
    weights = model.state_dict()
    torch.save(weights, path)

def train_model(train_dataloader, device, epochs = 12, model_index = 0):
    """Fine-tune one BertForRegression model, checkpointing after every epoch.

    Args:
        train_dataloader: Iterable of ``(input_ids, attention_mask, labels)``
            tensor batches; it must support ``len()``.
        device: ``torch.device`` the model and every batch are moved to.
        epochs: Number of full passes over ``train_dataloader``.
        model_index: Used only to build the checkpoint file name
            ``trained_model_{model_index}.pt``.

    Returns:
        The trained model, still on ``device``, with its gradient buffers
        released.
    """
    model = BertForRegression("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
    model.to(device)
    # torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
    # weight_decay are spelled out to keep the values the transformers
    # implementation used by default (torch's own defaults differ).
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            # Each batch is (token ids, attention mask, target scores).
            b_input_ids, b_input_mask, b_labels = [item.to(device) for item in batch]
            optimizer.zero_grad()
            # Forward pass: one pair of predicted scores per row of the batch.
            outputs = model(b_input_ids, b_input_mask)
            # Plain mean-squared error between predictions and targets.
            loss = criterion(outputs, b_labels)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}, Loss: {avg_train_loss}")
        # Overwrites the same file every epoch, so it always holds the latest weights.
        save_model(model, f"trained_model_{model_index}.pt")

    # Drop the parameters' .grad tensors: they are not needed for inference and
    # would otherwise stay allocated for every model kept in the ensemble.
    optimizer.zero_grad(set_to_none=True)
    return model

# Train an ensemble of independently initialised regressors; each one writes
# its checkpoint to trained_model_<index>.pt.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
num_models = 4
models = [
    train_model(train_dataloader, device, model_index=idx)
    for idx in range(num_models)
]



def evaluate_models(models, test_dataloader, device):
    """Run every model over the test batches and average their predictions.

    Args:
        models: Non-empty iterable of modules callable as
            ``model(input_ids, attention_mask)``.
        test_dataloader: Iterable of ``(input_ids, attention_mask, labels)``
            batches. It is iterated once per model and must yield the rows in
            the same order each time for the average to be meaningful.
        device: ``torch.device`` the models and batches are moved to.

    Returns:
        A tuple ``(avg_predictions, all_predictions, true_scores)``:
        the element-wise mean of the per-model predictions, the list of
        per-model prediction lists, and the labels as an array with one row
        per test row.

    Raises:
        ValueError: If ``models`` is empty.
    """
    models = list(models)
    if not models:
        raise ValueError("evaluate_models requires at least one model")

    all_predictions = []
    true_scores = []
    for model_idx, model in enumerate(models):
        # Make sure the model lives on the same device as the batches.
        model.to(device)
        model.eval()
        predictions = []
        with torch.no_grad():
            for batch in test_dataloader:
                b_input_ids, b_input_mask, b_labels = [item.to(device) for item in batch]
                outputs = model(b_input_ids, b_input_mask)
                predictions.extend(outputs.cpu().numpy())
                # Labels are identical for every model, so collect them once;
                # otherwise they would be repeated len(models) times.
                if model_idx == 0:
                    true_scores.extend(b_labels.cpu().numpy())
        all_predictions.append(predictions)

    avg_predictions = np.mean(all_predictions, axis=0)
    true_scores = np.array(true_scores)
    return avg_predictions, all_predictions, true_scores


# Score the held-out batches with the whole ensemble.
(avg_predictions,
 all_predictions,
 true_scores) = evaluate_models(models, test_dataloader, device)



OutOfMemoryError: CUDA out of memory. Tried to allocate 180.00 MiB. GPU 

In [None]:

# Stand-alone ensemble evaluation; needs `models`, `test_dataloader` and
# `device` to exist in the kernel already.
(avg_predictions,
 all_predictions,
 true_scores) = evaluate_models(models, test_dataloader, device)

In [None]:
# Ground-truth targets for the held-out rows, taken straight from test_data so
# that they are (a) selected by column name rather than by position and
# (b) in the same row order as test_dataloader, which is built from test_data
# without shuffling. Re-deriving them from `dataset` would give the rows in
# original index order, which does not match the order of the predictions.
# The variable keeps its historical name even though it holds test-set scores.
train_scores_true = test_data[["AwT_score", "SoE_score"]]
print(train_scores_true)
# Check the shape of avg_predictions
print(f"Shape of avg_predictions: {avg_predictions.shape}")
print(f"Shape of true_scores: {train_scores_true.shape}")

# Per-target mean squared error, ordered as [AwT_score, SoE_score].
mse = ((avg_predictions - train_scores_true.to_numpy()) ** 2).mean(axis=0)
print(f"Average MSE: {mse}")

In [None]:
# Show the held-out rows followed by the ensemble-averaged predictions.
print(test_data, avg_predictions, sep="\n")

In [None]:
import argparse
import os
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, AdamW
from modules.Classifier import DataPreparator
from modules.Classifier import BertForRegression
from modules.Classifier import ModelTrainer

def main():
    """Train an ensemble on the CSV dataset and print its per-target test MSE.

    Loads ``data/initial_training_data/test.csv`` relative to the current
    working directory, perturbs the two score columns, splits 80/20, trains
    ``num_models`` regressors through ``ModelTrainer`` and prints the mean
    squared error of the averaged predictions on the held-out rows.
    """
    # parser = argparse.ArgumentParser(description='Trains model on training dataset containing abstracts, AwT and SoE scores as a csv file')
    # parser.add_argument('file_path', help='Path to the folder containing training dataset')
    # args = parser.parse_args()
    # file_path = args.file_path

    # Path components are joined separately so the script also runs on POSIX.
    cwd = os.getcwd()
    file_path = os.path.join(cwd, "data", "initial_training_data", "test.csv")
    dataset = DataPreparator.load_data_from_csv(file_path)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    scores_to_randomize = dataset[["AwT_score", "SoE_score"]].values
    randomized_scores = DataPreparator.randomize_scores(scores_to_randomize)
    dataset[["AwT_score", "SoE_score"]] = randomized_scores
    train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

    train_dataloader = DataPreparator.prepare_dataloader(train_data, batch_size=15)
    test_dataloader = DataPreparator.prepare_dataloader(test_data, batch_size=15, test=True)

    num_models = 5
    models = [ModelTrainer.train_model(train_dataloader, device, model_index=i) for i in range(num_models)]

    avg_predictions, all_predictions, true_scores = ModelTrainer.evaluate_models(models, test_dataloader, device)

    # Take the ground truth from test_data by column name. Re-deriving it from
    # `dataset` yields rows in original index order, which differs from the
    # shuffled order produced by train_test_split.
    # NOTE(review): assumes DataPreparator.prepare_dataloader(test=True) keeps
    # the row order of test_data - confirm against its implementation.
    test_scores_true = test_data[["AwT_score", "SoE_score"]]
    print(test_scores_true)
    # Check the shape of avg_predictions
    print(f"Shape of avg_predictions: {avg_predictions.shape}")
    print(f"Shape of true_scores: {test_scores_true.shape}")

    # Per-target mean squared error, ordered as [AwT_score, SoE_score].
    mse = ((avg_predictions - test_scores_true.to_numpy()) ** 2).mean(axis=0)
    print(f"Average MSE: {mse}")


# Run the pipeline only when this code is executed as the top-level module.
if __name__ == "__main__":
    main()


