In [2]:
import pandas as pd
import numpy as np
import re
import glob
import os
import random

# Directory that is searched for *.html files below.
# Set the HTML_BODIES_DIR environment variable to point the notebook at a
# different location; when it is unset, the original local path is used.
# The default is a raw string so the backslashes are not read as escapes.
html_path = os.environ.get(
    "HTML_BODIES_DIR",
    r"C:\Users\Hubert\Documents\GitHub\researcher\python\results\infertility AND genetic variant\html_bodies",
)


def read_html(file_path):
    """Return the entire contents of ``file_path`` as a string, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as html_handle:
        contents = html_handle.read()
    return contents

def clean_text(text):
    """Convert an HTML fragment into a single line of plain text.

    Steps, in order:
      1. Comments and ``<script>``/``<style>`` elements are dropped together
         with their contents, so code and CSS never reach the output.
      2. Inline formatting tags (``<i>``, ``<sup>``, ``<span>``, ...) are
         removed without leaving a gap, so ``Ca<sup>2+</sup>`` stays ``Ca2+``.
      3. Every remaining tag is replaced by a space, so text from adjacent
         block elements (``<p>a</p><p>b</p>``) is not glued into one word.
      4. HTML entities such as ``&amp;`` and ``&nbsp;`` are decoded.
      5. Runs of whitespace are collapsed to one space and the ends trimmed.

    Args:
        text: HTML markup as a string.

    Returns:
        The plain-text content as a string with no leading or trailing
        whitespace and no consecutive whitespace characters.
    """
    # Function-scope import keeps the block self-contained; the module name is
    # only bound locally, so it cannot clash with notebook-level variables.
    import html

    # Comments first: they may contain '>' characters that would otherwise
    # end the generic tag pattern early and leave comment text behind.
    stripped = re.sub(r'<!--.*?-->', ' ', text, flags=re.DOTALL)

    # Script and style elements carry code, not prose; remove them whole.
    stripped = re.sub(
        r'<(script|style)\b[^>]*>.*?</\1\s*>',
        ' ',
        stripped,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # Inline tags sit inside words and phrases; removing them must not add a gap.
    stripped = re.sub(
        r'</?(?:a|abbr|b|bdi|bdo|cite|code|dfn|em|font|i|kbd|mark|q|s|samp|small'
        r'|span|strong|sub|sup|u|var)\b[^>]*>',
        '',
        stripped,
        flags=re.IGNORECASE,
    )

    # Any other tag separates pieces of text, so it becomes a space.
    stripped = re.sub(r'<[^>]*>', ' ', stripped)

    # Decode entities before collapsing whitespace so that a decoded
    # non-breaking space is normalised like any other whitespace.
    stripped = html.unescape(stripped)

    return re.sub(r'\s+', ' ', stripped).strip()

# Get list of all HTML files in the directory. glob makes no promise about
# the order of its results, so sort the paths to keep the pairing between
# texts and scores stable from run to run.
html_files = sorted(glob.glob(os.path.join(html_path, '*.html')))

# One hand-assigned score per file, listed in the same order as html_files.
# NOTE(review): confirm these values were assigned against sorted file names.
scores = [0.85, 0.78, 0.60, 0.75, 0.70, 0.90]

# Stop here with a readable message instead of letting a length mismatch
# surface later as an opaque DataFrame construction error.
if len(scores) != len(html_files):
    raise ValueError(
        f"Expected one score per HTML file, but found {len(html_files)} file(s) "
        f"in {html_path!r} and {len(scores)} score(s)."
    )

# Parallel lists: html_dict["text"][i] is the cleaned body of html_files[i]
# and html_dict["score"][i] is the score for that same file.
html_dict = {"text": [], "score": scores}

# Process each HTML file
for html_file in html_files:
    # Read the HTML file
    html_text = read_html(html_file)

    # Clean the text
    cleaned_text = clean_text(html_text)

    # Add the cleaned text to the list in the dictionary
    html_dict["text"].append(cleaned_text)

html_dict

# TODO: add a way to create or read a dictionary of scores.
# TODO: add a way to combine dictionaries.



{'text': ['Introduction Neuromuscular diseases comprise a spectrum of disorders affecting motor neurons in the spinal cord, sensory neurons in the dorsal root ganglia, peripheral nerves, neuromuscular junction and/or skeletal muscles. Cranial nerves (and their nuclei) as well as components of the vegetative system can be also affected. These diseases mainly compromise motricity and sensation and may be a consequence of many different causes including acquired and genetic factors. While for most of the acquired neuromuscular diseases a progressive number of therapies have been discovered and are now used in clinical practice, there is still a significant unmet need for the development of curative treatments for inherited and degenerative forms. This landscape is rapidly changing though, with therapies already available in clinical practice and others under evaluation in clinical trials or preclinical studies. The most compelling therapeutic approaches for inherited disorders can be deli

In [3]:
# Turn the parallel "text" and "score" lists of html_dict into a two-column table
dataset = pd.DataFrame(data=html_dict)

print(dataset)

                                                text  score
0  Introduction Neuromuscular diseases comprise a...   0.85
1  Introduction Genome sequencing (GS) is increas...   0.78
2  Introduction In mammals, ejaculated sperm migr...   0.60
3  WEDNESDAY, MAY 8 SESSION 1–TRAUMA Room: Potoma...   0.75
4  Introduction The development of germ cells and...   0.70
5  Background The issue of antibiotic resistance ...   0.90


In [4]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel, AdamW
import torch.nn as nn
from sklearn.model_selection import train_test_split

# Seed torch's random number generators so that batch shuffling and any
# randomly initialised layers created after this point are repeatable on a
# fresh "Restart & Run All".
SEED = 42
torch.manual_seed(SEED)

# Number of examples per batch for both the train and the test loader.
BATCH_SIZE = 6

# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Split dataset into train and test sets
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=SEED)

# Load PubMedBERT tokenizer
tokenizer = BertTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")


def _encode_texts(frame):
    """Tokenize the ``text`` column of ``frame`` into padded PyTorch tensors (at most 512 tokens each)."""
    return tokenizer(frame["text"].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")


def _score_tensor(frame):
    """Return the ``score`` column of ``frame`` as a float32 tensor on ``device``."""
    return torch.tensor(frame["score"].tolist(), dtype=torch.float32).to(device)


def _as_tensor_dataset(inputs, labels):
    """Bundle token ids, attention masks and labels into one TensorDataset on ``device``."""
    return TensorDataset(inputs['input_ids'].to(device), inputs['attention_mask'].to(device), labels)


# Tokenize input data
train_inputs = _encode_texts(train_data)
test_inputs = _encode_texts(test_data)

# Convert labels to tensor
train_labels = _score_tensor(train_data)
test_labels = _score_tensor(test_data)

# Create DataLoader for train and test sets
train_dataset = _as_tensor_dataset(train_inputs, train_labels)
test_dataset = _as_tensor_dataset(test_inputs, test_labels)

# Only the training batches are shuffled; the test loader keeps dataset order
# so predictions line up with test_labels.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Define the regression model
class BertForRegression(nn.Module):
    """BERT encoder followed by a single linear unit that outputs one value per input.

    Args:
        model_name: Identifier handed to ``BertModel.from_pretrained``.
        hidden_size: Input width of the linear regression head. It must equal
            the width of the encoder's pooled output; the default is 768.
    """

    def __init__(self, model_name, hidden_size=768):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.regressor = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids=None, return_embeddings=False):
        """Run the encoder and return either its pooled output or the regression output.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            token_type_ids: Optional segment ids. They are passed to the
                encoder as given; ``None`` leaves the choice to the encoder.
            return_embeddings: When true, skip the regression head and return
                the encoder's ``pooler_output``.

        Returns:
            ``pooler_output`` of the encoder when ``return_embeddings`` is
            true, otherwise the result of the linear head applied to it
            (last dimension of size 1).
        """
        # token_type_ids was previously accepted but never used; pass it on
        # so callers that supply segment ids actually influence the encoder.
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        pooled_output = outputs.pooler_output
        if return_embeddings:
            return pooled_output  # Return embeddings directly
        return self.regressor(pooled_output)

# Initialize the model
model = BertForRegression("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
model.to(device)  # Move the model to the selected device (CUDA when available, otherwise CPU)

# Define optimizer and loss function.
# torch.optim.AdamW is used instead of the AdamW class exported by
# transformers, which is deprecated there. eps and weight_decay are set
# explicitly to the values that class used by default, so the
# hyper-parameters of the update rule are unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
criterion = nn.MSELoss()

# Training the model
epochs = 16

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        outputs = model(b_input_ids, b_input_mask)
        # squeeze(-1) removes only the trailing size-1 output dimension, so the
        # predictions stay one-dimensional and match b_labels even when a
        # batch holds a single example.
        loss = criterion(outputs.squeeze(-1), b_labels.float())
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Mean of the per-batch losses for this epoch
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Loss: {avg_train_loss}")

# Evaluate the fine-tuned model on the test set
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        b_input_ids, b_input_mask, _ = batch
        outputs = model(b_input_ids, b_input_mask)
        # A plain squeeze() would turn a one-example batch into a 0-d array,
        # which extend() cannot iterate over.
        predictions.extend(outputs.squeeze(-1).cpu().numpy())

# Calculate Mean Squared Error
mse = ((np.asarray(predictions) - test_labels.cpu().numpy()) ** 2).mean()
print(f"MSE: {mse}")

# Save the fine-tuned model (weights only, as a state dict)
torch.save(model.state_dict(), "finetuned_PMBERT_regression.pth")






Epoch 1, Loss: 0.5638186931610107
Epoch 2, Loss: 0.08492625504732132
Epoch 3, Loss: 0.006384187377989292
Epoch 4, Loss: 0.046463172882795334
Epoch 5, Loss: 0.08020128309726715
Epoch 6, Loss: 0.032187849283218384
Epoch 7, Loss: 0.001791244838386774
Epoch 8, Loss: 0.0121929831802845
Epoch 9, Loss: 0.027144072577357292
Epoch 10, Loss: 0.021686838939785957
Epoch 11, Loss: 0.004157875664532185
Epoch 12, Loss: 0.001009250758215785
Epoch 13, Loss: 0.0013823489425703883
Epoch 14, Loss: 0.003959754481911659
Epoch 15, Loss: 0.007111010141670704
Epoch 16, Loss: 0.009410269558429718
MSE: 0.017056215554475784


<torch.utils.data.dataloader.DataLoader at 0x12f4977a850>