In [44]:
import torch
from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader,TensorDataset
import re
import numpy as np

# Define the regression model class (as you did before)
class BertForRegression(torch.nn.Module):
    def __init__(self, model_name, hidden_size=768):
        super(BertForRegression, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.regressor = torch.nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids = None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        regression_output = self.regressor(pooled_output)
        return regression_output

def clean_text(text):
    # Remove everything between angle brackets
    clean_text = re.sub(r'<[^>]*>', '', text)
    # Remove newline and other extra whitespace characters
    clean_text = clean_text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').strip()
    # Replace multiple spaces with a single space
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text

# Initialize the model with the same model name used during training
model = BertForRegression("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

# Load the state dictionary
model.load_state_dict(torch.load(r"C:\Users\Hubert\Documents\GitHub\researcher\Model\finetuned_PMBERT_regression.pth"))

# If the model was trained on a GPU and you're loading on CPU, use:
# model.load_state_dict(torch.load("finetuned_PMBERT_regression.pth", map_location=torch.device('cpu')))

# Set the model to evaluation mode if you're making predictions
model.eval()

# Step 1: Read the text file
with open(r'C:\Users\Hubert\Documents\GitHub\researcher\Model\data\new_data.txt', 'r', encoding='utf-8') as file:
    text = file.read()

text = clean_text(text=text)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

# Step 2: Tokenize the text
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Step 3: Convert to DataLoader
dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'])
dataloader = DataLoader(dataset, batch_size=1)  # Batch size of 1 since we're predicting one text

# Step 4: Make Predictions
model.eval()  # Ensure the model is in eval mode
predictions = []
with torch.no_grad():
    for batch in dataloader:
        b_input_ids, b_input_mask = batch
        outputs = model(b_input_ids, b_input_mask)
        predictions.append(outputs.squeeze().item())

# Step 5: Interpret Results
# If you are performing regression, the output is already in `predictions`
# If you need to classify based on a threshold:
threshold = 0.7  # Example threshold
classified_outputs = ["High" if pred >= threshold else "Low" for pred in predictions]

print(classified_outputs)
print(predictions)



['High']
[0.8080760836601257]


In [42]:

import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Assuming the tokenizer is for the same model
tokenizer = BertTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

def get_embeddings(text, model):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Use the model to get embeddings
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient computation
        embeddings = model(input_ids=input_ids, attention_mask=attention_mask, return_embeddings=True)
    
    return embeddings.detach().numpy()  # Convert PyTorch tensor to NumPy array for further processing


def extractive_summarization(text, num_sentences=90):
    sentences = sent_tokenize(text)
    sentence_embeddings = np.vstack([get_embeddings(sent)[0].mean(axis=0) for sent in sentences])
    # Clustering sentences
    num_clusters = min(num_sentences, len(sentences))  # Ensuring we don't exceed number of sentences
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(sentence_embeddings)
    centroids = kmeans.cluster_centers_

    # Selecting one sentence per cluster (closest to centroid)
    summarized_sentences = []
    for centroid in centroids:
        similarities = cosine_similarity([centroid], sentence_embeddings)
        best_sentence = np.argmax(similarities)
        summarized_sentences.append(sentences[best_sentence])

    return " ".join(summarized_sentences)

# Example usage
summary = extractive_summarization(text)
print(summary)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hubert\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Thalassemia is a hemoglobinopathy caused by variations (including mutations, deletions, duplications, and gene rearrangements) in alpha (α)- and beta (β)-globin gene clusters that disrupt the balance of synthesis between the α- and β-globin chains which compose hemoglobin [2]. Thalassemia is a hemoglobinopathy caused by variations (including mutations, deletions, duplications, and gene rearrangements) in alpha (α)- and beta (β)-globin gene clusters that disrupt the balance of synthesis between the α- and β-globin chains which compose hemoglobin [2]. Thalassemia is a hemoglobinopathy caused by variations (including mutations, deletions, duplications, and gene rearrangements) in alpha (α)- and beta (β)-globin gene clusters that disrupt the balance of synthesis between the α- and β-globin chains which compose hemoglobin [2]. Thalassemia is a hemoglobinopathy caused by variations (including mutations, deletions, duplications, and gene rearrangements) in alpha (α)- and beta (β)-globin gene 



In [46]:
get_predictions(text)


array([[0.52080613]], dtype=float32)