In [3]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import pyarrow.ipc as ipc
from datasets import Dataset

# Load the Med-BERT model and tokenizer
model_name = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Load the dataset from the saved arrow file
dataset = Dataset.from_file('/workspaces/NLP-Polimi-Project/Practice Models/Erfan/Data/train/data-00000-of-00001.arrow')

# Convert to Pandas DataFrame for analysis
df = dataset.to_pandas()

print(f"Dataset contains {len(df)} documents.")
print(df.head())


# Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Tokenize the questions
tokenized_train_questions = train_data['question'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
tokenized_test_questions = test_data['question'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

# Generate answers using Med-BERT for train and test sets
generated_train_answers = []
generated_test_answers = []

for tokens in tokenized_train_questions:
    input_ids = torch.tensor([tokens])
    with torch.no_grad():
        outputs = model(input_ids)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        generated_train_answers.append(embeddings.tolist())

for tokens in tokenized_test_questions:
    input_ids = torch.tensor([tokens])
    with torch.no_grad():
        outputs = model(input_ids)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        generated_test_answers.append(embeddings.tolist())

# Calculate cosine similarity between generated answers and ground truth answers for train and test sets
ground_truth_train_answers = train_data['answer']
ground_truth_test_answers = test_data['answer']

cosine_similarities_train = []
cosine_similarities_test = []

for generated_answer, ground_truth_answer in zip(generated_train_answers, ground_truth_train_answers):
    cosine_similarities_train.append(cosine_similarity([generated_answer], [ground_truth_answer])[0][0])

for generated_answer, ground_truth_answer in zip(generated_test_answers, ground_truth_test_answers):
    cosine_similarities_test.append(cosine_similarity([generated_answer], [ground_truth_answer])[0][0])

# Other performance metrics for train and test sets
# Add your own metrics here

# Plotting for train and test sets
plt.figure(figsize=(8, 4))

plt.subplot(1, 2, 1)
plt.hist(cosine_similarities_train, bins=10)
plt.xlabel('Cosine Similarity')
plt.ylabel('Frequency')
plt.title('Distribution of Cosine Similarity (Train)')

plt.subplot(1, 2, 2)
plt.hist(cosine_similarities_test, bins=10)
plt.xlabel('Cosine Similarity')
plt.ylabel('Frequency')
plt.title('Distribution of Cosine Similarity (Test)')

plt.tight_layout()
plt.show()

# Add other plots here for train and test sets

NameError: name 'load_dataset' is not defined

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('dataset.csv')

# Load the Med-BERT pretrained model
model = SentenceTransformer('med-bert-base-uncased')

# Encode the questions using the Med-BERT model
question_embeddings = model.encode(df['question'].tolist())

# Generate answers using the Med-BERT model
generated_answers = []
for question_embedding in question_embeddings:
    # TODO: Use the Med-BERT model to generate an answer for each question
    generated_answers.append(generated_answer)

# Calculate cosine similarity between generated answers and actual answers
cosine_similarities = cosine_similarity(question_embeddings, model.encode(df['answer'].tolist()))

# Measure the performance of the model using other methods
# TODO: Add other performance measurement methods here

# Plotting
# Plot the cosine similarities
plt.hist(cosine_similarities.flatten(), bins=10)
plt.xlabel('Cosine Similarity')
plt.ylabel('Frequency')
plt.title('Distribution of Cosine Similarities')
plt.show()

# TODO: Add other plots to showcase important aspects of the model's performance

In [4]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Function to generate answer using BERT Q&A model
def generate_answer(question, context):
    # Tokenize the question and context
    inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors='pt')

    # Get the input IDs and attention mask
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Generate answer using the BERT Q&A model
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

    # Find the start and end indices of the answer span
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)

    # Convert the token indices to actual tokens
    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze())
    answer = tokenizer.convert_tokens_to_string(tokens[start_index:end_index+1])

    return answer

# Example usage
context = "BERT (Bidirectional Encoder Representations from Transformers) is a pre-trained model for natural language processing."
question = "What is BERT?"
answer = generate_answer(question, context)
print(answer)

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'