In [1]:
!pip install transformers datasets torch pandas scikit-learn openpyxl accelerate -U

import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

# Load the Excel file from the local directory
file_path = r"C:\Users\Michael\Downloads\BERTmodel\final_comments.xlsx"  # Update with your local path
data = pd.read_excel(file_path)

# Use the correct column names based on the inspection
correct_comment_column = 'careprovidercomments'
mistake_score_column = 'Medical_Mistakes'
bert_mistake_column = 'BERT_MistakeMedical'

# Work with the first 1000 rows
data = data.iloc[:1000]

# Drop rows with NaN values in the mistake score column
data = data.dropna(subset=[mistake_score_column])

# Print unique values in the mistake score column after dropping NaNs
print("Unique values in the mistake score column after dropping NaNs:", data[mistake_score_column].unique())

# Function to clean punctuation
def clean_text(text):
    if isinstance(text, str):
        # Remove all punctuation except periods
        text = re.sub(r'[^\w\s.]', '', text)
    else:
        text = ''
    return text

# Clean the comments
data['cleaned_comment'] = data[correct_comment_column].apply(clean_text)

# Ensure mistake scores are 0 and 1
def ensure_mistake_scores(data, column):
    if not data[column].between(0, 1).all():
        raise ValueError("Mistake scores must be 0 or 1.")

ensure_mistake_scores(data, mistake_score_column)


# Split the data into training and testing sets (80/20 split)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42,stratify = data[mistake_score_column])

# Print the sizes of the training and testing sets
print(f"Training set size: {len(train_data)} comments")
print(f"Test set size: {len(test_data)} comments")

# Free up memory by deleting the original dataframe
del data

# Convert the data to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

# Free up memory by deleting the Pandas dataframes
del train_data, test_data

# Initialize tokenizer with smaller max_length
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['cleaned_comment'], padding='max_length', truncation=True, max_length=256)

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

def format_labels(examples):
    examples["labels"] = [int(label) for label in examples[mistake_score_column]]
    return examples

tokenized_datasets = tokenized_datasets.map(format_labels, batched=True)

# Free up memory by deleting the original datasets
del dataset_dict

# Print tokenized dataset sizes for debugging
print(f"Tokenized training set size: {len(tokenized_datasets['train'])} comments")
print(f"Tokenized test set size: {len(tokenized_datasets['test'])} comments")

# Load the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training arguments with reduced batch sizes
training_args = TrainingArguments(
    output_dir=r"C:\Users\Michael\Downloads\BERT_MODEL_TRAINING_RESULTS-20240701T162037Z-002",  # Update with your local path
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    logging_strategy="steps",
    logging_steps=50,
)

# Define evaluation metrics
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='macro')
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)

# Print expected number of steps per epoch
expected_steps_per_epoch = len(tokenized_datasets['train']) // training_args.per_device_train_batch_size
print(f"Expected number of steps per epoch: {expected_steps_per_epoch}")

# Train the model
trainer.train()

# Unified predict function
def predict(texts, model, tokenizer, batch_size=4):
    model.eval()
    if not isinstance(texts, list):
        texts = texts.tolist()

    predictions = []
    probabilities = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors='pt')

        with torch.no_grad():
            outputs = model(**encoded_inputs)

        logits = outputs.logits
        batch_probabilities = torch.nn.functional.softmax(logits, dim=-1)
        batch_predictions = torch.argmax(batch_probabilities, dim=-1)

        predictions.extend(batch_predictions.cpu().numpy())
        probabilities.extend(batch_probabilities.cpu().numpy())

    return np.array(predictions), np.array(probabilities)

def evaluate_model(test_dataset, model, tokenizer):
    texts = test_dataset['cleaned_comment']
    true_labels = test_dataset[mistake_score_column].tolist()

    predictions, probabilities = predict(texts, model, tokenizer)

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')
    accuracy = accuracy_score(true_labels, predictions)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return predictions, probabilities

# Convert the test dataset to a pandas DataFrame
test_data = tokenized_datasets['test'].to_pandas()

# Evaluate the model
predictions, probabilities = evaluate_model(test_data, model, tokenizer)

# Inspect the predictions and their probabilities

for i in range(len(test_data)):
    print(f"Text: {test_data['cleaned_comment'].iloc[i]}")
    print(f"True Label: {test_data[mistake_score_column].iloc[i]}")
    print(f"Predicted Label: {predictions[i]}")
    print(f"Probabilities: {probabilities[i]}")
    print()

# Re-load the full dataset including comments without mistake scores
full_data = pd.read_excel(file_path)

# Clean the comments
full_data['cleaned_comment'] = full_data[correct_comment_column].apply(clean_text)

# Get predictions
comments_to_predict = full_data['cleaned_comment'].tolist()
predicted_labels, predicted_probabilities = predict(comments_to_predict, model, tokenizer)

# Add predicted labels to the data in the BERT_Mistake column
full_data[bert_mistake_column] = predicted_labels

# List of columns to keep
columns_to_keep =['ID', 'unidentifiableid', 'Combined_Sentiment', 'Combined_Wait',
       'Updated_Wait',
       'BERT_Sentiment', 'BERT_Wait', 'BERT_MistakeMedical', 'careprovidercomments',
       'Medical_Mistakes', 'Clerical_Mistakes', 'Communication_Mistakes']
full_data = full_data[columns_to_keep]

# Save the full data with predictions back to the original Excel file
full_data.to_excel(file_path, index=False)  # Overwrite the original file

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ----------------------- ---------------- 6.6/11.0 MB 33.5 MB/s eta 0:00:01
   ---------------------------------------- 11.0/11.0 MB 31.2 MB/s eta 0:00:00
Installing collected packages: scikit-learn, datasets
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.5.1
    Uninstalling scikit-learn-1.5.1:
      Successfully uninstalled scikit-learn-1.5.1
  Attempting uninstall: datasets
    Found existing installation: datasets 3.0.0
    Uninstalling datasets-3.0.0:
      Successfully uninstalled datasets-3.0.0
Successfully installed datasets-3.0.0 scikit-learn-1.5.2
Unique values in the mi



Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenized training set size: 800 comments
Tokenized test set size: 200 comments


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Expected number of steps per epoch: 400


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1311,0.122913,0.98,0.494949,0.49,0.5
2,0.0002,0.141444,0.98,0.494949,0.49,0.5
3,0.0001,0.209911,0.965,0.672821,0.637676,0.737245


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.9700
Precision: 0.6615
Recall: 0.7398
F1 Score: 0.6923
Text: The Fellow that that work with me had some nice phrases not technical to discribe my vision that I didnt write down and now I cant remember them.  If he could call 801.366.7413 or just write them to me ralph.morrisonurs.org I would be grateful. Thanks.
True Label: 0.0
Predicted Label: 0
Probabilities: [9.9987483e-01 1.2514905e-04]

Text: Dr. Shapiro is great
True Label: 0.0
Predicted Label: 0
Probabilities: [9.999342e-01 6.574014e-05]

Text: because the appointment was quite openthe physician visit included an undergraduate from another institutionI did not feel that I had an opportunity to discuss sensitive issues.  I would at least like to have been asked about the shadowingI know this is a teaching hospital and I expect some  of that but this was not even someone from the U
True Label: 0.0
Predicted Label: 0
Probabilities: [9.999225e-01 7.745231e-05]

Text: Dr. Shprecher was very patient thorough and listened i