In [1]:
!pip install transformers datasets torch pandas scikit-learn openpyxl accelerate -U

import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

# Load the labelled comments workbook from the local directory.
# NOTE(review): this is an absolute, user-specific path; the notebook fails with
# FileNotFoundError on any machine where it does not exist.
file_path = r"C:\Users\Michael\Downloads\BERTmodel\comments.xlsx"  # Update with your local path
data = pd.read_excel(file_path)

# Column names used throughout the notebook.
correct_comment_column = 'careprovidercomments'   # free-text comment
sentiment_score_column = 'Combined_Sentiment'     # label column used for training
bert_sentiment_column = 'BERT_Sentiment'          # column that receives model predictions

# Work with the first 1000 rows and keep only those that carry a sentiment label.
# The explicit .copy() makes `data` an independent frame, so the column
# assignments performed later in the notebook do not operate on a slice of the
# original frame (which is what triggers pandas' SettingWithCopyWarning).
data = data.iloc[:1000].dropna(subset=[sentiment_score_column]).copy()

# Print unique values in the sentiment score column after dropping NaNs
print("Unique values in the sentiment score column after dropping NaNs:", data[sentiment_score_column].unique())

# Scale sentiment scores: 1-2 as 1, 3 as 2, 4-5 as 3
def scale_sentiment_score(score):
    """Collapse a 1-5 sentiment score into one of three buckets.

    Args:
        score: The raw sentiment score.

    Returns:
        1 when ``score`` equals 1 or 2, 2 when it equals 3, 3 when it equals
        4 or 5, and ``None`` for any other value.
    """
    # Each entry pairs a bucket with the raw scores that map onto it.
    buckets = (
        (1, (1, 2)),
        (2, (3,)),
        (3, (4, 5)),
    )
    for bucket, raw_scores in buckets:
        if score in raw_scores:
            return bucket
    # Anything outside 1-5 (including non-integer values) has no bucket.
    return None

# Replace every raw score with its three-level bucket.
data[sentiment_score_column] = data[sentiment_score_column].apply(
    func=scale_sentiment_score
)

# Show which bucket values are present after scaling.
print(
    "Unique values in the sentiment score column after scaling:",
    data[sentiment_score_column].unique(),
)

# Function to clean punctuation
def clean_text(text):
    """Strip punctuation from a comment, keeping periods.

    Args:
        text: The raw comment; may be a non-string value.

    Returns:
        ``text`` with every character removed that is not a word character,
        whitespace, or a period. Non-string input yields an empty string.
    """
    # Non-string input is normalised to an empty comment.
    if not isinstance(text, str):
        return ''
    return re.sub(r'[^\w\s.]', '', text)

# Store a punctuation-stripped copy of every comment alongside the original text.
data['cleaned_comment'] = data[correct_comment_column].apply(
    func=clean_text
)

# Ensure sentiment scores are between 1 and 3
def ensure_sentiment_scores(data, column):
    """Validate that every value in ``data[column]`` lies in the range 1-3.

    Args:
        data: Frame holding the sentiment column.
        column: Name of the column to validate.

    Raises:
        ValueError: If any value falls outside the inclusive range 1-3
            (missing values fail the range check as well).
    """
    within_range = data[column].between(1, 3)
    if within_range.all():
        return
    raise ValueError("Sentiment scores must be between 1 and 3.")

# Abort early if scaling left anything outside the 1-3 range.
ensure_sentiment_scores(data=data, column=sentiment_score_column)

# Shift the labels from 1..3 to 0..2 so they can serve as class indices.
data[sentiment_score_column] = data[sentiment_score_column].sub(1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Report how many comments ended up in each split.
print(f"Training set size: {len(train_data)} comments")
print(f"Test set size: {len(test_data)} comments")

# The combined frame is no longer needed once the split exists.
del data

# Wrap each split as a Hugging Face dataset and group them under named keys.
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
dataset_dict = DatasetDict(
    {
        "train": train_dataset,
        "test": test_dataset,
    }
)

# Drop the pandas copies of the splits; `test_data` is rebuilt from the
# tokenized dataset further down when the model is evaluated.
del train_data
del test_data

# Load the pretrained tokenizer (the 64-token limit is applied in tokenize_function).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Tokenize a batch of cleaned comments.

    Args:
        examples: Batch mapping that provides the 'cleaned_comment' texts.

    Returns:
        The tokenizer output for the batch, padded to and truncated at
        64 tokens.
    """
    comments = examples['cleaned_comment']
    return tokenizer(
        comments,
        padding='max_length',
        truncation=True,
        max_length=64,
    )

# Tokenize both splits batch-wise.
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

def format_labels(examples):
    """Add an integer "labels" entry derived from the sentiment score column.

    Args:
        examples: Batch mapping that provides the sentiment score column.

    Returns:
        The same batch with a "labels" list holding each score cast to int.
    """
    examples["labels"] = list(map(int, examples[sentiment_score_column]))
    return examples

# Attach the "labels" column to both splits.
tokenized_datasets = tokenized_datasets.map(format_labels, batched=True)

# The untokenized datasets are not referenced again.
del dataset_dict

# Print tokenized dataset sizes for debugging
print(f"Tokenized training set size: {len(tokenized_datasets['train'])} comments")
print(f"Tokenized test set size: {len(tokenized_datasets['test'])} comments")

# Load the pretrained BERT encoder with a 3-way classification head
# (one output per scaled sentiment bucket).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Training arguments with reduced batch sizes.
# NOTE(review): output_dir is an absolute, user-specific path.
training_args = TrainingArguments(
    output_dir=r"C:\Users\Michael\Downloads\BERT_MODEL_TRAINING_RESULTS-20240701T162037Z-002",  # Update with your local path
    eval_strategy="epoch",          # evaluate on the test split after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    # Mixed precision is only supported on an accelerator; requesting it
    # unconditionally makes TrainingArguments raise on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    logging_strategy="steps",
    logging_steps=50,
)

# Define evaluation metrics
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference labels).

    Returns:
        Dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    reference = p.label_ids
    # The highest-scoring class along axis 1 is the predicted label.
    predicted = np.argmax(p.predictions, axis=1)
    weighted_precision, weighted_recall, weighted_f1, _support = precision_recall_fscore_support(
        reference, predicted, average='weighted'
    )
    return {
        "accuracy": accuracy_score(reference, predicted),
        "f1": weighted_f1,
        "precision": weighted_precision,
        "recall": weighted_recall,
    }

# Initialize Trainer: train on the train split, evaluate on the test split,
# and report the metrics produced by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)

# Print expected number of steps per epoch.
# Ceiling division (written as -(-a // b)): the final, partially filled batch
# is still a step, so flooring under-reports whenever the training set size is
# not a multiple of the batch size.
# NOTE(review): assumes a single device and no gradient accumulation — confirm.
expected_steps_per_epoch = -(-len(tokenized_datasets['train']) // training_args.per_device_train_batch_size)
print(f"Expected number of steps per epoch: {expected_steps_per_epoch}")

# Train the model
trainer.train()

# Unified predict function
def predict(texts, model, tokenizer, batch_size=4, max_length=128):
    """Classify texts in batches and return labels with class probabilities.

    Args:
        texts: The texts to classify. A list is used as-is; objects exposing
            ``tolist()`` (e.g. a pandas Series) and other iterables are
            converted to a list first.
        model: Classification model; called with the tokenizer output as
            keyword arguments and expected to return an object with ``logits``.
        tokenizer: Callable that turns a list of texts into a mapping of
            tensors (invoked with ``return_tensors='pt'``).
        batch_size: Number of texts per forward pass.
        max_length: Truncation length passed to the tokenizer.
            NOTE(review): the training cell tokenizes with max_length=64;
            the default here stays at 128 for backward compatibility.

    Returns:
        Tuple ``(predictions, probabilities)`` of numpy arrays: the 0-based
        predicted class per text, and the softmax probabilities per text.
    """
    model.eval()
    if not isinstance(texts, list):
        texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)

    # Run inference on whatever device the model lives on. After training the
    # model may sit on the GPU while freshly tokenized tensors are created on
    # the CPU; feeding them in unmoved raises a device-mismatch error.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    predictions = []
    probabilities = []

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
        encoded_inputs = {name: tensor.to(device) for name, tensor in encoded_inputs.items()}

        # No gradients are needed for inference.
        with torch.no_grad():
            outputs = model(**encoded_inputs)

        batch_probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        batch_predictions = torch.argmax(batch_probabilities, dim=-1)

        # Bring results back to the CPU before converting to numpy.
        predictions.extend(batch_predictions.cpu().numpy())
        probabilities.extend(batch_probabilities.cpu().numpy())

    return np.array(predictions), np.array(probabilities)

def evaluate_model(test_dataset, model, tokenizer):
    """Predict on the test set, print summary metrics and return the raw outputs.

    Args:
        test_dataset: Frame providing 'cleaned_comment' and the sentiment
            score column.
        model: Model handed through to ``predict``.
        tokenizer: Tokenizer handed through to ``predict``.

    Returns:
        Tuple ``(predictions, probabilities)`` exactly as returned by ``predict``.
    """
    comments = test_dataset['cleaned_comment']
    expected = test_dataset[sentiment_score_column].tolist()

    predictions, probabilities = predict(comments, model, tokenizer)

    # Weighted averages across the classes.
    weighted_precision, weighted_recall, weighted_f1, _support = precision_recall_fscore_support(
        expected, predictions, average='weighted'
    )
    overall_accuracy = accuracy_score(expected, predictions)

    print(f"Accuracy: {overall_accuracy:.4f}")
    print(f"Precision: {weighted_precision:.4f}")
    print(f"Recall: {weighted_recall:.4f}")
    print(f"F1 Score: {weighted_f1:.4f}")

    return predictions, probabilities

# Rebuild a pandas view of the tokenized test split for evaluation.
test_data = tokenized_datasets['test'].to_pandas()

# Score the fine-tuned model on the held-out comments.
predictions, probabilities = evaluate_model(test_data, model, tokenizer)

# Print every test comment with its true and predicted label; labels are
# shifted back to the 1-based scale for display.
# NOTE(review): this prints the full text of every test comment.
for row in range(len(test_data)):
    comment = test_data['cleaned_comment'].iloc[row]
    true_label = test_data[sentiment_score_column].iloc[row]
    print(f"Text: {comment}")
    print(f"True Label: {true_label + 1}")
    print(f"Predicted Label: {predictions[row] + 1}")
    print(f"Probabilities: {probabilities[row]}")
    print()

# Read the whole workbook again so that rows without a sentiment score are
# included as well.
full_data = pd.read_excel(file_path)

# Apply the same punctuation cleaning that was used for training.
full_data['cleaned_comment'] = full_data[correct_comment_column].apply(func=clean_text)

# Run the fine-tuned model over every comment in the workbook.
comments_to_predict = full_data['cleaned_comment'].tolist()
predicted_labels, predicted_probabilities = predict(
    comments_to_predict,
    model,
    tokenizer,
)

# Store the predictions on the 1-based scale in the BERT_Sentiment column.
full_data[bert_sentiment_column] = predicted_labels + 1

# Columns that are written back; every other column (including
# 'cleaned_comment') is dropped from the output.
columns_to_keep = [
    'ID',
    'unidentifiableid',
    'Combined_Sentiment',
    'Combined_Wait',
    'Updated_Wait',
    'BERT_Sentiment',
    'BERT_Wait',
    'BERT_MistakeMedical',
    'BERT_MistakeClerical',
    'BERT_MistakeMedicalClerical',
    'BERT_MistakeCommunication',
    'BERT_MistakeAll',
    'careprovidercomments',
    'Medical_Mistakes',
    'Clerical_Mistakes',
    'Communication_Mistakes',
]
full_data = full_data[columns_to_keep]

# NOTE(review): this overwrites the source workbook in place, so any column
# not listed above is permanently removed from the file.
full_data.to_excel(file_path, index=False)  # Overwrite the original file




FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Michael\\Downloads\\BERTmodel\\comments.xlsx'

In [None]:
# Turn on support for third-party (custom) notebook widgets.
# NOTE(review): `google.colab` presumably only resolves inside Google Colab,
# while the training cell reads from a local Windows path — confirm which
# environment this notebook targets.
from google.colab import output
output.enable_custom_widget_manager()

Support for third party widgets will remain active for the duration of the session. To disable support:

In [None]:
# Turn support for third-party (custom) notebook widgets back off.
# NOTE(review): `google.colab` presumably only resolves inside Google Colab — confirm.
from google.colab import output
output.disable_custom_widget_manager()