In [1]:
# Cell 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
import warnings
warnings.filterwarnings('ignore')

In [2]:
from google.colab import files
uploaded = files.upload()

Saving complete_set.csv to complete_set.csv


In [3]:

# Cell 2: Define the Dataset class
class CommentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.FloatTensor(label)
        }

# Cell 3: Load and prepare data
df = pd.read_csv('complete_set.csv')
print(f"Total number of samples: {len(df)}")

# Split into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")

# Prepare features and labels
train_texts = train_df['comment'].values
test_texts = test_df['comment'].values
train_labels = train_df.iloc[:, 3:].values  # All columns except Comment
test_labels = test_df.iloc[:, 3:].values

Total number of samples: 35047
Training samples: 28037
Test samples: 7010


In [None]:
# Cell 4: Initialize tokenizer and model
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(df.columns)-3,
    problem_type="multi_label_classification"
)

# Cell 5: Create datasets
train_dataset = CommentDataset(train_texts, train_labels, tokenizer)
test_dataset = CommentDataset(test_texts, test_labels, tokenizer)

# Cell 6: Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Cell 7: Initialize trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

print("Starting training...")
trainer.train()
print("Training completed!")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss


In [None]:
tag_names = df.columns[3:].tolist()


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import torch

def evaluate_metrics(predictions, test_labels, tag_names, threshold=0.1):
    """
    Calculate comprehensive classification metrics for all classes and per class.

    Args:
        predictions: Raw model predictions (logits)
        test_labels: True labels
        tag_names: List of class names
        threshold: Probability threshold for positive prediction
    """
    # Convert logits to probabilities and then to binary predictions
    probabilities = torch.sigmoid(torch.tensor(predictions)).numpy()
    preds = (probabilities > threshold).astype(int)

    # Overall accuracy across all samples and classes
    overall_accuracy = accuracy_score(test_labels.flatten(), preds.flatten())

    # Per-class metrics
    class_metrics = []
    for i in range(len(tag_names)):
        # Per-class accuracy
        class_accuracy = accuracy_score(test_labels[:, i], preds[:, i])

        # True positives, false positives, true negatives, false negatives
        tp = np.sum((test_labels[:, i] == 1) & (preds[:, i] == 1))
        fp = np.sum((test_labels[:, i] == 0) & (preds[:, i] == 1))
        tn = np.sum((test_labels[:, i] == 0) & (preds[:, i] == 0))
        fn = np.sum((test_labels[:, i] == 1) & (preds[:, i] == 0))

        # Calculate precision, recall, f1
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        # Support (number of true instances)
        support = np.sum(test_labels[:, i])

        class_metrics.append({
            'class': tag_names[i],
            'accuracy': class_accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'support': int(support),
            'true_pos': int(tp),
            'false_pos': int(fp),
            'true_neg': int(tn),
            'false_neg': int(fn)
        })

    # Calculate macro averages
    macro_accuracy = np.mean([m['accuracy'] for m in class_metrics])
    macro_precision = np.mean([m['precision'] for m in class_metrics])
    macro_recall = np.mean([m['recall'] for m in class_metrics])
    macro_f1 = np.mean([m['f1'] for m in class_metrics])

    # Print comprehensive report
    print("\nCLASSIFICATION REPORT")
    print("=" * 100)
    print(f"Overall Accuracy: {overall_accuracy:.4f}")
    print(f"Macro Accuracy: {macro_accuracy:.4f}")
    print("\nMacro Averages:")
    print(f"Precision: {macro_precision:.4f}")
    print(f"Recall: {macro_recall:.4f}")
    print(f"F1-Score: {macro_f1:.4f}")

    print("\nPer-Class Metrics:")
    print("-" * 100)
    print(f"{'Class':<30} {'Accuracy':>10} {'Precision':>10} {'Recall':>10} {'F1-Score':>10} {'Support':>10}")
    print("-" * 100)

    # Sort classes by F1 score for display
    sorted_metrics = sorted(class_metrics, key=lambda x: x['f1'], reverse=True)

    for metric in sorted_metrics:
        print(f"{metric['class'][:30]:<30} "
              f"{metric['accuracy']:>10.4f} "
              f"{metric['precision']:>10.4f} "
              f"{metric['recall']:>10.4f} "
              f"{metric['f1']:>10.4f} "
              f"{metric['support']:>10d}")

    print("\nDetailed Statistics for Top 5 Classes:")
    print("-" * 100)
    for metric in sorted_metrics[:5]:
        print(f"\nClass: {metric['class']}")
        print(f"True Positives: {metric['true_pos']}")
        print(f"False Positives: {metric['false_pos']}")
        print(f"True Negatives: {metric['true_neg']}")
        print(f"False Negatives: {metric['false_neg']}")

    return {
        'overall_accuracy': overall_accuracy,
        'macro_accuracy': macro_accuracy,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'macro_f1': macro_f1,
        'class_metrics': class_metrics
    }

def evaluate_on_test_set(trainer, test_texts, test_labels, tokenizer, tag_names, threshold=0.1):
    """
    Evaluate model on test set
    """
    # Create test dataset
    dataset = CommentDataset(test_texts, test_labels, tokenizer)
    predictions = trainer.predict(dataset).predictions

    # Calculate and return metrics
    return evaluate_metrics(predictions, test_labels, tag_names, threshold)

# Example usage
metrics = evaluate_on_test_set(trainer, test_texts, test_labels, tokenizer, tag_names, threshold=0.1)

In [None]:
# Example Usage
metrics = evaluate_on_test_set(trainer, test_texts, test_labels, tokenizer, tag_names, threshold=0.2)


In [None]:
def predict_top3_tags(texts, model, tokenizer, threshold=0.0):
    # Get predictions
    dataset = CommentDataset(texts, np.zeros((len(texts), model.config.num_labels)), tokenizer)
    predictions = trainer.predict(dataset)
    probabilities = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()

    # Initialize arrays with fixed shapes
    n_samples = len(texts)
    n_labels = probabilities.shape[1]
    predictions_binary = np.zeros((n_samples, n_labels))

    # Process each sample
    for i in range(n_samples):
        # Get indices where probability is above threshold
        above_threshold = np.where(probabilities[i] >= threshold)[0]

        # Take up to 3 highest probabilities that meet threshold
        if len(above_threshold) > 0:
            # Sort by probability among those above threshold
            sorted_indices = above_threshold[np.argsort(-probabilities[i][above_threshold])]
            selected_indices = sorted_indices[:3]  # Take up to 3
            predictions_binary[i, selected_indices] = 1

    return predictions_binary, probabilities

def calculate_modified_accuracy(y_true, y_pred):
    correct = 0
    total = len(y_true)

    for i in range(total):
        true_indices = set(np.where(y_true[i] == 1)[0])
        pred_indices = set(np.where(y_pred[i] == 1)[0])

        # Case 1: Both true and predicted are empty
        if len(true_indices) == 0 and len(pred_indices) == 0:
            correct += 1
            continue

        # Case 2: At least one is non-empty
        if len(true_indices) > 0 or len(pred_indices) > 0:
            intersection = true_indices.intersection(pred_indices)
            union = true_indices.union(pred_indices)
            if len(intersection) > 0:  # At least one correct prediction
                correct += 1

    return correct / total

# Evaluate on test set
print("\nEvaluating on test set...")
predictions_binary, probabilities = predict_top3_tags(test_texts, model, tokenizer, threshold=0.1)

# Calculate and print accuracy
accuracy = calculate_modified_accuracy(test_labels, predictions_binary)
print(f"\nModified Accuracy on test set (threshold=0.0): {accuracy:.4f}")

# Print detailed statistics
print("\nDetailed Statistics:")
tag_names = df.columns[3:].tolist()

# Count various cases
total_samples = len(test_texts)
empty_true = 0
empty_pred = 0
both_empty = 0
correct_matches = 0
total_matches = 0

for i in range(total_samples):
    true_indices = set(np.where(test_labels[i] == 1)[0])
    pred_indices = set(np.where(predictions_binary[i] == 1)[0])

    if len(true_indices) == 0:
        empty_true += 1
    if len(pred_indices) == 0:
        empty_pred += 1
    if len(true_indices) == 0 and len(pred_indices) == 0:
        both_empty += 1

    intersection = true_indices.intersection(pred_indices)
    if len(intersection) > 0:
        correct_matches += 1
    total_matches += 1

print(f"\nTotal samples: {total_samples}")
print(f"Samples with no true tags: {empty_true} ({empty_true/total_samples*100:.1f}%)")
print(f"Samples with no predictions above threshold: {empty_pred} ({empty_pred/total_samples*100:.1f}%)")
print(f"Samples with both true and predicted empty: {both_empty} ({both_empty/total_samples*100:.1f}%)")
print(f"Samples with at least one correct match: {correct_matches} ({correct_matches/total_samples*100:.1f}%)")

# Show examples
print("\nDetailed examples from test set:")
for i in range(5):
    print(f"\nExample {i+1}:")
    print(f"Text: {test_texts[i]}")

    # True tags
    true_indices = np.where(test_labels[i] == 1)[0]
    print("True tags:", [tag_names[j] for j in true_indices] if len(true_indices) > 0 else "None")

    # Predicted tags
    pred_indices = np.where(predictions_binary[i] == 1)[0]
    if len(pred_indices) > 0:
        print("Predicted tags:")
        for idx in pred_indices:
            print(f"- {tag_names[idx]} (confidence: {probabilities[i][idx]:.2f})")
    else:
        print("Predicted tags: None")

    # Show match status
    intersection = set(true_indices).intersection(set(pred_indices))
    if len(true_indices) == 0 and len(pred_indices) == 0:
        print("Status: Correct (both empty)")
    elif len(intersection) > 0:
        print("Status: Correct (matching tags found)")
    else:
        print("Status: Incorrect (no matching tags)")
    print("-" * 80)

# Calculate threshold statistics
above_threshold = probabilities >= 0.1
predictions_per_sample = np.sum(above_threshold, axis=1)

print("\nThreshold Statistics:")
print(f"Average predictions per sample above threshold: {np.mean(predictions_per_sample):.2f}")
print(f"Samples with no predictions above threshold: {np.sum(predictions_per_sample == 0)}")
print(f"Samples with 1-3 predictions above threshold: {np.sum((predictions_per_sample > 0) & (predictions_per_sample <= 3))}")
print(f"Samples with >3 predictions above threshold: {np.sum(predictions_per_sample > 3)}")

In [None]:
def analyze_tag_matches(y_true, y_pred, probabilities, tag_names):
    """
    Analyze different levels of tag matching accuracy
    """
    total_samples = len(y_true)

    # Initialize counters
    stats = {
        'at_least_one_match': 0,
        'at_least_two_matches': 0,
        'exact_three_matches': 0,
        'total_samples': total_samples,
        'match_distribution': {0: 0, 1: 0, 2: 0, 3: 0},
        'true_tag_counts': {0: 0, 1: 0, 2: 0, 3: 0},
        'pred_tag_counts': {0: 0, 1: 0, 2: 0, 3: 0}
    }

    # Detailed examples for each case
    examples = {
        'two_matches': [],
        'three_matches': [],
        'no_matches': []
    }

    for i in range(total_samples):
        true_indices = set(np.where(y_true[i] == 1)[0])
        pred_indices = set(np.where(y_pred[i] == 1)[0])

        # Count number of true and predicted tags
        num_true = len(true_indices)
        num_pred = len(pred_indices)
        stats['true_tag_counts'][min(3, num_true)] += 1
        stats['pred_tag_counts'][min(3, num_pred)] += 1

        # Calculate intersection
        intersection = true_indices.intersection(pred_indices)
        num_matches = len(intersection)

        # Update match distribution
        stats['match_distribution'][num_matches] += 1

        # Update match counters
        if num_matches >= 1:
            stats['at_least_one_match'] += 1
        if num_matches >= 2:
            stats['at_least_two_matches'] += 1
            if len(examples['two_matches']) < 3:  # Store up to 3 examples
                examples['two_matches'].append({
                    'text': test_texts[i],
                    'true_tags': [tag_names[j] for j in true_indices],
                    'pred_tags': [(tag_names[j], probabilities[i][j]) for j in pred_indices],
                    'matches': [tag_names[j] for j in intersection]
                })
        if num_matches == 3:
            stats['exact_three_matches'] += 1
            if len(examples['three_matches']) < 3:
                examples['three_matches'].append({
                    'text': test_texts[i],
                    'true_tags': [tag_names[j] for j in true_indices],
                    'pred_tags': [(tag_names[j], probabilities[i][j]) for j in pred_indices],
                    'matches': [tag_names[j] for j in intersection]
                })
        if num_matches == 0 and len(examples['no_matches']) < 3:
            examples['no_matches'].append({
                'text': test_texts[i],
                'true_tags': [tag_names[j] for j in true_indices],
                'pred_tags': [(tag_names[j], probabilities[i][j]) for j in pred_indices]
            })

    # Print comprehensive report
    print("\nTAG MATCHING ANALYSIS")
    print("=" * 80)

    print("\n1. Overall Matching Statistics:")
    print(f"Total samples: {total_samples}")
    print(f"At least one matching tag: {stats['at_least_one_match']} ({stats['at_least_one_match']/total_samples*100:.1f}%)")
    print(f"At least two matching tags: {stats['at_least_two_matches']} ({stats['at_least_two_matches']/total_samples*100:.1f}%)")
    print(f"Exact three matching tags: {stats['exact_three_matches']} ({stats['exact_three_matches']/total_samples*100:.1f}%)")

    print("\n2. Match Distribution:")
    print(f"{'Matches':<15} {'Count':>10} {'Percentage':>12}")
    print("-" * 40)
    for matches, count in stats['match_distribution'].items():
        print(f"{f'{matches} tags':<15} {count:>10} {count/total_samples*100:>11.1f}%")

    print("\n3. True Tags Distribution:")
    print(f"{'True Tags':<15} {'Count':>10} {'Percentage':>12}")
    print("-" * 40)
    for tags, count in stats['true_tag_counts'].items():
        print(f"{f'{tags} tags':<15} {count:>10} {count/total_samples*100:>11.1f}%")

    print("\n4. Predicted Tags Distribution:")
    print(f"{'Pred Tags':<15} {'Count':>10} {'Percentage':>12}")
    print("-" * 40)
    for tags, count in stats['pred_tag_counts'].items():
        print(f"{f'{tags} tags':<15} {count:>10} {count/total_samples*100:>11.1f}%")

    print("\n5. Example Cases:")

    if examples['three_matches']:
        print("\nPerfect Match Examples (3 matches):")
        for ex in examples['three_matches'][:2]:
            print("\nText:", ex['text'][:100], "...")
            print("True tags:", ", ".join(ex['true_tags']))
            print("Predicted tags:")
            for tag, prob in ex['pred_tags']:
                print(f"- {tag} (confidence: {prob:.2f})")

    if examples['two_matches']:
        print("\nPartial Match Examples (2 matches):")
        for ex in examples['two_matches'][:2]:
            print("\nText:", ex['text'][:100], "...")
            print("True tags:", ", ".join(ex['true_tags']))
            print("Predicted tags:")
            for tag, prob in ex['pred_tags']:
                print(f"- {tag} (confidence: {prob:.2f})")
            print("Matching tags:", ", ".join(ex['matches']))

    if examples['no_matches']:
        print("\nNo Match Examples:")
        for ex in examples['no_matches'][:2]:
            print("\nText:", ex['text'][:100], "...")
            print("True tags:", ", ".join(ex['true_tags']))
            print("Predicted tags:")
            for tag, prob in ex['pred_tags']:
                print(f"- {tag} (confidence: {prob:.2f})")

    return stats

# Run the analysis
print("\nEvaluating tag matching...")
predictions_binary, probabilities = predict_top3_tags(test_texts, model, tokenizer, threshold=0.1)
matching_stats = analyze_tag_matches(test_labels, predictions_binary, probabilities, tag_names)

In [None]:
#### orignial

In [None]:
# Prepare labels
labels = df[['student_difficult', 'student_star', 'gives_good_feedback', 'caring', 'respected']].values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df['comments'], labels, test_size=0.2, random_state=42)

# Create datasets
train_dataset = CustomDataset(X_train.tolist(), y_train.tolist())
test_dataset = CustomDataset(X_test.tolist(), y_test.tolist())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



In [None]:
# Define the model (5 outputs: 2 for regression, 3 for classification)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Custom loss function
def custom_loss(outputs, labels):
    mse_loss = torch.nn.MSELoss()
    bce_loss = torch.nn.BCEWithLogitsLoss()

    # MSE for difficulty and rating (first two outputs)
    reg_loss = mse_loss(outputs[:, :2], labels[:, :2])

    # BCE for the three tags (last three outputs)
    class_loss = bce_loss(outputs[:, 2:], labels[:, 2:])

    # Combine losses (you can adjust the weights if needed)
    total_loss = reg_loss + class_loss
    return total_loss

# Custom Trainer
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = custom_loss(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Create Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,3.2243
1000,1.682


TrainOutput(global_step=1140, training_loss=2.331558455082408, metrics={'train_runtime': 1356.598, 'train_samples_per_second': 6.714, 'train_steps_per_second': 0.84, 'total_flos': 2396480041562112.0, 'train_loss': 2.331558455082408, 'epoch': 3.0})

In [None]:
# Save the model
output_dir = '/content/finetuned_model'
model.save_pretrained(output_dir)
# trainer.tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")


Model saved to /content/finetuned_model


In [None]:
# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Make predictions
predictions = trainer.predict(test_dataset)
predicted_values = predictions.predictions

# Separate predictions
predicted_difficulty = predicted_values[:, 0]
predicted_rating = predicted_values[:, 1]
predicted_tags = (predicted_values[:, 2:] > 0.5).astype(int)

# Calculate MSE for difficulty and rating
difficulty_mse = mean_squared_error(y_test[:, 0], predicted_difficulty)
rating_mse = mean_squared_error(y_test[:, 1], predicted_rating)
print(f"Difficulty Mean Squared Error: {difficulty_mse}")
print(f"Rating Mean Squared Error: {rating_mse}")

Evaluation results: {'eval_loss': 1.6852487325668335, 'eval_runtime': 24.9748, 'eval_samples_per_second': 30.431, 'eval_steps_per_second': 3.804, 'epoch': 3.0}
Difficulty Mean Squared Error: 1.2935267965562556
Rating Mean Squared Error: 0.8844834371420222


In [None]:
# Calculate accuracy and print classification report for tags
tags = ['gives_good_feedback', 'caring', 'respected']
for i, tag in enumerate(tags):
    accuracy = accuracy_score(y_test[:, i+2], predicted_tags[:, i])
    print(f"\n{tag} Accuracy: {accuracy}")
    print(classification_report(y_test[:, i+2], predicted_tags[:, i]))


gives_good_feedback Accuracy: 0.7105263157894737
              precision    recall  f1-score   support

         0.0       0.71      1.00      0.83       540
         1.0       0.00      0.00      0.00       220

    accuracy                           0.71       760
   macro avg       0.36      0.50      0.42       760
weighted avg       0.50      0.71      0.59       760


caring Accuracy: 0.7157894736842105
              precision    recall  f1-score   support

         0.0       0.72      1.00      0.83       544
         1.0       0.00      0.00      0.00       216

    accuracy                           0.72       760
   macro avg       0.36      0.50      0.42       760
weighted avg       0.51      0.72      0.60       760


respected Accuracy: 0.6986842105263158
              precision    recall  f1-score   support

         0.0       0.70      1.00      0.82       531
         1.0       0.00      0.00      0.00       229

    accuracy                           0.70       760
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
