<a href="https://colab.research.google.com/github/MarcosRigal/CNC/blob/main/P2/P2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/garydoranjr/misvm.git
!pip install numpy scikit-learn scipy tensorflow==2.12.0 mil

In [None]:
# importing the dataset loader, plus the datasets package itself so the bundled
# CSV files can be located relative to wherever `mil` is installed
import os

from mil.data import datasets as mil_datasets
from mil.data.datasets import loader

# Resolve the CSV folder from the installed package instead of hard-coding an
# interpreter-specific path such as /usr/local/lib/python3.11/dist-packages/...,
# which breaks as soon as the Python version or install location changes.
MIL_CSV_DIR = os.path.join(list(mil_datasets.__path__)[0], 'csv')

# load the datasets; each call returns ((bags_train, y_train), (bags_test, y_test))
(corel_dogs_bags_train, corel_dogs_y_train), (corel_dogs_bags_test, corel_dogs_y_test) = loader.load_data(os.path.join(MIL_CSV_DIR, 'corel_dogs.csv'))
(musk1_bags_train, musk1_y_train), (musk1_bags_test, musk1_y_test) = loader.load_data(os.path.join(MIL_CSV_DIR, 'musk1.csv'))
(elephant_bags_train, elephant_y_train), (elephant_bags_test, elephant_y_test) = loader.load_data(os.path.join(MIL_CSV_DIR, 'elephant.csv'))

In [None]:
import misvm
# Baseline run: MISVM with a linear kernel, C=1.0 and max_iters=50 on Musk1.
classifier = misvm.MISVM(kernel='linear', C=1.0, max_iters=50)
# Fit on the training bags and their bag-level labels.
classifier.fit(musk1_bags_train, musk1_y_train)
# Predictions for the test bags, kept exactly as predict() returns them
# (no thresholding is applied in this cell).
labels = classifier.predict(musk1_bags_test)


In [None]:
# importing bag_representation
from mil.bag_representation import MILESMapping
# importing validation strategy
from mil.validators import LeaveOneOut
# importing final model, which in this case is the SVC classifier from sklearn
from mil.models import SVC
# importing trainer
from mil.trainer import Trainer
# importing preprocessing
from mil.preprocessing import StandarizerBagsList
# importing metrics, which in this case are from tf keras metrics
from mil.metrics import AUC

# instantiate trainer
trainer = Trainer()

# preparing trainer: report accuracy ('acc') and AUC, classify with a linear,
# class-balanced SVC, and preprocess the bags by standardising them and then
# applying the MILES mapping
metrics = ['acc', AUC]
model = SVC(kernel='linear', C=1, class_weight='balanced')
pipeline = [('scale', StandarizerBagsList()), ('disc_mapping', MILESMapping())]
trainer.prepare(model, preprocess_pipeline=pipeline ,metrics=metrics)

# fitting trainer on the Musk1 training bags with leave-one-out validation
valid = LeaveOneOut()
history = trainer.fit(musk1_bags_train, musk1_y_train, sample_weights='balanced', validation_strategy=valid, verbose=1)

# printing validation results for each fold
print(history['metrics_val'])

# predicting metrics for the test set (last expression, so the cell displays it)
trainer.predict_metrics(musk1_bags_test, musk1_y_test)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from mil.data.datasets import loader
import misvm
from mil.bag_representation import MILESMapping
from mil.validators import LeaveOneOut
from mil.models import SVC
from mil.trainer import Trainer
from mil.preprocessing import StandarizerBagsList
from mil.metrics import AUC

# Step 1: Load all datasets
print("Loading datasets...")

# Locate the CSV files shipped inside the installed `mil` package rather than
# hard-coding /usr/local/lib/python3.11/dist-packages/..., which only exists for
# one specific Python version and install layout.
import os

from mil.data import datasets as mil_datasets

MIL_CSV_DIR = os.path.join(list(mil_datasets.__path__)[0], 'csv')

# Each load_data call returns ((bags_train, y_train), (bags_test, y_test)).
# Load Musk1 dataset
(musk1_bags_train, musk1_y_train), (musk1_bags_test, musk1_y_test) = loader.load_data(os.path.join(MIL_CSV_DIR, 'musk1.csv'))
# Load Elephant dataset
(elephant_bags_train, elephant_y_train), (elephant_bags_test, elephant_y_test) = loader.load_data(os.path.join(MIL_CSV_DIR, 'elephant.csv'))
# Load Corel Dogs dataset
(corel_dogs_bags_train, corel_dogs_y_train), (corel_dogs_bags_test, corel_dogs_y_test) = loader.load_data(os.path.join(MIL_CSV_DIR, 'corel_dogs.csv'))

# Step 2: Define function to evaluate MISVM algorithm
def evaluate_misvm(bags_train, y_train, bags_test, y_test, kernel='linear', C=1.0, max_iters=50):
    """
    Fit a MISVM classifier on the training bags and score it on the test bags.

    NOTE(review): this cell defines ``evaluate_misvm`` a second time further
    down; that later definition rebinds the name, so the module-level calls
    that follow it do not run this version.

    Returns:
        accuracy, f1_score, confusion_matrix
    """
    def _to_binary(values):
        # Anything strictly positive becomes 1, everything else 0.
        return np.array([1 if value > 0 else 0 for value in values])

    print(f"Training MISVM with kernel={kernel}, C={C}...")
    model = misvm.MISVM(kernel=kernel, C=C, max_iters=max_iters)
    model.fit(bags_train, y_train)
    raw_predictions = model.predict(bags_test)

    # Threshold predictions first, then the ground truth.
    predicted = _to_binary(raw_predictions)
    truth = _to_binary(y_test)

    accuracy = accuracy_score(truth, predicted)
    f1 = f1_score(truth, predicted, zero_division=0)
    matrix = confusion_matrix(truth, predicted)
    return accuracy, f1, matrix

# Step 3: Define function to evaluate MILES algorithm
def evaluate_miles(bags_train, y_train, bags_test, y_test, kernel='linear', C=1.0):
    """
    Evaluate the MILES algorithm on a dataset

    The pipeline standardises the bags, applies the MILES mapping and trains a
    class-balanced SVC through the `mil` Trainer with leave-one-out validation.

    Args:
        bags_train, y_train: training bags and their bag-level labels.
        bags_test, y_test: held-out bags and labels used for the reported metrics.
        kernel: kernel passed to the SVC model.
        C: regularisation constant passed to the SVC model.

    Returns:
        accuracy, auc
        If training or scoring raises, the error is printed and the fallback
        pair (0.0, 0.5) is returned instead.
    """
    print(f"Training MILES with kernel={kernel}, C={C}...")
    # Set up trainer
    trainer = Trainer()
    metrics = ['acc', AUC]
    model = SVC(kernel=kernel, C=C, class_weight='balanced')
    pipeline = [('scale', StandarizerBagsList()), ('disc_mapping', MILESMapping())]
    trainer.prepare(model, preprocess_pipeline=pipeline, metrics=metrics)

    # Convert labels to ensure they're binary (0 or 1): positive -> 1, otherwise 0
    y_train_binary = np.array([1 if y > 0 else 0 for y in y_train])
    y_test_binary = np.array([1 if y > 0 else 0 for y in y_test])

    # Train with LOO validation
    valid = LeaveOneOut()
    try:
        # The fit history is not needed here; only the test metrics are reported.
        trainer.fit(bags_train, y_train_binary, sample_weights='balanced', validation_strategy=valid, verbose=0)
        # Test metrics
        result = trainer.predict_metrics(bags_test, y_test_binary)

        # NOTE(review): the key under which the trainer reports accuracy is not
        # visible from this notebook. 'acc' is tried first (the original lookup);
        # common aliases follow so that a differently named key does not get
        # silently turned into the 0.0 fallback below -- confirm against `mil`.
        acc_key = next((key for key in ('acc', 'accuracy', 'binary_accuracy') if key in result), None)
        if acc_key is None:
            raise KeyError(f"no accuracy entry in test metrics; available keys: {sorted(result)}")

        # Return accuracy and AUC
        return result[acc_key], result.get('auc', 0.5)  # Default AUC to 0.5 if not available
    except Exception as e:
        # Deliberate best-effort: report the problem and keep the notebook running.
        print(f"Error training MILES: {e}")
        # Return default values
        return 0.0, 0.5

def evaluate_misvm(bags_train, y_train, bags_test, y_test, kernel='linear', C=1.0, max_iters=None):
    """
    Evaluate the MISVM algorithm on a dataset

    Args:
        bags_train, y_train: training bags and their bag-level labels.
        bags_test, y_test: held-out bags and labels used for the reported metrics.
        kernel: kernel passed to misvm.MISVM.
        C: regularisation constant passed to misvm.MISVM.
        max_iters: optional iteration cap forwarded to misvm.MISVM. When left
            as None the argument is not passed at all, so the classifier keeps
            its own default.

    Returns:
        accuracy, f1_score, confusion_matrix
        The confusion matrix is always 2x2 (rows/columns ordered 0, 1). If
        training or prediction raises, the error is printed and the fallback
        (0.0, 0.0, 2x2 zero matrix) is returned instead.
    """
    print(f"Training MISVM with kernel={kernel}, C={C}...")
    # Only forward max_iters when the caller asked for it.
    extra_args = {} if max_iters is None else {'max_iters': max_iters}
    classifier = misvm.MISVM(kernel=kernel, C=C, **extra_args)

    # Ensure labels are binary (0 or 1): positive -> 1, otherwise 0
    y_train_binary = np.array([1 if y > 0 else 0 for y in y_train])
    y_test_binary = np.array([1 if y > 0 else 0 for y in y_test])

    try:
        classifier.fit(bags_train, y_train_binary)
        y_pred = classifier.predict(bags_test)

        # Ensure predictions are binary (0 or 1)
        y_pred_binary = np.array([1 if pred > 0 else 0 for pred in y_pred])

        # Calculate metrics
        acc = accuracy_score(y_test_binary, y_pred_binary)
        f1 = f1_score(y_test_binary, y_pred_binary, zero_division=0)
        # Pin the label set so the matrix stays 2x2 even when only one class
        # shows up in the test labels and predictions (matches the fallback shape).
        cm = confusion_matrix(y_test_binary, y_pred_binary, labels=[0, 1])

        return acc, f1, cm
    except Exception as e:
        # Deliberate best-effort: report the problem and keep the notebook running.
        print(f"Error training MISVM: {e}")
        # Return default values
        return 0.0, 0.0, np.array([[0, 0], [0, 0]])

print("\nEvaluating algorithms on all datasets...")
# Step 4: Evaluate MISVM (RBF kernel, C=10.0) on each dataset.
# Each result is the tuple (accuracy, f1_score, confusion_matrix).
misvm_musk1 = evaluate_misvm(musk1_bags_train, musk1_y_train, musk1_bags_test, musk1_y_test, kernel='rbf', C=10.0)
misvm_elephant = evaluate_misvm(elephant_bags_train, elephant_y_train, elephant_bags_test, elephant_y_test, kernel='rbf', C=10.0)
misvm_corel_dogs = evaluate_misvm(corel_dogs_bags_train, corel_dogs_y_train, corel_dogs_bags_test, corel_dogs_y_test, kernel='rbf', C=10.0)
# Step 5: Evaluate MILES (linear kernel, C=1.0) on each dataset.
# Each result is the tuple (accuracy, auc).
miles_musk1 = evaluate_miles(musk1_bags_train, musk1_y_train, musk1_bags_test, musk1_y_test, kernel='linear', C=1.0)
miles_elephant = evaluate_miles(elephant_bags_train, elephant_y_train, elephant_bags_test, elephant_y_test, kernel='linear', C=1.0)
miles_corel_dogs = evaluate_miles(corel_dogs_bags_train, corel_dogs_y_train, corel_dogs_bags_test, corel_dogs_y_test, kernel='linear', C=1.0)

# Step 6: Visualize Results
def plot_confusion_matrix(cm, title):
    """Render `cm` as an annotated blue heatmap on a 6x6 figure and show it."""
    fig, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(title)
    # Rows of the matrix are the true labels, columns the predicted ones.
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    plt.show()

print("\nPlotting confusion matrices...")
# Index 2 of each MISVM result tuple holds the confusion matrix.
for _misvm_result, _cm_title in (
    (misvm_musk1, 'MISVM Musk1 Confusion Matrix'),
    (misvm_elephant, 'MISVM Elephant Confusion Matrix'),
    (misvm_corel_dogs, 'MISVM Corel Dogs Confusion Matrix'),
):
    plot_confusion_matrix(_misvm_result[2], _cm_title)

# Step 7: Compare Metrics
def compare_misvm_miles(misvm_results, miles_results, dataset_name):
    """Print the headline metrics of both algorithms for one dataset.

    Reads accuracy and F1 from positions 0 and 1 of `misvm_results`, and
    accuracy and AUC from positions 0 and 1 of `miles_results`.
    """
    print(f"\nDataset: {dataset_name}")

    misvm_acc, misvm_f1 = misvm_results[0], misvm_results[1]
    print(f"MISVM Accuracy: {misvm_acc:.4f}, F1 Score: {misvm_f1:.4f}")

    miles_acc, miles_auc = miles_results[0], miles_results[1]
    print(f"MILES Accuracy: {miles_acc:.4f}, AUC: {miles_auc:.4f}")

# Compare metrics, one dataset at a time
for _dataset_label, _misvm_result, _miles_result in (
    ('Musk1', misvm_musk1, miles_musk1),
    ('Elephant', misvm_elephant, miles_elephant),
    ('Corel Dogs', misvm_corel_dogs, miles_corel_dogs),
):
    compare_misvm_miles(_misvm_result, _miles_result, _dataset_label)

# Step 8: Create a bar chart to compare algorithm performance
def plot_comparison(datasets, misvm_metrics, miles_metrics, metric_name='Accuracy', metric_index=0):
    """Create a grouped bar chart comparing MISVM and MILES across datasets.

    Args:
        datasets: dataset names, used as x tick labels (one group per name).
        misvm_metrics: one result tuple per dataset for MISVM.
        miles_metrics: one result tuple per dataset for MILES.
        metric_name: text used for the y-axis label and the title.
        metric_index: position inside each result tuple to plot. The default 0
            is accuracy for both algorithms. Other positions do not mean the
            same thing for both (position 1 is F1 for MISVM results but AUC
            for MILES results), so choose `metric_name` accordingly.
    """
    misvm_values = [m[metric_index] for m in misvm_metrics]
    miles_values = [m[metric_index] for m in miles_metrics]

    x = np.arange(len(datasets))
    width = 0.35  # bar width; the two bars of a group sit side by side around x

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(x - width/2, misvm_values, width, label='MISVM')
    ax.bar(x + width/2, miles_values, width, label='MILES')

    ax.set_ylabel(metric_name)
    ax.set_title(f'{metric_name} by Algorithm and Dataset')
    ax.set_xticks(x)
    ax.set_xticklabels(datasets)
    ax.legend()

    # Set the limits on this axes object explicitly instead of going through
    # pyplot's "current axes" state.
    ax.set_ylim(0, 1)
    plt.show()

# Plot accuracy comparison (one bar group per dataset, MISVM next to MILES)
print("\nPlotting accuracy comparison...")
plot_comparison(
    datasets=['Musk1', 'Elephant', 'Corel Dogs'],
    misvm_metrics=[misvm_musk1, misvm_elephant, misvm_corel_dogs],
    miles_metrics=[miles_musk1, miles_elephant, miles_corel_dogs],
    metric_name='Accuracy',
)

# Step 9: Generate summary of findings
# NOTE: a dataset whose evaluation failed contributes the fallback accuracy 0.0
# returned by evaluate_misvm / evaluate_miles, which lowers the averages below.
print("\n=== SUMMARY OF FINDINGS ===")
print("1. Algorithm Performance Comparison:")
print("   - MISVM:")
musk1_acc, elephant_acc, corel_acc = misvm_musk1[0], misvm_elephant[0], misvm_corel_dogs[0]
print(f"     Musk1: {musk1_acc:.4f}, Elephant: {elephant_acc:.4f}, Corel Dogs: {corel_acc:.4f}")
print(f"     Average accuracy: {np.mean([musk1_acc, elephant_acc, corel_acc]):.4f}")

print("   - MILES:")
# The three *_acc names are reused on purpose; from here on they hold MILES values.
musk1_acc, elephant_acc, corel_acc = miles_musk1[0], miles_elephant[0], miles_corel_dogs[0]
print(f"     Musk1: {musk1_acc:.4f}, Elephant: {elephant_acc:.4f}, Corel Dogs: {corel_acc:.4f}")
print(f"     Average accuracy: {np.mean([musk1_acc, elephant_acc, corel_acc]):.4f}")

print("\n2. Dataset Difficulty Analysis:")
datasets = ['Musk1', 'Elephant', 'Corel Dogs']
misvm_accs = [misvm_musk1[0], misvm_elephant[0], misvm_corel_dogs[0]]
miles_accs = [miles_musk1[0], miles_elephant[0], miles_corel_dogs[0]]
# Per-dataset accuracy averaged over the two algorithms
avg_accs = [(m + n)/2 for m, n in zip(misvm_accs, miles_accs)]

# Sort datasets by difficulty (lower accuracy = more difficult)
difficulty_order = np.argsort(avg_accs)
for i in difficulty_order:
    print(f"   - {datasets[i]}: Average accuracy {avg_accs[i]:.4f}")

print("\n3. Conclusions:")
misvm_mean_acc = np.mean(misvm_accs)
miles_mean_acc = np.mean(miles_accs)
# Check for a tie with a tolerance first: comparing two float means with exact
# equality made the "performed similarly" branch practically unreachable, since
# rounding alone can make equal averages differ in the last bits.
if np.isclose(misvm_mean_acc, miles_mean_acc):
    print("   - Both algorithms performed similarly overall, but showed different strengths on specific datasets.")
elif misvm_mean_acc > miles_mean_acc:
    print("   - MISVM performed better overall, suggesting it's more suitable for these datasets.")
else:
    print("   - MILES performed better overall, suggesting it's more suitable for these datasets.")

print("   - The datasets vary in difficulty, with some datasets being more challenging for both algorithms.")
print("   - Hyperparameter tuning and additional feature engineering could potentially improve results further.")

Loading datasets...

Evaluating algorithms on all datasets...
Training MISVM with kernel=rbf, C=10.0...
Non-random start...

Iteration 1...
Training SVM...
     pcost       dcost       gap    pres   dres
 0: -2.7928e+01 -1.0764e+01  7e+02  3e+01  2e-16
 1: -5.0527e+00 -6.6788e+00  2e+01  7e-01  9e-16
 2: -3.0290e+00 -5.0573e+00  2e+00  5e-16  4e-16
 3: -3.1239e+00 -3.2443e+00  1e-01  6e-16  2e-16
 4: -3.1245e+00 -3.1284e+00  4e-03  9e-16  2e-16
 5: -3.1250e+00 -3.1253e+00  2e-04  6e-16  2e-16
 6: -3.1251e+00 -3.1251e+00  2e-06  8e-17  5e-17
Optimal solution found.
Recomputing classes...
Selector differences: 227
Updating QP...

Iteration 2...
Training SVM...
     pcost       dcost       gap    pres   dres
 0: -2.7928e+01 -1.0764e+01  7e+02  3e+01  2e-16
 1: -5.0527e+00 -6.6788e+00  2e+01  7e-01  9e-16
 2: -3.0290e+00 -5.0573e+00  2e+00  5e-16  4e-16
 3: -3.1239e+00 -3.2443e+00  1e-01  6e-16  2e-16
 4: -3.1245e+00 -3.1284e+00  4e-03  9e-16  2e-16
 5: -3.1250e+00 -3.1253e+00  2e-04  6e-1