In [None]:
import numpy as np
from scipy.optimize import linprog

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import hamming_loss, accuracy_score
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
from skmultilearn.adapt import MLkNN
from skmultilearn.dataset import load_dataset
import pulp


In [None]:
def mmo(X, y):
    """Oversample a multi-label dataset by greedily duplicating existing rows.

    The target count for every label is the count of the most frequent label
    in ``y``. On each iteration exactly one existing sample is duplicated:
    the eligible sample with the lowest score
    ``(times_already_selected + 1) / contribution``, where ``contribution``
    is the sum over labels of ``min(samples_still_needed, sample_label)``.
    A sample is eligible only if adding it would not push any label count
    above the target and its contribution is positive. Ties are resolved in
    favour of the lowest row index. The loop ends when no label is below the
    target or when no eligible sample remains.

    Parameters
    ----------
    X : numpy.ndarray, shape (N, num_features)
        Feature matrix.
    y : numpy.ndarray, shape (N, M)
        Label matrix; presumably a 0/1 indicator matrix (the scoring logic
        is written with binary labels in mind).

    Returns
    -------
    X_resampled : numpy.ndarray
        ``X`` followed by the duplicated rows, in selection order.
    y_resampled : numpy.ndarray
        ``y`` followed by the label rows of the duplicated samples.
    selected_samples : list of int
        Row indices of the duplicated samples, in selection order.
    sample_costs : numpy.ndarray of int, shape (N,)
        Number of times each original sample was duplicated.

    Raises
    ------
    ValueError
        If ``X`` or ``y`` is not 2-D, or their row counts differ.
    """
    if X.ndim != 2 or y.ndim != 2:
        raise ValueError("X and y must both be 2-D arrays.")
    N = X.shape[0]
    if y.shape[0] != N:
        raise ValueError(
            f"X and y must have the same number of rows, got {N} and {y.shape[0]}."
        )

    # Current label counts
    current_label_counts = np.sum(y, axis=0)
    print("Label counts before oversampling:\n", current_label_counts)

    # Every label is raised towards the count of the most frequent label
    T = np.max(current_label_counts)
    samples_needed_per_label = T - current_label_counts

    # How many times each sample has been duplicated so far; this is also the
    # cost term that discourages picking the same sample over and over.
    sample_costs = np.zeros(N, dtype=int)
    selected_samples = []

    # While there are labels that need more samples
    while np.any(samples_needed_per_label > 0):
        # Rows that would push at least one label past the target
        exceeds_target = np.any(current_label_counts + y > T, axis=1)
        # Per-row number of still-needed label occurrences the row supplies
        contribution = np.minimum(samples_needed_per_label, y).sum(axis=1)
        eligible = np.flatnonzero(~exceeds_target & (contribution > 0))

        if eligible.size == 0:
            print("Cannot balance further without exceeding label counts.")
            break

        # Lowest score wins; argmin returns the first minimum and `eligible`
        # is in ascending order, so ties go to the lowest row index.
        scores = (sample_costs[eligible] + 1) / contribution[eligible]
        best_idx = int(eligible[np.argmin(scores)])

        # Update counts and bookkeeping for the chosen sample
        current_label_counts += y[best_idx]
        samples_needed_per_label = T - current_label_counts
        sample_costs[best_idx] += 1
        selected_samples.append(best_idx)

    print("Label counts after oversampling:\n", current_label_counts)

    # Append all duplicated rows in one go instead of re-stacking per iteration
    picked = np.asarray(selected_samples, dtype=np.intp)
    X_resampled = np.vstack((X, X[picked]))
    y_resampled = np.vstack((y, y[picked]))
    return X_resampled, y_resampled, selected_samples, sample_costs

In [None]:
from skmultilearn.dataset import load_dataset
import numpy as np

# Names of the benchmark multi-label datasets to fetch with load_dataset
datasets = [
    'yeast', 'scene', 'emotions', 'bibtex', 'birds', 'cal500',
    'corel5k', 'delicious', 'enron', 'genbase', 'mediamill',
    'medical', 'tmc2007_500',
]

# Maps dataset name -> dict holding its train/test feature and label matrices
data_dict = {}

for dataset in datasets:
    try:
        # Fetch the 'train' and 'test' splits; the two trailing return values are discarded
        X_train, y_train, _, _ = load_dataset(dataset, 'train')
        X_test, y_test, _, _ = load_dataset(dataset, 'test')

        data_dict[dataset] = dict(
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test,
        )

        print(f"Successfully loaded {dataset} dataset.")
    except Exception as e:
        # Best-effort loading: report the failure and continue with the next dataset
        print(f"Error loading {dataset} dataset: {e}")

# Example: pull the four 'yeast' matrices back out of the dictionary
X_train_yeast, y_train_yeast, X_test_yeast, y_test_yeast = (
    data_dict['yeast'][split] for split in ('X_train', 'y_train', 'X_test', 'y_test')
)

# Report the shapes of the yeast training matrices
print(f"Yeast dataset - X_train shape: {X_train_yeast.shape}, y_train shape: {y_train_yeast.shape}")

In [None]:
# Densify the yeast splits into plain ndarrays (toarray() returns an ndarray
# directly, avoiding the intermediate np.matrix that todense() produces).
X_train = data_dict['yeast']['X_train'].toarray()
y_train = data_dict['yeast']['y_train'].toarray()
X_test = data_dict['yeast']['X_test'].toarray()
y_test = data_dict['yeast']['y_test'].toarray()


# Normalize features to [0, 1] range. The scaler is fitted on the training
# split only and then reused for the test split, so both splits go through the
# same transformation and no test statistics leak into training.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Alias retained in case other cells still reference the original name `X`.
X = X_train_scaled

# Initialize the Binary Relevance classifier with Random Forest
classifier = BinaryRelevance(classifier=RandomForestClassifier(n_estimators=100, random_state=42))

# Train on the normalized features (previously the scaled matrix was computed
# but the model was fitted on the raw features)
classifier.fit(X_train_scaled, y_train)

# Predict on the test set, scaled with the training-set min/max
y_pred = classifier.predict(X_test_scaled)

# Evaluate the classifier
hamming = hamming_loss(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f'Hamming Loss: {hamming}')
print(f'Accuracy: {accuracy}')

In [None]:
# Score the predictions against the held-out labels
hamming = hamming_loss(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Emit both metrics, one per line
print(f'Hamming Loss: {hamming}', f'Accuracy: {accuracy}', sep='\n')

In [None]:
# Oversample the training split with mmo(); only the resampled features and
# labels are kept, the selection list and per-sample costs are discarded.
X_oversample, y_oversample, _, _ = mmo(X=X_train, y=y_train)