In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml
from tqdm import tqdm
!pip install scipy
from scipy.stats import skew, kurtosis # Import skew and kurtosis from scipy.stats





In [None]:
def load_meta_knowledge_base(file_path='meta_knowledge_base.csv'):
    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
        return pd.read_csv(file_path)
    else:
        meta_feature_names = ['n_samples', 'num_features', 'n_classes',
                              'nominal_count', 'numeric_count', 'class_entropy','mean_numeric','stddev_numeric','skewness_numeric','kurtosis_numeric']

        columns = meta_feature_names + ['DecisionTree', 'SVM', 'RandomForest', 'NaiveBayes']
        return pd.DataFrame(columns=columns)

In [None]:
def save_meta_knowledge_base(meta_knowledge_base, file_path='meta_knowledge_base.csv'):
    """
    Saves the meta-knowledge base to a CSV file.

    Args:
        meta_knowledge_base: The meta-knowledge base DataFrame to save.
        file_path: The path to the file where the data will be saved.
            Defaults to 'meta_knowledge_base.csv'.
    """
    try:
        meta_knowledge_base.to_csv(file_path, index=False)
        print(f"Meta-knowledge base saved to {file_path}")
    except Exception as e:
        print(f"Error saving meta-knowledge base: {e}")

In [None]:
def fetch_dataset(dataset_id):
    dataset = fetch_openml(data_id=dataset_id)
    X, y = dataset.data, dataset.target

    # Encode target labels if necessary
    y = encode_labels(y)

    return X, y


def encode_labels(y):
    if y.dtype == 'object' or isinstance(y[0], str):  # If y contains strings
        le = LabelEncoder()
        y = le.fit_transform(y)
    return y





In [None]:
def extract_meta_features(X, y):
    numeric_features = X.select_dtypes(include=[np.number])
    nominal_features = X.select_dtypes(exclude=[np.number])

    meta_features = {
        'n_samples': X.shape[0],
        'n_features': X.shape[1],
        'n_classes': len(np.unique(y)),
        'nominal_count': nominal_features.shape[1],
        'numeric_count': numeric_features.shape[1],
        'class_entropy': entropy(y),
        'mean_numeric': numeric_features.mean().mean() if not numeric_features.empty else 0,
        'stddev_numeric': numeric_features.std().mean() if not numeric_features.empty else 0,
        'skewness_numeric': skew(numeric_features, nan_policy='omit').mean() if not numeric_features.empty else 0,
        'kurtosis_numeric': kurtosis(numeric_features, nan_policy='omit').mean() if not numeric_features.empty else 0,
    }
    return meta_features


# Entropy calculation with label encoding
def entropy(y):
    y = encode_labels(y)
    hist = np.bincount(y)
    ps = hist / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])


In [None]:
# Uncertainty Sampling Method
def uncertainty_sampling(meta_knowledge_base, new_meta_features):
    if meta_knowledge_base.empty:
        print("Meta-knowledge base is empty. Skipping uncertainty sampling.")
        return None

    # Calculate uncertainty for each dataset in the meta-knowledge base
    meta_knowledge_base['uncertainty'] = meta_knowledge_base.apply(
        lambda row: np.abs(row['class_entropy'] - new_meta_features['class_entropy']), axis=1)

    # Select the dataset with the highest uncertainty
    most_uncertain_dataset = meta_knowledge_base.loc[meta_knowledge_base['uncertainty'].idxmax()]
    return most_uncertain_dataset

In [None]:
# Models to evaluate
models = {
    'DecisionTree': DecisionTreeClassifier(),
    'SVM': SVC(),
    'RandomForest': RandomForestClassifier(),
    'NaiveBayes': GaussianNB()
}

def preprocess_data(X):
    X.replace('none', np.nan, inplace=True)

    for col in X.select_dtypes(include=[np.number]).columns:
        X[col].fillna(X[col].mean(), inplace=True)



    X_encoded = pd.get_dummies(X, columns=X.select_dtypes(include=['object']).columns)

    return X_encoded

def evaluate_models(X, y, models):
    results = {}

    X_processed = preprocess_data(X)

    for name, model in models.items():
        # Perform cross-validation
        scores = cross_val_score(model, X_processed, y, cv=5, scoring='accuracy', error_score='raise')
        results[name] = scores.mean()

    return results

def knowledge_acquisition(dataset_ids, meta_knowledge_base):
    for dataset_id in tqdm(dataset_ids, desc="Processing datasets"):  # Wrap dataset_ids with tqdm
        X, y = fetch_dataset(dataset_id)

        # Handle potential errors during feature extraction or model evaluation
        try:
          # Perform uncertainty sampling on current meta-knowledge base
            meta_features = extract_meta_features(X, y)
            most_uncertain_dataset = uncertainty_sampling(meta_knowledge_base, meta_features)
            if most_uncertain_dataset is None:
              print("No uncertain dataset found, evaluating the new dataset directly.")
            else:
              print(f"Most uncertain dataset selected: {most_uncertain_dataset}")


            performance = evaluate_models(X, y, models)
            meta_knowledge_base = pd.concat([meta_knowledge_base, pd.DataFrame([{**meta_features, **performance}])], ignore_index=True)
        except (TypeError, ValueError) as e:
            print(f"Error processing dataset {dataset_id}: {e}")

    return meta_knowledge_base

def success_rate_ratio(meta_knowledge_base):
    algorithms = ['DecisionTree', 'SVM', 'RandomForest', 'NaiveBayes']
    srr = {}

    for algo in algorithms:
        for other_algo in algorithms:
            if algo != other_algo:
                srr[algo] = (1 - meta_knowledge_base[algo].mean()) / (1 - meta_knowledge_base[other_algo].mean())

    # Return sorted SRR values for ranking
    return sorted(srr.items(), key=lambda x: x[1], reverse=True)

In [None]:
def rank_models_srr(X, y, meta_knowledge_base):
    meta_features = extract_meta_features(X, y)
    relevant_dataset = uncertainty_sampling(meta_knowledge_base, meta_features)

    # Calculate SRR for ranking classifiers
    rankings = success_rate_ratio(meta_knowledge_base)

    return rankings


In [None]:
meta_knowledge_base = load_meta_knowledge_base()


def run_iteration(new_dataset_ids,meta_knowledge_base):

    # Perform knowledge acquisition with new datasets
    meta_knowledge_base = knowledge_acquisition(new_dataset_ids, meta_knowledge_base)

    # Save the updated meta-knowledge base
    save_meta_knowledge_base(meta_knowledge_base)

    return meta_knowledge_base



In [None]:
dataset_ids_batch1 = [1504, 1461]
meta_knowledge_base = run_iteration(dataset_ids_batch1 ,meta_knowledge_base)
#steel_platesfault
#bank_marketing

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mean(), inplace=True)


Most uncertain dataset selected: n_samples           214.000000
num_features               NaN
n_classes             6.000000
nominal_count         0.000000
numeric_count         9.000000
class_entropy         2.176534
mean_numeric         11.265852
stddev_numeric        0.689541
skewness_numeric      1.640988
kurtosis_numeric      9.650639
DecisionTree          0.682281
SVM                   0.355150
RandomForest          0.771318
NaiveBayes            0.439646
n_features            9.000000
uncertainty           1.245422
Name: 5, dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mean(), inplace=True)
Processing datasets: 100%|██████████| 2/2 [00:06<00:00,  3.23s/it]

Most uncertain dataset selected: n_samples           214.000000
num_features               NaN
n_classes             6.000000
nominal_count         0.000000
numeric_count         9.000000
class_entropy         2.176534
mean_numeric         11.265852
stddev_numeric        0.689541
skewness_numeric      1.640988
kurtosis_numeric      9.650639
DecisionTree          0.682281
SVM                   0.355150
RandomForest          0.771318
NaiveBayes            0.439646
n_features            9.000000
uncertainty           1.655903
Name: 5, dtype: float64
Error processing dataset 1461: could not convert string to float: 'retired'
Meta-knowledge base saved to meta_knowledge_base.csv





In [None]:
dataset_ids_batch2 = [42, 1063 ,40]
meta_knowledge_base = run_iteration(dataset_ids_batch2, meta_knowledge_base)
#soybean
#softwaredefectprediction
#sonarsignalclassification

  X.replace('none', np.nan, inplace=True)
Processing datasets:  33%|███▎      | 1/3 [00:00<00:00,  9.97it/s]

Most uncertain dataset selected: n_samples            522.000000
num_features                NaN
n_classes              2.000000
nominal_count          0.000000
numeric_count         21.000000
class_entropy          0.731779
mean_numeric         977.893699
stddev_numeric      5875.729588
skewness_numeric       8.927345
kurtosis_numeric     128.854047
DecisionTree           0.735568
SVM                    0.827418
RandomForest           0.800659
NaiveBayes             0.831190
n_features            21.000000
uncertainty            3.103729
Name: 1, dtype: float64
Error processing dataset 42: could not convert string to float: 'october'
Most uncertain dataset selected: n_samples           214.000000
num_features               NaN
n_classes             6.000000
nominal_count         0.000000
numeric_count         9.000000
class_entropy         2.176534
mean_numeric         11.265852
stddev_numeric        0.689541
skewness_numeric      1.640988
kurtosis_numeric      9.650639
DecisionTree  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mean(), inplace=True)


Most uncertain dataset selected: n_samples           214.000000
num_features               NaN
n_classes             6.000000
nominal_count         0.000000
numeric_count         9.000000
class_entropy         2.176534
mean_numeric         11.265852
stddev_numeric        0.689541
skewness_numeric      1.640988
kurtosis_numeric      9.650639
DecisionTree          0.682281
SVM                   0.355150
RandomForest          0.771318
NaiveBayes            0.439646
n_features            9.000000
uncertainty           1.179804
Name: 5, dtype: float64


Processing datasets: 100%|██████████| 3/3 [00:02<00:00,  1.23it/s]

Meta-knowledge base saved to meta_knowledge_base.csv





In [None]:
dataset_ids_batch3 = [1464, 44]
meta_knowledge_base = run_iteration(dataset_ids_batch3, meta_knowledge_base)
#binary classification
# Blood transfusion , email spam

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mean(), inplace=True)


Most uncertain dataset selected: n_samples           214.000000
num_features               NaN
n_classes             6.000000
nominal_count         0.000000
numeric_count         9.000000
class_entropy         2.176534
mean_numeric         11.265852
stddev_numeric        0.689541
skewness_numeric      1.640988
kurtosis_numeric      9.650639
DecisionTree          0.682281
SVM                   0.355150
RandomForest          0.771318
NaiveBayes            0.439646
n_features            9.000000
uncertainty           1.384889
Name: 5, dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mean(), inplace=True)


Most uncertain dataset selected: n_samples           214.000000
num_features               NaN
n_classes             6.000000
nominal_count         0.000000
numeric_count         9.000000
class_entropy         2.176534
mean_numeric         11.265852
stddev_numeric        0.689541
skewness_numeric      1.640988
kurtosis_numeric      9.650639
DecisionTree          0.682281
SVM                   0.355150
RandomForest          0.771318
NaiveBayes            0.439646
n_features            9.000000
uncertainty           1.209174
Name: 5, dtype: float64


Processing datasets: 100%|██████████| 2/2 [00:12<00:00,  6.03s/it]

Meta-knowledge base saved to meta_knowledge_base.csv





In [None]:
dataset_ids_batch4 = [41, 31]
meta_knowledge_base = run_iteration(dataset_ids_batch4, meta_knowledge_base)
#glass
#credit

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mean(), inplace=True)


Most uncertain dataset selected: n_samples            522.000000
num_features                NaN
n_classes              2.000000
nominal_count          0.000000
numeric_count         21.000000
class_entropy          0.731779
mean_numeric         977.893699
stddev_numeric      5875.729588
skewness_numeric       8.927345
kurtosis_numeric     128.854047
DecisionTree           0.735568
SVM                    0.827418
RandomForest           0.800659
NaiveBayes             0.831190
n_features            21.000000
uncertainty            1.444755
Name: 1, dtype: float64


  X.replace('none', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mean(), inplace=True)
Processing datasets: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s]

Most uncertain dataset selected: n_samples           214.000000
num_features               NaN
n_classes             6.000000
nominal_count         0.000000
numeric_count         9.000000
class_entropy         2.176534
mean_numeric         11.265852
stddev_numeric        0.689541
skewness_numeric      1.640988
kurtosis_numeric      9.650639
DecisionTree          0.682281
SVM                   0.355150
RandomForest          0.771318
NaiveBayes            0.439646
n_features            9.000000
uncertainty           1.295243
Name: 5, dtype: float64
Error processing dataset 31: could not convert string to float: 'no checking'
Meta-knowledge base saved to meta_knowledge_base.csv





In [None]:
dataset_ids_batch4 = [40981, 1547]
meta_knowledge_base = run_iteration(dataset_ids_batch4, meta_knowledge_base)
#breastcancer #credit approval

Processing datasets:   0%|          | 0/2 [00:00<?, ?it/s]

Most uncertain dataset selected: n_samples                 214
num_features              NaN
n_classes                   6
nominal_count               0
numeric_count               9
class_entropy        2.176534
mean_numeric        11.265852
stddev_numeric       0.689541
skewness_numeric     1.640988
kurtosis_numeric     9.650639
DecisionTree         0.682281
SVM                   0.35515
RandomForest         0.771318
NaiveBayes           0.439646
n_features                9.0
uncertainty          1.185303
Name: 5, dtype: object


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mean(), inplace=True)
Processing datasets:  50%|█████     | 1/2 [00:01<00:01,  1.70s/it]

Error processing dataset 40981: unsupported operand type(s) for /: 'str' and 'int'
Most uncertain dataset selected: n_samples                 214
num_features              NaN
n_classes                   6
nominal_count               0
numeric_count               9
class_entropy        2.176534
mean_numeric        11.265852
stddev_numeric       0.689541
skewness_numeric     1.640988
kurtosis_numeric     9.650639
DecisionTree         0.682281
SVM                   0.35515
RandomForest         0.771318
NaiveBayes           0.439646
n_features                9.0
uncertainty            1.3513
Name: 5, dtype: object


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mean(), inplace=True)
Processing datasets: 100%|██████████| 2/2 [00:04<00:00,  2.38s/it]

Meta-knowledge base saved to meta_knowledge_base.csv





In [None]:
dataset_ids_batch5 = [554]
meta_knowledge_base = run_iteration(dataset_ids_batch5, meta_knowledge_base)


Processing datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Most uncertain dataset selected: n_samples                   522
num_features                NaN
n_classes                     2
nominal_count                 0
numeric_count                21
class_entropy          0.731779
mean_numeric         977.893699
stddev_numeric      5875.729588
skewness_numeric       8.927345
kurtosis_numeric     128.854047
DecisionTree           0.735568
SVM                    0.827418
RandomForest           0.800659
NaiveBayes              0.83119
n_features                 21.0
uncertainty            2.588058
Name: 1, dtype: object


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mean(), inplace=True)


In [None]:
dataset_ids_batch5 = [61]
meta_knowledge_base = run_iteration(dataset_ids_batch5, meta_knowledge_base)
#iris

Processing datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Most uncertain dataset selected: n_samples            522.000000
num_features                NaN
n_classes              2.000000
nominal_count          0.000000
numeric_count         21.000000
class_entropy          0.731779
mean_numeric         977.893699
stddev_numeric      5875.729588
skewness_numeric       8.927345
kurtosis_numeric     128.854047
DecisionTree           0.735568
SVM                    0.827418
RandomForest           0.800659
NaiveBayes             0.831190
n_features            21.000000
uncertainty            0.853184
Name: 1, dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mean(), inplace=True)
Processing datasets: 100%|██████████| 1/1 [00:01<00:00,  1.10s/it]

Meta-knowledge base saved to meta_knowledge_base.csv





In [None]:
X_new, y_new = fetch_dataset(44270)  # Example dataset for advisory
model_ranking = rank_models_srr(X_new, y_new, meta_knowledge_base)
print("Model Ranking (using SRR):")
for rank, (model, score) in enumerate(model_ranking, start=1):
    print(f"{rank}. {model} - SRR Score: {score:.4f}")
#turing_course_binary_data

Model Ranking (using SRR):
1. NaiveBayes - SRR Score: 1.5291
2. SVM - SRR Score: 1.0546
3. DecisionTree - SRR Score: 0.7993
4. RandomForest - SRR Score: 0.6540


In [None]:
X_new, y_new = fetch_dataset(44270)
extract_meta_features(X_new,y_new)


{'n_samples': 10000,
 'n_features': 52,
 'n_classes': 2,
 'nominal_count': 2,
 'numeric_count': 50,
 'class_entropy': 0.949357838514423,
 'mean_numeric': -0.001682835857856283,
 'stddev_numeric': 0.9996053541243545,
 'skewness_numeric': 0.01023429371538489,
 'kurtosis_numeric': 0.07973497107585815}