In [1]:
import openml

import sklearn
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import ConvergenceWarning

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings

In [2]:
# Silence scikit-learn's ConvergenceWarning category in this kernel; every
# other warning category stays visible.
# NOTE(review): the captured output of the grid-search cell still shows
# "ITERATIONS REACHED LIMIT" messages, so this filter presumably does not reach
# the worker processes started by n_jobs=-1 -- confirm.
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

In [3]:
# Candidates noted earlier but not part of the current run. Each entry reads
# "<id> -> <name> <id>"; presumably a task ID followed by the dataset name and
# dataset ID -- TODO confirm.
#   47   -> tae        48
#   2079 -> eucalyptus 188
#   3902 -> pc4        1049
#   3561 -> profb      470

# OpenML dataset IDs that every loop in this notebook iterates over.
# Names are taken from the metadata printed by the summary cell.
dataset_ids = [
    1483,  # ldpa
    180,   # covertype
    1459,  # artificial-characters
    1509,  # walking-activity
    4538,
]

# One dict per (dataset, model) pair is appended here by the grid-search cell.
results = list()

In [4]:
def _prefixed(**options):
    """Return ``options`` with every key rewritten as ``model__<key>``.

    The pipelines tuned with these grids name their final estimator step
    ``model``, so GridSearchCV addresses its hyper-parameters through the
    ``model__`` prefix.
    """
    return {f"model__{name}": candidates for name, candidates in options.items()}


# Hyper-parameter search space per model, keyed by the same display names
# that are used for the pipelines in the grid-search cell.
param_grid = {
    "Logistic Regression": _prefixed(
        C=[0.1, 1, 10],
        solver=["lbfgs", "liblinear"],
    ),
    "Random Forest": _prefixed(
        n_estimators=[50, 100, 200],
        max_features=[None, "sqrt", "log2"],
    ),
    "SVM": _prefixed(
        C=[0.1, 1, 10],
        kernel=["linear", "poly", "rbf", "sigmoid"],
    ),
    # The MLP grid is deliberately small: 2 values per option -> 32 candidates.
    "MLP": _prefixed(
        hidden_layer_sizes=[(50,), (100, 100)],
        activation=["tanh", "relu"],
        solver=["sgd", "adam"],
        alpha=[0.0001, 0.05],
        learning_rate_init=[0.001, 0.01],
    ),
}

In [5]:
# Print the OpenML metadata summary (name, version, URLs, size) of every
# benchmark dataset.
for dataset_id in dataset_ids:
    # Only metadata is printed here, so skip the (potentially large) data file:
    # with download_data=False the dataset object is populated from metadata
    # and the data itself is fetched later, when get_data() is first called.
    # Qualities and feature metadata are still requested because the summary
    # reports the number of instances and features.
    dataset = openml.datasets.get_dataset(dataset_id,
                                          download_data=False,
                                          download_qualities=True,
                                          download_features_meta_data=True
                                         )
    print(dataset)

OpenML Dataset
Name..........: ldpa
Version.......: 1
Format........: ARFF
Upload Date...: 2015-05-22 23:34:22
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/1590940/ldpa.arff
OpenML URL....: https://www.openml.org/d/1483
# of features.: 8
# of instances: 164860
OpenML Dataset
Name..........: covertype
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-23 13:14:37
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/3615/covertype.arff
OpenML URL....: https://www.openml.org/d/180
# of features.: 55
# of instances: 110393
OpenML Dataset
Name..........: artificial-characters
Version.......: 1
Format........: ARFF
Upload Date...: 2015-05-21 20:58:53
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/1586212/artificial-characters.arff
OpenML URL....: https://www.openml.org/d/1459
# of features.: 8
# of instances: 10218
OpenML Dataset
Name..........: walking-activity
Version.......: 1
Format...

In [7]:
# Benchmark every model on every dataset.
#
# For each OpenML dataset: hold out 25% of the rows as a test set, grid-search
# each model with 5-fold CV on the remaining 75%, then score the refit best
# estimator once on the held-out rows.
#
# Module-level outputs:
#   results         - list of dicts, one per (dataset, model), holding the best
#                     parameters and the test accuracy.
#   dataset_results - DataFrame of every GridSearchCV.cv_results_ row, tagged
#                     with "Model" and "Dataset" columns. Stays None until the
#                     first dataset has been processed completely.

RANDOM_STATE = 42  # one seed for the split and all stochastic estimators

# Start from clean accumulators so re-running this cell does not duplicate
# the entries of a previous (possibly interrupted) run.
results = []
dataset_results = None
cv_frames = []

for dataset_id in dataset_ids:
    # Load the dataset
    dataset = openml.datasets.get_dataset(dataset_id,
                                          download_data=True,
                                          download_qualities=True,
                                          download_features_meta_data=True
                                         )
    print(dataset.name)

    X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)

    # Create train-test splits
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=RANDOM_STATE
    )

    # Select columns by dtype family rather than by exact dtype. A column that
    # appears in neither list is dropped by the ColumnTransformer (its default
    # remainder is 'drop'), so matching only int64/float64 would silently
    # discard e.g. uint8/int32/float32/bool features.
    numerical_cols = X_train.select_dtypes(include=['number', 'bool']).columns.tolist()
    categorical_cols = X_train.select_dtypes(exclude=['number', 'bool']).columns.tolist()

    # Numerical features: mean-impute, then standardise.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Categorical features: mode-impute, then integer-encode. Categories that
    # were not seen during fit are mapped to -1 instead of raising.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('oe', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    # Combine transformers into a ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Estimators to compare. Those that accept a seed get one so that repeated
    # runs of the cell are comparable.
    estimators = {
        "Logistic Regression": LogisticRegression(random_state=RANDOM_STATE),
        "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
        "SVM": SVC(),
        "MLP": MLPClassifier(random_state=RANDOM_STATE),
    }
    # The final step must be called 'model': param_grid keys use 'model__'.
    models = {
        model_name: Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', estimator)
        ])
        for model_name, estimator in estimators.items()
    }

    # Loop through each model
    for model_name, model in models.items():
        grid_search = GridSearchCV(model,
                                   param_grid=param_grid[model_name],
                                   n_jobs=-1,
                                   cv=5,
                                   scoring='accuracy',
                                   return_train_score=True,
                                   verbose=2
                                  )
        grid_search.fit(X_train, y_train)

        # Keep the full CV table, tagged with where it came from. Frames are
        # collected in a list and concatenated once per dataset instead of
        # growing a DataFrame inside the loop.
        cv_frames.append(
            pd.DataFrame(grid_search.cv_results_).assign(
                Model=model_name,
                Dataset=dataset.name,
            )
        )

        # Score the refit best estimator on the held-out test set.
        y_pred = grid_search.best_estimator_.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Store the results
        results.append({
            'dataset_id': dataset_id,
            'dataset_name': dataset.name,
            'model': model_name,
            'best_params': grid_search.best_params_,
            'accuracy': accuracy
        })

    # Refresh the combined table after every completed dataset so that an
    # interrupted run still leaves the finished datasets available.
    # ignore_index avoids repeating the per-grid 0..n-1 row labels.
    dataset_results = pd.concat(cv_frames, axis=0, ignore_index=True)

ldpa
Fitting 5 folds for each of 6 candidates, totalling 30 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END ..................model__C=0.1, model__solver=lbfgs; total time=  10.6s
[CV] END ..............model__C=0.1, model__solver=liblinear; total time=  11.4s
[CV] END ................model__C=1, model__solver=liblinear; total time=  11.3s
[CV] END ...............model__C=10, model__solver=liblinear; total time=   8.1s
[CV] END ..model__max_features=None, model__n_estimators=100; total time= 1.6min




[CV] END ..................model__C=0.1, model__solver=lbfgs; total time=  11.2s
[CV] END ....................model__C=1, model__solver=lbfgs; total time=  10.9s
[CV] END ................model__C=1, model__solver=liblinear; total time=  11.3s
[CV] END ...............model__C=10, model__solver=liblinear; total time=   8.1s
[CV] END ..model__max_features=None, model__n_estimators=100; total time= 1.6min
[CV] END ..............model__C=0.1, model__solver=liblinear; total time=  11.6s
[CV] END ....................model__C=1, model__solver=lbfgs; total time=  10.6s
[CV] END ................model__C=1, model__solver=liblinear; total time=  11.3s
[CV] END ..model__max_features=None, model__n_estimators=100; total time= 1.7min
[CV] END ..................model__C=0.1, model__solver=lbfgs; total time=  10.8s
[CV] END ..............model__C=0.1, model__solver=liblinear; total time=  11.3s
[CV] END ...................model__C=10, model__solver=lbfgs; total time=  10.8s
[CV] END ...............mode

KeyboardInterrupt: 

In [None]:
# Display the combined GridSearchCV cv_results_ table assembled by the
# benchmark loop. The value is still None if no dataset finished there
# (for example after an interrupted run).
dataset_results

In [None]:
# Print a short text report for every (dataset, model) entry collected in
# `results`; entries are separated by a blank line.
for result in results:
    report_lines = [
        f"Dataset ID: {result['dataset_id']} - {result['dataset_name']}",
        f"Model: {result['model']}",
        f"Best Params: {result['best_params']}",
        f"Accuracy: {result['accuracy']:.2f}",
        "",  # trailing empty item -> the report ends with an extra newline
    ]
    print("\n".join(report_lines))

In [None]:
# Feature-importance exploration.
#
# For every dataset: fit a RandomForest (seeded) on a 75/25 split, report its
# test accuracy, plot the ten largest impurity-based feature importances, and
# scatter the test rows along the two most important preprocessed features,
# one colour per class.
for dataset_id in dataset_ids:
    dataset = openml.datasets.get_dataset(dataset_id,
                                          download_data=True,
                                          download_qualities=True,
                                          download_features_meta_data=True
                                         )
    print(dataset.name)

    X, y, categorical_indicator, attribute_names = dataset.get_data(target=dataset.default_target_attribute)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # Select columns by dtype family rather than by exact dtype. A column that
    # appears in neither list is dropped by the ColumnTransformer (its default
    # remainder is 'drop'), so matching only int64/float64 would silently
    # discard e.g. uint8/int32/float32/bool features.
    numerical_cols = X_train.select_dtypes(include=['number', 'bool']).columns.tolist()
    categorical_cols = X_train.select_dtypes(exclude=['number', 'bool']).columns.tolist()

    # Preprocessing for numerical data: mean-impute, then standardise.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())])

    # Preprocessing for categorical data: mode-impute, then integer-encode
    # (categories unseen during fit become -1).
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('oe', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)])

    # Define and fit the model
    model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', RandomForestClassifier(random_state=42))])
    model.fit(X_train, y_train)

    # Report the held-out accuracy (previously computed but never shown).
    accuracy = accuracy_score(y_test, model.predict(X_test))
    print(f"Test accuracy: {accuracy:.3f}")

    # The classifier is always a RandomForestClassifier, which exposes
    # feature_importances_ after fitting, so no coef_ fallback is needed.
    importances = model.named_steps['classifier'].feature_importances_

    # Every transformer here maps one input column to one output column, and
    # the ColumnTransformer emits the 'num' block before the 'cat' block, so
    # this list lines up with the importances.
    feature_names = numerical_cols + categorical_cols

    # Ten largest importances, in descending order.
    forest_importances = (
        pd.Series(importances, index=feature_names)
        .sort_values(ascending=False)
        .head(10)
    )

    # Bar chart of the top importances
    fig, ax = plt.subplots(figsize=(10, 6))
    forest_importances.plot.bar(ax=ax)
    ax.set_title("Top 10 Feature Importances Using Random Forest")
    ax.set_ylabel("Mean decrease in impurity")
    ax.set_xlabel("Features")
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

    # Names of the two most important features
    top_two_features = forest_importances.nlargest(2).index.tolist()
    if len(top_two_features) < 2:
        # A 2-D scatter plot needs two features; nothing more to draw.
        print(f"Dataset {dataset_id} has fewer than two features; skipping scatter plot.")
        continue
    x_feature, y_feature = top_two_features

    # Preprocessed test data (scaled / ordinal-encoded) plus the true labels
    transformed_data = pd.DataFrame(model.named_steps['preprocessor'].transform(X_test),
                                    columns=feature_names)
    transformed_data['target'] = y_test.to_numpy()

    # Scatter plot, one colour per class
    fig, ax = plt.subplots(figsize=(10, 6))
    for class_value in np.unique(transformed_data['target']):
        plt_data = transformed_data[transformed_data['target'] == class_value]
        ax.scatter(plt_data[x_feature], plt_data[y_feature],
                   alpha=0.5, label=f'Class {class_value}')

    ax.set_xlabel(x_feature)
    ax.set_ylabel(y_feature)
    ax.set_title(f"Scatter Plot of Top Two Features for Dataset {dataset_id}")
    ax.legend()
    plt.show()