In [1]:
import os
import json
import pandas as pd


In [4]:

# Path to the root directory containing the subdirectories with info.json
root_dir = 'data'


def build_entry(info, subdir, root_dir):
    """Summarise one dataset's ``info.json`` as a flat record.

    Parameters
    ----------
    info : dict
        Parsed contents of the dataset's ``info.json``.
    subdir : str
        Directory in which that ``info.json`` was found.
    root_dir : str
        Root directory that was walked to reach ``subdir``.

    Returns
    -------
    dict
        One row for the summary table. Raw fields are copied with
        ``info.get`` (so absent keys become ``None``); ``tot_fea`` and
        ``total_size`` are sums in which an absent or null value counts as 0.
    """
    # Derive the dataset name from the path *relative to the root* so it does
    # not depend on the OS path separator or on how many components
    # ``root_dir`` itself has.
    rel_path = os.path.relpath(subdir, root_dir)
    if rel_path == os.curdir:
        # info.json sits directly in the root: fall back to the root's name.
        name = os.path.basename(os.path.abspath(root_dir))
    else:
        name = rel_path.split(os.sep)[0]

    def count(key):
        # A missing key or an explicit JSON null counts as zero in the sums.
        return info.get(key) or 0

    return {
        'name': name,
        'task_type': info.get('task_type'),
        'n_num_features': info.get('n_num_features'),
        'n_cat_features': info.get('n_cat_features'),
        'tot_fea': count('n_num_features') + count('n_cat_features'),
        'n_classes': info.get('n_classes'),
        'train_size': info.get('train_size'),
        'val_size': info.get('val_size'),
        'test_size': info.get('test_size'),
        'openml_id': info.get('openml_id'),
        'location': subdir,
        'source': info.get('source'),
        # Calculate total size
        'total_size': count('train_size') + count('val_size') + count('test_size'),
    }


def collect_dataset_info(root_dir):
    """Walk ``root_dir`` and build one record per ``info.json`` found.

    Parameters
    ----------
    root_dir : str
        Directory to search recursively.

    Returns
    -------
    list of dict
        Records as produced by ``build_entry``, in ``os.walk`` order.
        Empty when ``root_dir`` does not exist or holds no ``info.json``.
    """
    records = []
    for subdir, _dirs, files in os.walk(root_dir):
        if 'info.json' not in files:
            continue
        # Only the parse needs the file handle; close it before building the row.
        with open(os.path.join(subdir, 'info.json'), 'r', encoding='utf-8') as f:
            info = json.load(f)
        records.append(build_entry(info, subdir, root_dir))
    return records


# List to hold all the info dictionaries
data = collect_dataset_info(root_dir)

# Create a DataFrame
df = pd.DataFrame(data)

# Display the dataframe
print(df)

                                                  name   task_type  \
0                                                 led7  multiclass   
1                                       car-evaluation  multiclass   
2                                 auction_verification  regression   
3                                         ada_agnostic    binclass   
4    GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_...    binclass   
..                                                 ...         ...   
296                                              heloc    binclass   
297                                 KDDCup09_upselling    binclass   
298                        Bank_Customer_Churn_Dataset    binclass   
299                              FOREX_audcad-day-High    binclass   
300        Firm-Teacher_Clave-Direction_Classification  multiclass   

     n_num_features  n_cat_features  tot_fea  n_classes  train_size  val_size  \
0                 0               7        7       10.0        2048       512 

In [7]:
# Datasets used in the experiments, identified by their directory name.
EXPERIMENTAL_DATASETS = [
    "bank",
    "heloc",
    "Long",
    "adult",
    "rl",
    "Telecom_Churn_Dataset",
    "PizzaCutter3",
    "customer_satisfaction_in_airline",
    "dabetes_130-us_hospitals",
    "Cardiovascular-Disease-dataset",
    "mobile_c36_oversampling",
    "electricity",
    "FOREX_audsgd-hour-High",
    "Click_prediction_small",
]

# Select on 'name' rather than 'location': 'location' holds the walked
# directory path (root prefix included), so comparing it against bare dataset
# names selects no rows. The exported columns are unchanged.
experimental_mask = df['name'].isin(EXPERIMENTAL_DATASETS)
experimental_columns = ['location', 'task_type', 'n_num_features', 'total_size']
df.loc[experimental_mask, experimental_columns].to_csv("experimental_ds.csv")

In [5]:
# Write the complete summary table (index column included) to disk.
dataset_info_csv = 'dataset_info.csv'
df.to_csv(dataset_info_csv)

In [30]:
# Map each task type to the list of 'location' values of its datasets.
task_type_dict = {
    task_type: list(locations)
    for task_type, locations in df.groupby('task_type')['location']
}

In [34]:
# Show the task_type -> list of 'location' values mapping built from df.
task_type_dict

{'binclass': ['ada_agnostic',
  'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_50_EDM-2_001',
  'Contaminant-detection-in-packaged-cocoa-hazelnut-spread-jars-using-Microwaves-Sensing-and-Machine-Learning-11.0GHz(Urbinati)',
  'credit-g',
  'pc3',
  'ozone-level-8hr',
  'ada',
  'jasmine',
  'water_quality',
  'bank',
  'Waterstress',
  'spambase',
  'online_shoppers',
  'Wilt',
  'jm1',
  'Amazon_employee_access',
  'kc1',
  'seismic+bumps',
  'naticusdroid+android+permissions+dataset',
  'Fitness_Club_c',
  'FOREX_audjpy-hour-High',
  'company_bankruptcy_prediction',
  'KDD',
  'wine',
  'PieChart3',
  'delta_ailerons',
  'htru',
  'taiwanese_bankruptcy_prediction',
  'law-school-admission-bianry',
  'electricity',
  'MIC',
  'pc1',
  'FOREX_audchf-day-High',
  'Click_prediction_small',
  'Contaminant-detection-in-packaged-cocoa-hazelnut-spread-jars-using-Microwaves-Sensing-and-Machine-Learning-10.5GHz(Urbinati)',
  'Cardiovascular-Disease-dataset',
  'Contaminant-detection-in-package