Prepare meta-features

In [5]:
import pandas as pd
import math

def prepare_metafeatures(json_path="./data/metafeatures.json", csv_path='./data/metafeatures.csv'):
    """Flatten the per-dataset meta-feature JSON into a CSV table.

    Reads ``json_path``, where every record carries a ``name`` and a
    ``metafeatures['train']`` list of ``{feature_name: {'value': ...}}``
    mappings, builds one row per dataset with the meta-features listed
    below, removes the columns whose values all equal the first row, drops
    the two ``Ratio*ToPower`` columns, upper-cases the dataset names, and
    writes the table (indexed by ``DATASET``) to ``csv_path``.  The shape of
    the written table is printed.

    Args:
        json_path: Location of the meta-feature JSON file.
        csv_path: Destination of the resulting CSV file.

    Returns:
        None.  The result is written to ``csv_path``.
    """
    jsons = pd.read_json(json_path)
    metafeatures = ['DefaultAccuracy', 'TotalDistinctClasses', 'UnseenInTrain', 'RatioTrainToPower', 'RatioTestToPower', 
    'RatioTotalToPower', 'RatioUnseenToTest', 'Attributes', 'Distinct labelsets', 'Instances', 'Labels', 'LxIxF', 
    'Ratio of number of instances to the number of attributes', 'Cardinality', 'Density', 'Maximal entropy of labels', 'Mean of entropies of labels', 
    'Minimal entropy of labels', 'Standard deviation of label cardinality', 'CVIR inter class', 'Kurtosis cardinality', 'Max IR inter class', 'Max IR intra class', 
    'Max IR per labelset', 'Mean of IR inter class', 'Mean of IR intra class', 'Mean of IR per labelset', 'Mean of standard deviation of IR intra class', 
    'Proportion of maxim label combination (PMax)', 'Proportion of unique label combination (PUniq)', 'Skewness cardinality', 
    'Average examples per labelset', 'Bound', 'Diversity', 'Number of labelsets up to 10 examples', 'Number of labelsets up to 2 examples', 'Number of labelsets up to 50 examples', 
    'Number of labelsets up to 5 examples', 'Mean examples per labelset', 'Number of unconditionally dependent label pairs by chi-square test', 'Proportion of distinct labelsets', 
    'Ratio of number of labelsets up to 10 examples', 'Ratio of number of labelsets up to 2 examples', 'Ratio of number of labelsets up to 50 examples', 'Ratio of number of labelsets up to 5 examples', 
    'Ratio of unconditionally dependent label pairs by chi-square test', 'SCUMBLE', 'Standard deviation of examples per labelset', 'Number of unique labelsets', 
    'Average absolute correlation between numeric attributes', 'Average gain ratio', 'Number of binary attributes', 'Mean of entropies of nominal attributes', 'Mean of kurtosis', 
    'Mean of mean of numeric attributes', 'Mean of skewness of numeric attributes', 'Mean of standard deviation of numeric attributes', 'Number of nominal attributes', 
    'Number of numeric attributes', 'Proportion of binary attributes', 'Proportion of nominal attributes', 'Proportion of numeric attributes', 'Proportion of numeric attributes with outliers']

    colms = ['DATASET'] + metafeatures

    data = []
    for _, row in jsons.iterrows():
        # Collapse the list of {name: {'value': v}} mappings into name -> v.
        # Only the first entry of each mapping is used.
        dict_m = {}
        for json_element in row['metafeatures']['train']:
            name, payload = next(iter(json_element.items()))
            dict_m[name] = payload['value']

        # A meta-feature that is absent or explicitly null becomes NaN.
        arr = [row['name']]
        for metafeature in metafeatures:
            val = dict_m.get(metafeature)
            arr.append(math.nan if val is None else val)
        data.append(arr)

    df_mf = pd.DataFrame(data=data, columns=colms)

    # Remove constant-value columns: keep a column only if some row differs
    # from the first row.  NaN != NaN evaluates to True, so columns that
    # contain NaN are always kept by this test.
    non_constant = (df_mf != df_mf.iloc[0]).any()
    # Never drop the key column, even when every dataset shares one name
    # (e.g. a single-row table); it is needed for the index below.
    non_constant['DATASET'] = True
    df_mf = df_mf.loc[:, non_constant]

    # These two columns may already be gone if they were constant, so a
    # missing label must not raise.
    df_mf = df_mf.drop(columns=["RatioTrainToPower", "RatioTestToPower"], errors="ignore")
    df_mf['DATASET'] = df_mf['DATASET'].str.upper()

    df_mf.set_index('DATASET', inplace=True)

    df_mf.to_csv(csv_path)
    print(df_mf.shape)

# Build ./data/metafeatures.csv from ./data/metafeatures.json; the function
# prints the shape of the table it writes.
prepare_metafeatures()


(40, 61)


In [None]:
from utils import prepare_metafeatures_and_regression_performance_data
from itertools import combinations
import numpy as np
import os

def _pairwise_label(perf, first, second, lower_is_better):
    """Return a per-dataset 0/1 label comparing two algorithm columns.

    The label is 0 where ``first`` is strictly better than ``second`` under
    the metric direction (smaller when ``lower_is_better``, larger
    otherwise) and 1 in every other case, so ties and missing values map
    to 1.  The result is an integer Series aligned with ``perf.index``.
    """
    if lower_is_better:
        first_is_better = perf[first] < perf[second]
    else:
        first_is_better = perf[first] > perf[second]
    return (~first_is_better).astype(int)


metric, lower_is_better = 'HAMMING LOSS example based', True
# metric, lower_is_better = 'MACRO F1', False
# metric, lower_is_better = 'F1 example based', False
# metric, lower_is_better = 'MICRO F1', False
# metric, lower_is_better = 'AUCROC MICRO', False
top_k = 5
# lower_is_better = False

# Prepare metafeatures (they are always the same across metrics and tasks)
# and the regression performance data.
learning_task = 'regression'
df_x, df_y, algo_portfolio, algo_counts = prepare_metafeatures_and_regression_performance_data(metric, lower_is_better, top_k)

# One output directory per learning task.
for task_dir in (
    "regression",
    "classification",
    "pairwise_regression",
    "pairwise_classification",
    "cost_sensitive_pairwise_classification",
):
    os.makedirs(f"./processed_data/{metric}/{task_dir}/", exist_ok=True)

df_x.to_csv("./processed_data/metafeatures.csv")
df_y.to_csv(f"./processed_data/{metric}/{learning_task}/performance.csv")
np.save(f"./processed_data/{metric}/algo_portfolio.npy", algo_portfolio, allow_pickle=True)
np.save(f"./processed_data/{metric}/algo_counts.npy", algo_counts, allow_pickle=True)

# Based on the regression performance data, construct the targets for the
# other learning tasks.  The table is re-read from disk so every derived
# target is computed from exactly what was stored.
df = pd.read_csv(f"./processed_data/{metric}/{learning_task}/performance.csv", index_col = 0)
original_targets = df.columns
pairwise_combinations = list(combinations(original_targets, 2))

# Pairwise regression: target is the performance difference first - second.
df_PR = pd.DataFrame(
    {f"{pair[0]}_vs_{pair[1]}": df[pair[0]] - df[pair[1]] for pair in pairwise_combinations},
    index=df.index,
)
df_PR.to_csv(f"./processed_data/{metric}/pairwise_regression/performance.csv")

# Pairwise classification: 0 when the first algorithm strictly outperforms
# the second, 1 otherwise.
# NOTE(review): the earlier comments claimed "1 when the first algorithm
# outperforms the second", which is the opposite of what the code writes;
# the encoding is kept as-is here - confirm what downstream consumers expect.
df_PC = pd.DataFrame(
    {
        f"{pair[0]}_vs_{pair[1]}": _pairwise_label(df, pair[0], pair[1], lower_is_better)
        for pair in pairwise_combinations
    },
    index=df.index,
)
df_PC.to_csv(f"./processed_data/{metric}/pairwise_classification/performance.csv")

# Classification: the target is the name of the best algorithm per dataset.
if lower_is_better:
    best_per_dataset = df[original_targets].idxmin(axis=1)
else:
    best_per_dataset = df[original_targets].idxmax(axis=1)
best_algos = best_per_dataset.tolist()
best_per_dataset.to_frame(name='C').to_csv(f"./processed_data/{metric}/classification/performance.csv")

# Cost-sensitive pairwise classification: each pair contributes its 0/1
# label followed by a "_cost" column with the absolute performance gap.
cs_columns = {}
for pair in pairwise_combinations:
    target_name = f"{pair[0]}_vs_{pair[1]}"
    cs_columns[target_name] = df_PC[target_name]
    cs_columns[target_name+"_cost"] = (df[pair[0]] - df[pair[1]]).abs()
df_CS_PC = pd.DataFrame(cs_columns, index=df.index)
df_CS_PC.to_csv(f"./processed_data/{metric}/cost_sensitive_pairwise_classification/performance.csv")

# Leave `df` holding the raw performance columns followed by the
# cost-sensitive targets, in case later cells reference it.
df = pd.concat([df[original_targets], df_CS_PC], axis=1)