In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo

In [2]:
# Repository ids handed to fetch_ucirepo, keyed by the short name this
# notebook uses for each dataset (the short name also becomes the output
# file name). Insertion order is the order in which datasets are processed.
dataset_ids = dict(
  yeast=110,
  banknote=267,
  beans=602,
  rice=545,
  raisins=850,
  telescope=159,
  wine_quality=186,
  abalone=1,
  maintenance=601,
  obesity=544,
)

In [3]:
datasets = {}   # short name -> feature table
variables = {}  # short name -> per-variable metadata table
targets = {}    # short name -> target table

# Fetch every dataset once and keep its three parts under the short name.
for dataset_name, dataset_id in dataset_ids.items():
  repo = fetch_ucirepo(id=dataset_id)
  datasets[dataset_name] = repo.data.features
  variables[dataset_name] = repo.variables
  targets[dataset_name] = repo.data.targets

# Keep only continuous feature columns: every variable whose metadata says it
# is not of type 'Continuous', or whose role is not 'Feature', is removed from
# the feature table.
for dataset_name, metadata in variables.items():
  unwanted = (metadata['type'] != 'Continuous') | (metadata['role'] != 'Feature')
  to_drop = metadata.loc[unwanted, 'name'].to_list()

  # errors='ignore' skips names that are listed in the metadata but are not
  # columns of the feature table, instead of raising KeyError. One drop call
  # also avoids rebuilding the frame once per column.
  datasets[dataset_name] = datasets[dataset_name].drop(columns=to_drop, errors='ignore')

In [4]:
code_map = {}  # short name -> {integer code: original target value}

# Encode each dataset's target as integer category codes, using only the
# first column of the target table.
# NOTE: this overwrites the entries of `targets` in place with the code
# arrays, so the cell cannot be re-run on its own: the second time the entries
# are no longer tables and `.iloc` is unavailable. Re-run the fetch cell first.
for dataset_name in list(targets):
  first_target_column = targets[dataset_name].iloc[:, 0]
  cat = pd.Categorical(first_target_column)
  # Reverse lookup from integer code back to the original target value.
  code_map[dataset_name] = dict(enumerate(cat.categories))
  targets[dataset_name] = cat.codes

In [5]:
OUTPUT_DIR = Path('real-instances')  # one '<dataset_name>.dat' file is written here per dataset
SAMPLE_SIZE = 700                    # maximum number of rows exported per dataset
SEED = 42                            # fixed seed so the exported sample is reproducible

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before writing anything.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

for dataset_name, data in datasets.items():
  labels = targets[dataset_name]
  # Number of distinct label codes in the whole dataset (not just the sample).
  centers = np.unique(labels).size

  # `labels` is a plain positional array, so give the rows a 0..n-1 index
  # before sampling; the index yielded by iterrows() is then the row position.
  # The sample size is capped so tables with fewer than SAMPLE_SIZE rows are
  # exported whole instead of raising ValueError.
  sampled = data.reset_index(drop=True).sample(min(SAMPLE_SIZE, len(data)), random_state=SEED)

  with open(OUTPUT_DIR / f'{dataset_name}.dat', 'w') as f:
    f.write(f"CENTERS:{centers}\n")
    # Header fields left out on purpose (original author's notes, translated):
    # f.write(f"COV:{args.cov_list}\n") # not sure what to do with this here
    # f.write(f"POINTS_PER_CENTER:{0}\n") # probably does not apply here
    # f.write(f"CENTERS_COORDS:{args.centers}\n") # not known a priori for real data
    f.write(f"LABEL_DICT:{str(code_map[dataset_name]).replace(':', '=')}\n")
    f.write("POINTS/LABELS: \n")

    # One line per sampled row: feature values, then the label code, joined by ';'.
    for index, row in sampled.iterrows():
      f.write(';'.join([str(x) for x in row]) + ';' + str(labels[index]) + '\n')