In [7]:
import re, unicodedata

def sanitize_cols(cols, max_len=128):
    """Turn arbitrary column labels into unique ASCII snake_case names.

    Each label is converted with ``str()``, stripped of accents (NFKD
    decomposition followed by an ASCII encode that drops what is left),
    lower-cased and reduced to the characters ``[a-z0-9_]``. Runs of
    whitespace/hyphens and every other disallowed character become a single
    underscore, and leading/trailing underscores are removed. Empty results
    become ``'col'``; names starting with a digit get a ``'col_'`` prefix.

    Duplicates are resolved with a numeric suffix (``name``, ``name_1``,
    ``name_2``, ...). A suffixed name is never allowed to clash with a name
    that was already emitted, and the base is shortened when needed so the
    suffixed name still fits in ``max_len``.

    Args:
        cols: Iterable of column labels (any objects).
        max_len: Maximum length of each returned name; expected to be a
            positive integer. If it is smaller than the suffix itself,
            uniqueness takes priority over the length limit.

    Returns:
        list[str]: One sanitized, unique name per input label, in input order.
    """
    used = set()       # every name emitted so far
    last_suffix = {}   # truncated base name -> last numeric suffix handed out
    out = []
    for c in cols:
        # Strip accents: decompose, then drop every non-ASCII code point.
        s = unicodedata.normalize('NFKD', str(c)).encode('ascii', 'ignore').decode('ascii')
        s = s.strip().lower()
        s = re.sub(r'[\s\-]+', '_', s)          # whitespace and hyphens -> _
        s = re.sub(r'[^0-9a-z_]', '_', s)       # keep only [a-z0-9_]
        s = re.sub(r'_+', '_', s).strip('_')    # collapse repeated _ and trim the ends
        if not s:
            s = 'col'
        # After strip('_') the name cannot start with '_', so only a leading
        # digit needs the prefix.
        if s[0].isdigit():
            s = f'col_{s}'
        base = s[:max_len]

        # Stable de-duplication: the first occurrence keeps the bare name,
        # later ones get the next free numeric suffix.
        s = base
        k = last_suffix.get(base, 0)
        while s in used:
            k += 1
            suffix = f'_{k}'
            # Shorten the base so that base + suffix still respects max_len.
            s = base[:max(max_len - len(suffix), 0)] + suffix
        last_suffix[base] = k
        used.add(s)
        out.append(s)
    return out

In [None]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
  
# Download the dataset with UCI repository id 94
spambase = fetch_ucirepo(id=94)

# Features: wrap as a DataFrame and normalize the column names
X = pd.DataFrame(spambase.data.features)
X.columns = sanitize_cols(X.columns)

# Targets: same treatment
y = pd.DataFrame(spambase.data.targets)
y.columns = sanitize_cols(y.columns)

Index(['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
       'word_freq_our', 'word_freq_over', 'word_freq_remove',
       'word_freq_internet', 'word_freq_order', 'word_freq_mail',
       'word_freq_receive', 'word_freq_will', 'word_freq_people',
       'word_freq_report', 'word_freq_addresses', 'word_freq_free',
       'word_freq_business', 'word_freq_email', 'word_freq_you',
       'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000',
       'word_freq_money', 'word_freq_hp', 'word_freq_hpl', 'word_freq_george',
       'word_freq_650', 'word_freq_lab', 'word_freq_labs', 'word_freq_telnet',
       'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85',
       'word_freq_technology', 'word_freq_1999', 'word_freq_parts',
       'word_freq_pm', 'word_freq_direct', 'word_freq_cs', 'word_freq_meeting',
       'word_freq_original', 'word_freq_project', 'word_freq_re',
       'word_freq_edu', 'word_freq_table', 'word_freq_conference',


In [9]:
import sklearn
from pathlib import Path
import shutil

# Import the submodule explicitly: a bare `import sklearn` is not guaranteed to
# make `sklearn.model_selection` available on every scikit-learn version.
from sklearn.model_selection import train_test_split

# %% Dataset configuration
DATASET_NAME = 'Spambase'
RANDOM_STATE = 1

# %% Split
# Fixed seed for reproducibility; stratified on the target frame `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE, stratify=y
)

# %% Temporary directories
# Delete this dataset's temporary directory if it already exists.
TMP_ROOT = Path('../results/tmp') / DATASET_NAME
if TMP_ROOT.exists():
    shutil.rmtree(TMP_ROOT)

# Last expression of the cell: shown as the cell output.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3450, 57), (1151, 57), (3450, 1), (1151, 1))

In [None]:
# Test split: features plus the label under a 'target' column, written to CSV
test_df = X_test.assign(target=y_test)
destino = f'../data/{DATASET_NAME}_test.csv'
test_df.to_csv(destino, index=False)

# Train split, written the same way
train_df = X_train.assign(target=y_train)
destino = f'../data/{DATASET_NAME}_train.csv'
train_df.to_csv(destino, index=False)

In [15]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import sklearn
from pathlib import Path
import shutil

# Import the submodule explicitly: a bare `import sklearn` is not guaranteed to
# make `sklearn.model_selection` available on every scikit-learn version.
from sklearn.model_selection import train_test_split

# Download the dataset with UCI repository id 697
dataset = fetch_ucirepo(id=697)

# Features and targets as pandas DataFrames
X = pd.DataFrame(dataset.data.features)
y = pd.DataFrame(dataset.data.targets)

# Normalize the column names of both frames
X.columns = sanitize_cols(X.columns)
y.columns = sanitize_cols(y.columns)

# %% Dataset configuration
DATASET_NAME = 'AcademicSuccess'
RANDOM_STATE = 1

# %% Split
# Fixed seed for reproducibility; stratified on the target frame `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE, stratify=y
)

# %% Temporary directories
# Delete this dataset's temporary directory if it already exists.
TMP_ROOT = Path('../results/tmp') / DATASET_NAME
if TMP_ROOT.exists():
    shutil.rmtree(TMP_ROOT)

# %% Export
# Each split is saved as features plus the label under a 'target' column.
test_df = X_test.copy()
test_df['target'] = y_test
destino = '../data/' + DATASET_NAME + '_test.csv'
test_df.to_csv(destino, index=False)

train_df = X_train.copy()
train_df['target'] = y_train
destino = '../data/' + DATASET_NAME + '_train.csv'
train_df.to_csv(destino, index=False)

# Kept as the last expression so the shapes are displayed as the cell output
# (in the middle of the cell the tuple was evaluated and discarded).
X_train.shape, X_test.shape, y_train.shape, y_test.shape