## Kod do trenowania wszystkich zbiorów

In [1]:
import os
import pandas as pd
import numpy as np
from autoPyTorch.api.tabular_classification import TabularClassificationTask
from sklearn.metrics import roc_auc_score

import tempfile as tmp
import warnings

# Process-wide configuration applied before any training starts:
# JOBLIB_TEMP_FOLDER is set to the system temporary directory, and the
# OpenMP / OpenBLAS / MKL thread-count variables are each set to '1'
# (presumably to keep the native numerical libraries single-threaded).
os.environ.update({
    'JOBLIB_TEMP_FOLDER': tmp.gettempdir(),
    'OMP_NUM_THREADS': '1',
    'OPENBLAS_NUM_THREADS': '1',
    'MKL_NUM_THREADS': '1',
})

# Silence UserWarning and FutureWarning messages for the rest of the session.
warnings.simplefilter('ignore', UserWarning)
warnings.simplefilter('ignore', FutureWarning)

In [2]:
def pipeline(X, y):
    """Convert object-typed features and target to the pandas 'category' dtype.

    Args:
        X: Feature DataFrame. It is not modified; a converted copy is returned.
        y: Target Series.

    Returns:
        A tuple ``(X, y)`` in which every column of ``X`` whose dtype was
        'object' has dtype 'category', and ``y`` has dtype 'category' if its
        dtype was 'object'. All other dtypes are left unchanged.
    """
    # Collect the conversions first and apply them in one astype call, so the
    # caller's frame is never written to (no in-place mutation of the input).
    categorical_casts = {
        feature: 'category'
        for feature in X.columns
        if X[feature].dtype == 'object'
    }
    X = X.astype(categorical_casts)

    # The target is handled once, independently of the feature columns, so it
    # is converted even when X has no columns at all.
    if y.dtype == 'object':
        y = y.astype('category')
    return X, y

In [3]:
def run(dataframe, target_name, fold_train, fold_test):
    """Fit an Auto-PyTorch tabular classifier on one fold and score it.

    Args:
        dataframe: Full dataset, including the target column.
        target_name: Name of the target column in ``dataframe``.
        fold_train: Index labels (used with ``.loc``) of the training rows.
        fold_test: Index labels (used with ``.loc``) of the test rows.

    Returns:
        A tuple ``(api, score)``: the searched ``TabularClassificationTask``
        and the ROC AUC computed on the test rows.
    """
    train_rows = dataframe.loc[fold_train]
    test_rows = dataframe.loc[fold_test]

    # Split each fold into features / target and cast object columns to
    # 'category' via pipeline().
    X_train, y_train = pipeline(
        train_rows.drop(columns=target_name), train_rows[target_name]
    )
    X_test, y_test = pipeline(
        test_rows.drop(columns=target_name), test_rows[target_name]
    )

    search_settings = {
        'optimize_metric': 'roc_auc',
        'total_walltime_limit': 360,
        'func_eval_time_limit_secs': 40,
        'memory_limit': None,
    }
    task = TabularClassificationTask()
    task.search(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        **search_settings,
    )

    # Only the second probability column is scored.
    # NOTE(review): this assumes a binary target whose positive class is in
    # column 1 - confirm for every dataset that is run.
    probabilities = task.predict_proba(X_test)
    auc = roc_auc_score(y_test, probabilities[:, 1])

    return task, auc

In [4]:
# List the entries of 'datasets/'; each entry name is later used as the
# folder of one dataset.
data_folders = os.listdir('datasets/')
# Bare expression: shows the list as the notebook cell output.
data_folders

['168868',
 '168912',
 '31',
 '3917',
 '7592',
 '146818',
 '168908',
 '168911',
 '189354',
 '168337',
 '3945',
 '168335',
 '9977',
 '189356',
 '168338',
 '9952',
 '167120',
 '10101',
 '14965',
 '34539',
 '146606',
 '3']

In [7]:
# Optionally restrict the run to only a few datasets, e.g.:
# data_folders = ['168868']

# Create the output directory up front, so that a missing 'scores/' folder
# cannot make the final to_csv fail after all folds have been trained.
os.makedirs('scores', exist_ok=True)

for file_name in data_folders:
    print('File:', file_name)
    data_folder = os.path.join('datasets', str(file_name))
    dataset = pd.read_csv(os.path.join(data_folder, 'dataset.csv'), index_col=0)
    with open(os.path.join(data_folder, 'target_name.txt')) as f:
        # readline() keeps the line terminator; strip it so the name matches
        # the column label even when the file ends with a newline.
        target_name = f.readline().strip()
    scores = []
    models = []
    # Each dataset folder is read as 10 folds, stored as
    # fold_<j>_train.txt / fold_<j>_test.txt files of integer row labels.
    for j in range(10):
        print('Fold:', j)
        fold_train = np.loadtxt(os.path.join(data_folder, f'fold_{j}_train.txt'), dtype='int')
        fold_test = np.loadtxt(os.path.join(data_folder, f'fold_{j}_test.txt'), dtype='int')
        api, score = run(dataset, target_name, fold_train, fold_test)
        print(score)
        scores.append(score)
        models.append(api.show_models())
    print('scores:', scores)
    # One row per fold: the test ROC AUC and the description of the fitted models.
    output = pd.DataFrame({'score': scores, 'models': models})
    # str() mirrors the input path handling, so non-string dataset ids also work.
    output.to_csv(os.path.join('scores', str(file_name) + '.csv'), index=False)
        