In [None]:
!pip install fedot

In [2]:
import fedot
from fedot.api.main import Fedot

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import time
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

SEED = 2022

from IPython.display import clear_output

# Folder that holds the dataset CSV files (presumably a Colab Google Drive mount).
data_path = '/content/drive/MyDrive/Datasets/diploma/datasets/'
# `path` and `filelist` are overwritten for every directory os.walk visits, so
# after the loop they describe the LAST directory yielded. If `data_path` does
# not exist the loop body never runs and neither name is defined.
# NOTE(review): later cells address datasets by hard-coded position in
# `filelist`, but os.walk returns names in arbitrary (filesystem) order —
# confirm the ordering is stable, or sort it, before comparing with saved results.
for dirpath, dirnames, filenames in os.walk(data_path):
    path = dirpath
    filelist = filenames

In [4]:
len(filelist)

147

In [None]:
# Partial result tables, one CSV per batch of datasets (presumably the
# checkpoints written by the FEDOT search loop below).
data1 = pd.read_csv('/content/fedot_metabase_3_0-14.csv')
data2 = pd.read_csv('/content/fedot_metabase_3_15-28.csv')
data3 = pd.read_csv('/content/fedot_metabase_3_29-46.csv')
data4 = pd.read_csv('/content/fedot_metabase_3_46-64.csv')

# Stack the batches row-wise with a fresh 0..n-1 index. `axis` is passed by
# keyword because pandas 2.0 made every pd.concat argument after `objs`
# keyword-only.
full = pd.concat([data1, data2, data3, data4], axis=0, ignore_index=True)
# full = full[full['0']>0.6]

# full.to_csv('meta_base_v1.csv', index=False)

In [None]:
# Run the FEDOT search on datasets 55..64 and record, for each one, the weighted
# F1 score on a 20% hold-out split and the string form of the best model found.
score_list = [[],[]]  # score_list[0]: F1 scores, score_list[1]: best-model strings
failed_runs = []      # (dataset index, error repr) for every run that raised
# Iterate over dataset indices directly; the file is always looked up as filelist[i].
for i in range(55, 65):
  # Datasets excluded from the search (only 58 falls inside this range).
  if i in [44,51,52,58]:
    continue
  clear_output(wait=True)
  print(str(i)+'/'+str(len(filelist)))

  data = pd.read_csv(os.path.join(data_path, filelist[i]))
  # Files saved together with their index expose it as 'Unnamed: 0';
  # re-read those using the first column as the index.
  if data.columns[0] == 'Unnamed: 0':
      data = pd.read_csv(os.path.join(data_path, filelist[i]), index_col=0)

  try:
    X, y = data.drop(['target'], axis=1), data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=SEED)

    model = Fedot(problem='classification',seed=SEED,composer_params={
            'max_depth': 5,
            'max_arity': 5,
            'pop_size': 20,
            'num_of_generations': 20,
            'timeout': 3,
            'with_tuning': True,
            'preset': 'best_quality',
            'genetic_scheme': None,
            'history_folder': None,
            'stopping_after_n_generation': 10,
            'cv_folds': 3,
            'problem': 'classification',
            'available_operations': ['bernb', 'catboost', 'dt', 'knn', 'lda', 'lgbm', 'logit', 'mlp', 'qda', 'rf',
                                  'xgboost', 'scaling', 'normalization', 'simple_imputation', 'pca', 'kernel_pca',
                                  'poly_features', 'one_hot_encoding', 'rfe_lin_class', 'rfe_non_lin_class',
                                  'resample']})
    model.fit(features=X_train.values, target=y_train.values)
    prediction = model.predict(features=X_test.values)

    score_list[0].append(f1_score(y_test, prediction, average='weighted'))
    score_list[1].append(str(model.best_models[0]))
  except Exception as exc:
    # Best effort: a failing dataset gets a placeholder row so the run continues.
    # Catching Exception (not a bare except) keeps KeyboardInterrupt able to
    # stop this long-running loop, and the error is remembered for inspection.
    failed_runs.append((i, repr(exc)))
    score_list[0].append(0)
    score_list[1].append('None')
  clear_output(wait=False)
  # Checkpoint after every dataset so an interrupted session loses nothing.
  pd.DataFrame(score_list).T.to_csv("fedot_metabase_3_55-"+str(i)+".csv", index=False)

# Surface the swallowed errors once the progress output is no longer cleared.
if failed_runs:
  print('Failed runs:', failed_runs)

In [None]:
# score_df = pd.DataFrame(score_list).T
# score_df.to_csv("fedot_metabase_1_40-60.csv", index=False)

In [None]:
# data1 = pd.read_csv('/content/fedot_metabase_2_0-15.csv')
# data2 = pd.read_csv('/content/fedot_metabase_2_20-40.csv')
# data3 = pd.read_csv('/content/fedot_metabase_2_40-60.csv')
# data = pd.concat([data1,data2,data3], axis=0, ignore_index=True)

In [None]:
# Collect file names and positions for the first 60 entries of `filelist`,
# leaving out positions 19, 44 and 58.
excluded_positions = {19, 44, 58}
name_df, idx = [], []
for i in range(len(filelist)):
  if i >= 60:
    break
  if i not in excluded_positions:
    idx.append(i)
    name_df.append(filelist[i])

In [None]:
# Attach dataset names and original file positions to the score table.
# NOTE(review): the only live assignment to `data` above is the per-dataset frame
# read inside the search loop; the score table this cell expects seems to come
# from the commented-out concat cell — confirm before re-running.
data['dataset_name'] = name_df
data['idx'] = idx

# Keep runs whose score (column '0') is above 0.8, remove the rows labelled 17
# and 30, and renumber the index. `axis` is passed by keyword because pandas 2.0
# no longer accepts it positionally in DataFrame.drop.
metabase = data[data['0']>0.8]
metabase = metabase.drop([17,30], axis=0).reset_index(drop=True)

In [None]:
# File names of the first 65 entries of `filelist`, without positions
# 44, 51, 52 and 58.
skipped_positions = {44, 51, 52, 58}
name_list = []
for i, filename in enumerate(filelist[:65]):
  if i not in skipped_positions:
    name_list.append(filename)

In [None]:
# Remove the row labelled 46 (presumably the duplicate produced by the
# overlapping "29-46" and "46-64" batches — TODO confirm).
# Not idempotent: re-running this cell raises KeyError because the label is gone.
full = full.drop(46)

In [None]:
len(name_list)

61

In [None]:
# Attach the dataset file names; requires len(name_list) == len(full).
full['ds_name'] = name_list

In [None]:
# Persist the score table (scores, best-model strings, dataset names) to the
# current working directory, without the index column.
full.to_csv('meta_base_v1.csv', index=False)

## Generating meta-features

In [None]:
!pip install pymfe

In [None]:
import pymfe
from pymfe.mfe import MFE
from pymfe.concept import MFEConcept
from pymfe.complexity import MFEComplexity
from pymfe.general import MFEGeneral
from pymfe.statistical import MFEStatistical
from pymfe.landmarking import MFELandmarking
from pymfe.model_based import MFEModelBased
from pymfe.info_theory import MFEInfoTheory
from pymfe.clustering import MFEClustering

In [None]:
# Reload the score table from disk.
# NOTE(review): the table was written with a relative path ('meta_base_v1.csv');
# this absolute path matches only if the working directory is /content.
full = pd.read_csv('/content/meta_base_v1.csv')

In [None]:
# Meta-feature matrix: one row per meta-feature (24), one column per dataset
# position. It starts as NaN (not np.empty) so that datasets which are not
# processed in this run, or whose extraction fails part-way, are saved as
# missing values instead of uninitialised memory.
metafeatures = np.full((24,61), np.nan)
for g in range(58,61):
  print(g)
  # if g in [44,48,52,58]:
  #   pd.DataFrame(metafeatures.T).to_csv('mf-58-'+str(g)+'.csv', index=False)
  #   continue
  temp_df = pd.read_csv(os.path.join(data_path, filelist[g]))
  # Files saved together with their index expose it as 'Unnamed: 0';
  # re-read those using the first column as the index.
  if temp_df.columns[0] == 'Unnamed: 0':
      temp_df = pd.read_csv(os.path.join(data_path, filelist[g]), index_col=0)
  # `columns=` replaces the positional axis argument that pandas 2.0 removed.
  X, y = temp_df.drop(columns=['target']).values, temp_df.target

  i = g  # column of `metafeatures` that belongs to this dataset
  try:
    metafeatures[0,i] = np.std(MFEConcept.ft_conceptvar(X,y),ddof=1)
    metafeatures[1,i] = MFEComplexity.ft_lsc(X,y)
    metafeatures[2,i] = np.std(MFEGeneral.ft_freq_class(X,y),ddof=1)
    metafeatures[3,i] = np.mean(MFEComplexity.ft_n3(X,y.values))
    metafeatures[4,i] = MFEStatistical.ft_nr_cor_attr(X)
    metafeatures[5,i] = np.mean(MFEComplexity.ft_f1(X,y))
    metafeatures[6,i] = MFEComplexity.ft_c2(X,y)
    metafeatures[7,i] = np.mean(MFEComplexity.ft_f4(X,y))
    metafeatures[8,i] = MFEComplexity.ft_n1(X,y.values)
    metafeatures[9,i] = np.mean(MFEComplexity.ft_l1(X,y))
    metafeatures[10,i] = np.mean(MFELandmarking.ft_best_node(X,y,score=pymfe.scoring.accuracy))
    metafeatures[11,i] = np.mean(MFELandmarking.ft_linear_discr(X,y,score=pymfe.scoring.accuracy))
    # NOTE(review): position 7 of the extracted model-based values is assumed to
    # be the leaves-per-class mean (see the column names assigned later) — confirm.
    metafeatures[12,i] = MFE(groups=["model-based"]).fit(X,y.values).extract()[1][7]
    metafeatures[13,i] = MFEGeneral.ft_nr_class(X,y)
    metafeatures[14,i] = np.mean(MFEGeneral.ft_freq_class(X,y))
    metafeatures[15,i] = np.mean(MFELandmarking.ft_elite_nn(X,y,score=pymfe.scoring.accuracy))
    metafeatures[16,i] = np.mean(MFEConcept.ft_conceptvar(X,y))
    metafeatures[17,i] = np.mean(MFEComplexity.ft_l2(X,y))
    metafeatures[18,i] = np.mean(MFEComplexity.ft_f1v(X,y))
    metafeatures[19,i] = MFEClustering.ft_nre(X,y)
    metafeatures[20,i] = np.mean(MFELandmarking.ft_random_node(X,y,score=pymfe.scoring.accuracy))
    metafeatures[21,i] = np.mean(MFELandmarking.ft_worst_node(X,y,score=pymfe.scoring.accuracy))
    metafeatures[22,i] = np.mean(MFEComplexity.ft_l3(X,y))
    metafeatures[23,i] = np.mean(np.log1p(MFEInfoTheory.ft_class_ent(X,y)))
  except Exception as exc:
    # Best effort: keep whatever was computed for this dataset and leave the
    # remaining entries NaN, but report the failure instead of hiding it.
    # Catching Exception (not a bare except) keeps KeyboardInterrupt working.
    print('meta-feature extraction failed for dataset', g, ':', repr(exc))
  # Checkpoint the whole (datasets x meta-features) table after every dataset.
  pd.DataFrame(metafeatures.T).to_csv('mf-58-'+str(i)+'.csv', index=False)

In [None]:
full.drop([19,29,44,48,52,58]).shape

(55, 3)

In [None]:
# Assemble the meta-feature table from the per-batch checkpoint files: each file
# holds the full table, and only the rows computed in that batch are sliced out.
# NOTE(review): the slices are end-exclusive, so rows 14, 18, 28, 43, 51, 57 and
# 60 are not taken from any file even though the file names suggest inclusive
# ranges; the result has 54 rows. Confirm this is intended.
metafeats = pd.concat([pd.read_csv('/content/mf-0-14.csv')[:14],pd.read_csv('/content/mf-15-18.csv')[15:18],pd.read_csv('/content/mf-19-28.csv')[19:28],\
                       pd.read_csv('/content/mf-29-43.csv')[29:43],pd.read_csv('/content/mf-44-51.csv')[44:51],pd.read_csv('/content/mf-52-57.csv')[52:57],pd.read_csv('/content/mf-58-60.csv')[58:60]], axis=0, ignore_index=True)

In [None]:
# Names of the 24 meta-features, in the order they were written into the
# meta-feature matrix.
METAFEATURE_COLUMNS = [
    'conceptvar_sd',
    'lsc',
    'freq_class_sd',
    'n3_mean',
    'nr_cor_attr',
    'f1_mean',
    'c2',
    'f4_mean',
    'n1',
    'l1_mean',
    'best_node_mean',
    'linear_discr_mean',
    'leaves_per_class_mean',
    'nr_class',
    'freq_class_mean',
    'elite_nn_mean',
    'conceptvar_mean',
    'l2_mean',
    'f1v_mean',
    'nre',
    'random_node_mean',
    'worst_node_mean',
    'l3_mean',
    'class_ent',
]
metafeats.columns = METAFEATURE_COLUMNS

In [None]:
# Join scores and meta-features side by side (both re-indexed 0..n-1 so rows pair
# up by position), then remove the rows labelled 47, 44, 40, 26, 17 and 54.
# `axis` is passed by keyword because pandas 2.0 made it keyword-only in pd.concat.
# NOTE(review): the two inputs must have the same number of rows for the pairing
# to be meaningful; a shorter input leaves NaN-filled rows — confirm the lengths.
metabase = pd.concat(
    [full.drop([19,29,44,48,52,58]).reset_index(drop=True), metafeats.reset_index(drop=True)],
    axis=1,
).drop([47,44,40,26,17,54])

In [None]:
#metabase.to_csv('/content/meta_base_v1.csv', index=False)

## Experiments

In [None]:
# Load the meta-base used by the experiments below.
# NOTE(review): the only live write of 'meta_base_v1.csv' above stores the score
# table without meta-features; the write of the joined table is commented out.
# Confirm which version of the file is expected here.
metabase = pd.read_csv('/content/meta_base_v1.csv')

In [None]:
import re

In [None]:
def parse_configs(configs):
  """Turn a stored best-model description string into FEDOT composer params.

  Parameters
  ----------
  configs : str
      Text that contains at least one integer and one bracketed,
      comma-separated list of operation names, e.g.
      "{'depth': 2, 'length': 3, 'nodes': [rf, scaling]}".

  Returns
  -------
  dict
      'max_depth' and 'max_arity' (both taken from the first integer in the
      text) and 'available_operations' (names found inside the brackets;
      an empty list when the brackets are empty).

  Raises
  ------
  IndexError
      If the text contains no integer or no bracketed list.
  """
  params = dict()
  # Match whole integers so that a value such as 12 is not truncated to 1.
  numbers = re.findall(r'[0-9]+', configs)
  params['max_depth'] = int(numbers[0])
  # NOTE(review): max_arity reads the same (first) number as max_depth, exactly
  # as before; if it was meant to come from another field, change the index.
  params['max_arity'] = int(numbers[0])
  # Greedy match from the first '[' to the last ']' on a line.
  operations = re.findall(r'\[.*\]', configs)[0]
  # Strip commas and brackets; split() (rather than split(' ')) drops empty
  # tokens, so "[]" gives [] instead of [''].
  params['available_operations'] = re.sub(r'[,\[\]]', '', operations).split()
  return params
#parse_configs(metabase['1'][0])

In [None]:
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
import pickle

In [None]:
# Store each row's index label as an explicit column; the kNN cell below uses
# the last column of `metabase` as its target.
metabase['idx'] = metabase.index

In [None]:
# Fit the meta-searcher. (The original note read "training the forest", but the
# model is a distance-weighted 3-nearest-neighbours classifier.)
# Features: columns 3 .. second-to-last with NaN replaced by 0; target: the last
# column (presumably the `idx` column added in the previous cell — TODO confirm).
knn = KNeighborsClassifier(n_neighbors=3, weights='distance')
knn.fit(metabase.iloc[:,3:-1].fillna(0), metabase.iloc[:,-1])
# Sanity check on the training rows themselves.
knn.predict(metabase.iloc[:,3:-1].fillna(0))

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 41, 42, 43, 45, 46, 48, 49, 50, 51, 52, 53])

In [None]:
# with open('metasearcher.pickle', 'wb') as f:
#   pickle.dump(knn, f)

In [None]:
def get_metafeats(df):
  """Compute the 24 pymfe meta-features of one classification dataset.

  Parameters
  ----------
  df : pandas.DataFrame
      Dataset with a 'target' column; every other column is used as a feature.

  Returns
  -------
  numpy.ndarray
      Array of shape (1, 24) with the values in this order: conceptvar sd, lsc,
      freq_class sd, n3 mean, nr_cor_attr, f1 mean, c2, f4 mean, n1, l1 mean,
      best_node mean, linear_discr mean, model-based value #7, nr_class,
      freq_class mean, elite_nn mean, conceptvar mean, l2 mean, f1v mean, nre,
      random_node mean, worst_node mean, l3 mean, log1p(class_ent).

  Any exception raised by pymfe propagates to the caller.
  """
  metafeatures = np.empty((24,1))  # every entry is assigned below
  # `columns=` replaces the positional axis argument that pandas 2.0 removed.
  X, y = df.drop(columns=['target']).values, df.target

  metafeatures[0,0] = np.std(MFEConcept.ft_conceptvar(X,y),ddof=1)
  metafeatures[1,0] = MFEComplexity.ft_lsc(X,y)
  metafeatures[2,0] = np.std(MFEGeneral.ft_freq_class(X,y),ddof=1)
  metafeatures[3,0] = np.mean(MFEComplexity.ft_n3(X,y.values))
  metafeatures[4,0] = MFEStatistical.ft_nr_cor_attr(X)
  metafeatures[5,0] = np.mean(MFEComplexity.ft_f1(X,y))
  metafeatures[6,0] = MFEComplexity.ft_c2(X,y)
  metafeatures[7,0] = np.mean(MFEComplexity.ft_f4(X,y))
  metafeatures[8,0] = MFEComplexity.ft_n1(X,y.values)
  metafeatures[9,0] = np.mean(MFEComplexity.ft_l1(X,y))
  metafeatures[10,0] = np.mean(MFELandmarking.ft_best_node(X,y,score=pymfe.scoring.accuracy))
  metafeatures[11,0] = np.mean(MFELandmarking.ft_linear_discr(X,y,score=pymfe.scoring.accuracy))
  # NOTE(review): position 7 of the extracted model-based values is assumed to be
  # the leaves-per-class mean (the column name used for the meta-base) — confirm.
  metafeatures[12,0] = MFE(groups=["model-based"]).fit(X,y.values).extract()[1][7]
  metafeatures[13,0] = MFEGeneral.ft_nr_class(X,y)
  metafeatures[14,0] = np.mean(MFEGeneral.ft_freq_class(X,y))
  metafeatures[15,0] = np.mean(MFELandmarking.ft_elite_nn(X,y,score=pymfe.scoring.accuracy))
  metafeatures[16,0] = np.mean(MFEConcept.ft_conceptvar(X,y))
  metafeatures[17,0] = np.mean(MFEComplexity.ft_l2(X,y))
  metafeatures[18,0] = np.mean(MFEComplexity.ft_f1v(X,y))
  metafeatures[19,0] = MFEClustering.ft_nre(X,y)
  metafeatures[20,0] = np.mean(MFELandmarking.ft_random_node(X,y,score=pymfe.scoring.accuracy))
  metafeatures[21,0] = np.mean(MFELandmarking.ft_worst_node(X,y,score=pymfe.scoring.accuracy))
  metafeatures[22,0] = np.mean(MFEComplexity.ft_l3(X,y))
  metafeatures[23,0] = np.mean(np.log1p(MFEInfoTheory.ft_class_ent(X,y)))
  # Transpose so the result is a single row, as expected by the kNN predictor.
  return metafeatures.T

In [None]:
# Baseline: run FEDOT with default composer settings (only a timeout of 3) on
# datasets 108..114 and record fit time and hold-out weighted F1.
results1 = [[],[],[]]  # [dataset file names, fit durations in seconds, weighted F1 scores]
for i in range(108,115):
  # if i in [73, 109]:
  #   continue
  clear_output(wait=False)
  print(i)
  temp_df = pd.read_csv(os.path.join(data_path, filelist[i]))
  # Files saved together with their index expose it as 'Unnamed: 0';
  # re-read those using the first column as the index.
  if temp_df.columns[0] == 'Unnamed: 0':
      temp_df = pd.read_csv(os.path.join(data_path, filelist[i]), index_col=0)

  X, y = temp_df.drop(['target'], axis=1), temp_df['target']
  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=SEED)

  # NOTE(review): the model seed is the literal 47 while the split uses SEED —
  # confirm the difference is intentional.
  model = Fedot(problem='classification',seed=47,
      composer_params={'timeout':3})
  # Only the fit call is timed.
  start = time.time()
  model.fit(features=X_train.values, target=y_train.values)
  results1[0].append(filelist[i])
  results1[1].append(time.time()-start)

  prediction = model.predict(features=X_test.values)
  results1[2].append(f1_score(y_test, prediction, average='weighted'))
  # Checkpoint after every dataset.
  # NOTE(review): the name has no separator between '108' and the index
  # (e.g. 'default_fedot-108114.csv') — confirm this is the intended file name.
  pd.DataFrame(results1).to_csv('default_fedot-108'+str(i)+'.csv',)

114


Generations:   5%|▌         | 1/20 [01:20<?, ?gen/s]

Hyperparameters optimization start





  0%|          | 4/1000 [11:21<47:09:19, 170.44s/trial, best loss: -0.9108903620329387]
Hyperparameters optimization finished
Return tuned pipeline due to the fact that obtained metric 0.911 equal or bigger than initial (- 5% deviation) 0.885


In [None]:
# Save the baseline results; the frame has three rows (dataset names, fit
# durations, weighted F1 scores) and one column per dataset.
pd.DataFrame(results1).to_csv('notmy.csv', index=False)

In [None]:
#metabase = metabase.set_index('idx')

In [None]:
# Meta-learned run: for datasets 108..114, compute meta-features, let the kNN
# meta-searcher pick a meta-base row, reuse that row's stored configuration
# as FEDOT composer params, and record fit time and hold-out weighted F1.
results = [[],[],[]]  # [dataset file names, fit durations in seconds, weighted F1 scores]
for i in range(108, 115):
  # if i in [51,52, 73, 75,80,82,83,91, 94,99,100,102,108,109]:
  #   continue
  clear_output(wait=False)
  print(i)
  temp_df = pd.read_csv(os.path.join(data_path, filelist[i]))
  # Files saved together with their index expose it as 'Unnamed: 0';
  # re-read those using the first column as the index.
  if temp_df.columns[0] == 'Unnamed: 0':
      temp_df = pd.read_csv(os.path.join(data_path, filelist[i]), index_col=0)

  #try:
  temp_df_metafeats = get_metafeats(temp_df)
  # NaN and +/-inf meta-features are replaced by 0 before querying the kNN model;
  # the prediction is a value of the meta-base target column.
  temp_meta_idx = knn.predict(pd.DataFrame(temp_df_metafeats).fillna(0).replace(-np.inf,0).replace(np.inf,0))[0]
  # Column '1' of the meta-base holds the stored best-model description string.
  temp_params = parse_configs(metabase.loc[temp_meta_idx,'1'])
  temp_params['timeout'] = 3

  X, y = temp_df.drop(['target'], axis=1), temp_df['target']
  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=SEED)

  model = Fedot(problem='classification',seed=47,
      composer_params=temp_params)
  # Only the fit call is timed; meta-feature extraction is not included.
  start = time.time()
  model.fit(features=X_train.values, target=y_train.values)
  results[0].append(filelist[i])
  results[1].append(time.time()-start)

  prediction = model.predict(features=X_test.values)
  results[2].append(f1_score(y_test, prediction, average='weighted'))
  # except:
  #   results[0].append(filelist[i])
  #   results[1].append(0)
  #   results[2].append(0)
  # Checkpoint after every dataset.
  # NOTE(review): the name has no separator between '108' and the index
  # (e.g. 'my_fedot-108114.csv') — confirm this is the intended file name.
  pd.DataFrame(results).to_csv('my_fedot-108'+str(i)+'.csv',)

114


Generations:  10%|█         | 2/20 [01:11<21:20, 71.12s/gen]

Hyperparameters optimization start





 21%|██▏       | 213/1000 [03:00<11:05,  1.18trial/s, best loss: -0.9321730654970525]
Hyperparameters optimization finished
Return tuned pipeline due to the fact that obtained metric 0.932 equal or bigger than initial (- 5% deviation) 0.879


In [None]:
# Save the meta-learned results; the frame has three rows (dataset names, fit
# durations, weighted F1 scores) and one column per dataset.
pd.DataFrame(results).to_csv('my.csv', index=False)

In [None]:
# Operation names split into two groups, plus the planned meta-base layout.
# NOTE(review): none of these names is used in the visible cells.
const_composer_params = [
    'scaling',
    'normalization',
    'simple_imputation',
    'pca',
    'kernel_pca',
    'poly_features',
    'one_hot_encoding',
    'resample',
]
nonconst_composer_params = [
    'bernb',
    'catboost',
    'dt',
    'knn',
    'lda',
    'lgbm',
    'logit',
    'mlp',
    'qda',
    'rf',
    'rfe_lin_class',
    'rfe_non_lin_class',
]
sctructure_meta_base = [
    'dataset_name',
    'score:_',
    'conf',
    'mf:_',
]