In [None]:
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
import pandas as pd
import numpy as np
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from dask.distributed import Client,LocalCluster
from sklearn import preprocessing

# *Airlines

In [None]:
# Load the airlines dataset from a local CSV file and display it.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
path = '//home//maraks//Desktop//datasets//binary//airlines.csv'
data = pd.read_csv(path) 
data

In [None]:
# Parse the numeric columns as numbers and replace every categorical column
# (including the 'Delay' label) with integer codes from a LabelEncoder.
# The frame is modified in place, column by column.
for column in data.columns:
    if column in ('Flight', 'DayOfWeek', 'Time', 'Length'):
        data[column] = pd.to_numeric(data[column])
        continue
    if column not in ('Airline', 'AirportFrom', 'AirportTo', 'Delay'):
        # Columns in neither list are left untouched.
        continue
    data[column] = data[column].astype('category')
    le = preprocessing.LabelEncoder()
    data[column] = le.fit(data[column]).transform(data[column])
data

In [None]:
from dask.distributed import Client,LocalCluster

# Start a local Dask cluster (1 worker, 4 threads, memory_limit='6GB') and
# connect a client to it; the training cell below sets use_dask=True.
cluster = LocalCluster(n_workers=1,
                       threads_per_worker=4,
                       memory_limit='6GB')
client = Client(cluster)

# Echo the client as the cell output.
client

In [None]:
# Features are every column but the last; the last column is the target.
X = data.iloc[:, :-1]
y = data.iloc[:, -1:]

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, test_size=0.25, random_state=42
)
# The target halves come back as one-column frames; flatten them to 1-D arrays.
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

tpot = TPOTClassifier(
    generations=7,
    population_size=15,
    offspring_size=None,
    mutation_rate=0.9,
    crossover_rate=0.1,
    scoring='neg_log_loss',
    config_dict=None,
    cv=5,
    use_dask=True,
    n_jobs=1,
    max_eval_time_mins=60,
    verbosity=2,
    random_state=42,
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Write the best pipeline found as a standalone Python script.
tpot.export('tpot_1.py')

In [None]:
# Shut down the Dask client first, then the local cluster it was attached to.
client.close()
cluster.close()

In [None]:
# Show TPOT's `evaluated_individuals_` record as a DataFrame for inspection.
pd.DataFrame(tpot.evaluated_individuals_)

In [None]:
# -0.611405 logloss
#Best pipeline: RandomForestClassifier(MinMaxScaler(input_matrix), bootstrap=True, criterion=gini, max_features=0.2, min_samples_leaf=8, min_samples_split=4, n_estimators=100)

In [None]:
# Persist the evaluated-pipelines table to '1best.csv' in the working directory.
pd.DataFrame(tpot.evaluated_individuals_).to_csv('1best.csv')

# *Amazon employee access

In [None]:
# Load the Amazon employee access dataset from a local CSV file and display it.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
path = '//home//maraks//Desktop//datasets//binary//Amazon_employee_access.csv'
data = pd.read_csv(path)
data

In [None]:
# Cast the 'target' label to a categorical dtype and parse the listed
# ID / role columns as numbers; any other column is left untouched.
# The frame is modified in place.
for column in data.columns:
    if column == 'target':
        data[column] = data[column].astype('category')
        continue
    if column in ('RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2',
                  'ROLE_DEPTNAME', 'ROLE_TITLE', 'ROLE_FAMILY_DESC',
                  'ROLE_FAMILY', 'ROLE_CODE'):
        data[column] = pd.to_numeric(data[column])


In [None]:
from dask.distributed import Client,LocalCluster


# Start a local Dask cluster (1 worker, 4 threads, memory_limit='6GB') and
# connect a client to it; the training cell below sets use_dask=True.
cluster = LocalCluster(n_workers=1, 
                       threads_per_worker=4,
                       memory_limit='6GB')
client = Client(cluster)

# Alternative setup kept for reference: let Client create its own 4x1 cluster.
#client = Client(n_workers=4, threads_per_worker=1)
client

In [None]:
# Features are every column but the last; the last column is used as the target.
# NOTE(review): assumes the 'target' column is the last one in this CSV — confirm.
X = data.iloc[:, :-1]
y = data.iloc[:, -1:]

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, test_size=0.25, random_state=42
)
# The target halves come back as one-column frames; flatten them to 1-D arrays.
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

tpot = TPOTClassifier(
    generations=7,
    population_size=15,
    offspring_size=None,
    mutation_rate=0.9,
    crossover_rate=0.1,
    scoring='neg_log_loss',
    config_dict=None,
    cv=5,
    use_dask=True,
    n_jobs=1,
    max_eval_time_mins=2,
    verbosity=2,
    random_state=42,
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Write the best pipeline found as a standalone Python script.
tpot.export('tpot_2.py')

In [None]:
# -0.1703658  (score noted from a run; presumably neg log loss — confirm)
# Shut down the Dask client first, then the local cluster it was attached to.
client.close()
cluster.close()

In [None]:
# Show TPOT's `evaluated_individuals_` record as a DataFrame for inspection.
pd.DataFrame(tpot.evaluated_individuals_)

# *blood-transfusion-service-center

In [None]:
# Load the blood-transfusion-service-center dataset from a local CSV file and display it.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
path = '//home//maraks//Desktop//datasets//binary//blood-transfusion-service-center.csv'
data = pd.read_csv(path)
data

In [None]:
# Parse the four feature columns as numbers and cast the 'Class' label to
# int64; any other column is left untouched. The frame is modified in place.
for column in data.columns:
    if column == 'Class':
        data[column] = data[column].astype('int64')
    elif column in ('V1', 'V2', 'V3', 'V4'):
        data[column] = pd.to_numeric(data[column])

In [None]:
#data.rename(columns={'Class': 'class'}, inplace=True)

In [None]:
#data['class'] = data['class'].map({1:-1,2:1})

In [None]:
from dask.distributed import Client,LocalCluster

# Start a local Dask cluster (1 worker, 4 threads, memory_limit='6GB') and
# connect a client to it; the training cell below sets use_dask=True.
cluster = LocalCluster(n_workers=1, 
                       threads_per_worker=4,
                       memory_limit='6GB')
client = Client(cluster)

# Alternative setup kept for reference: let Client create its own 4x1 cluster.
#client = Client(n_workers=4, threads_per_worker=1)
client

In [None]:

# Features are every column but the last; the last column is the target.
X = data.iloc[:, :-1]
y = data.iloc[:, -1:]

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, test_size=0.25, random_state=42
)
# The target halves come back as one-column frames; flatten them to 1-D arrays.
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

tpot = TPOTClassifier(
    generations=7,
    population_size=15,
    offspring_size=None,
    mutation_rate=0.9,
    crossover_rate=0.1,
    scoring='neg_log_loss',
    config_dict=None,
    cv=5,
    use_dask=True,
    n_jobs=1,
    max_eval_time_mins=2,
    verbosity=2,
    random_state=42,
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Write the best pipeline found as a standalone Python script.
tpot.export('tpot_3.py')

#from shutil import rmtree
#rmtree(cachedir)

In [None]:
# -0.469163  (score noted from a run; presumably neg log loss — confirm)
# Shut down the Dask client first, then the local cluster it was attached to.
client.close()
cluster.close()

In [None]:
# Show TPOT's `evaluated_individuals_` record as a DataFrame for inspection.
pd.DataFrame(tpot.evaluated_individuals_) # 6 x 114 (presumably the shape seen in a recorded run)

In [None]:
#tpot.pareto_front_fitted_pipelines_ # verb = 3

In [None]:
# Inspect the `op_list` attribute of the TPOT object.
# NOTE(review): `op_list` is not among the attributes listed in TPOT's API docs —
# confirm it exists in the installed version.
tpot.op_list

# *Cifar-10

In [None]:
# Load the binary CIFAR-10 dataset from a local CSV file and display it.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
path = '//home//maraks//Desktop//datasets//binary//cifar-10-binary.csv'
data = pd.read_csv(path)
data

In [None]:
# Parse every feature column as numbers and cast the 'class' label to a
# categorical dtype. The frame is modified in place, column by column.
for column in data.columns:
    if column != 'class':
        data[column] = pd.to_numeric(data[column])
    else:
        data[column] = data[column].astype('category')

In [None]:
# Start a local Dask cluster (1 worker, 4 threads, memory_limit='6GB') and
# connect a client to it; the training cell below sets use_dask=True.
cluster = LocalCluster(n_workers=1,
                       threads_per_worker=4,
                       memory_limit='6GB')
client = Client(cluster)

# Echo the client as the cell output.
client

In [None]:
# Features are every column but the last; the last column is the target.
X,y = data.iloc[:,:-1] , data.iloc[:,-1:]

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                train_size=0.75, test_size=0.25, random_state=42)
# The target halves come back as one-column frames; flatten them to 1-D arrays.
y_train=y_train.values.ravel()
y_test=y_test.values.ravel()

tpot = TPOTClassifier(
                      generations=7,
                      population_size=15,
                      offspring_size=None,
                      mutation_rate=0.9,
                      crossover_rate=0.1,
                      scoring='neg_log_loss',
                      config_dict=None,
                      cv=5,
                      use_dask=True,
                      n_jobs=1,
                      max_eval_time_mins=2,
                      verbosity=2,
                      random_state=42
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Export under a dataset-specific name: 'tpot_3.py' is the blood-transfusion
# section's export and must not be overwritten by this run.
tpot.export('tpot_4.py')

In [None]:
# Shut down the Dask client first, then the local cluster it was attached to.
client.close()
cluster.close()

In [None]:
# Show TPOT's `evaluated_individuals_` record as a DataFrame for inspection.
pd.DataFrame(tpot.evaluated_individuals_) 

In [None]:
# NOTE(review): bare numeric literals, presumably scores noted from earlier
# runs — confirm. As written they execute as expressions, and the cell shows
# only the last one as its output.
-0.450685
-0.450777
-0.457732 
-0.5139

# *Connect-4-balanced

In [None]:
# Load the balanced binary Connect-4 dataset from a local CSV file and display it.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
path = '//home//maraks//Desktop//datasets//binary//connect-4-balanced-binary.csv'
data = pd.read_csv(path)
data

In [None]:
# Parse every column (features and label alike) as numbers, in place.
# pd.to_numeric raises by default if a value cannot be parsed.
for column in data.columns:
    data[column] = pd.to_numeric(data[column])

In [None]:
# stable
# Start a local Dask cluster (1 worker, 4 threads, memory_limit='6GB') and
# connect a client to it; the training cell below sets use_dask=True.
cluster = LocalCluster(n_workers=1,
                       threads_per_worker=4,
                       memory_limit='6GB')
client = Client(cluster)

# Echo the client as the cell output.
client

In [None]:
# stable
# Features are every column but the last; the last column is the target.
X,y = data.iloc[:,:-1] , data.iloc[:,-1:]

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                train_size=0.75, test_size=0.25, random_state=42)
# The target halves come back as one-column frames; flatten them to 1-D arrays.
y_train=y_train.values.ravel()
y_test=y_test.values.ravel()

tpot = TPOTClassifier(
                      generations=3,
                      population_size=5,
                      offspring_size=None,
                      mutation_rate=0.9,
                      crossover_rate=0.1,
                      scoring='neg_log_loss',
                      config_dict=None,
                      cv=5,
                      use_dask=True,
                      n_jobs=1,
                      max_eval_time_mins=2,
                      verbosity=2,
                      random_state=42
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Export under a dataset-specific name: 'tpot_3.py' is the blood-transfusion
# section's export and must not be overwritten by this run.
tpot.export('tpot_5.py')

# Progress-bar readings noted from runs under different parallelism settings:
# 10/20 [06:04<08:49, 52.93s/pipeline] no Dask, 1 job
# 11/20 [02:54<05:07, 34.21s/pipeline]  no Dask, 4 job ( crash 1 - Current best internal CV score: -inf)

# 10/20 [04:58<03:48, 22.84s/pipeline] Dask, 1 worker, 1 job
# 10/20 [01:47<01:19, 7.90s/pipeline] - Dask, 1 worker, 4 jobs (best result)

# CV internal score - 0.8642350333749977

In [None]:
# -0.361084 logloss  (score noted from a run)

# Shut down the Dask client first, then the local cluster it was attached to.
client.close()
cluster.close()

In [None]:
# Show TPOT's `evaluated_individuals_` record as a DataFrame for inspection.
pd.DataFrame(tpot.evaluated_individuals_)

# *Connect-4-imbalanced

In [None]:
# Load the imbalanced binary Connect-4 dataset from a local CSV file and display it.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
path = '//home//maraks//Desktop//datasets//binary//connect-4-imbalanced-binary.csv'
data = pd.read_csv(path)
data

In [None]:
# Parse every column (features and label alike) as numbers, in place.
# pd.to_numeric raises by default if a value cannot be parsed.
for column in data.columns:
    data[column] = pd.to_numeric(data[column])

In [None]:
# Start a local Dask cluster (1 worker, 2 threads, memory_limit='6GB') and
# connect a client to it; the training cell below sets use_dask=True.
cluster = LocalCluster(n_workers=1,
                       threads_per_worker=2,
                       memory_limit='6GB')
client = Client(cluster)

# Echo the client as the cell output.
client

In [None]:
# Features are every column but the last; the last column is the target.
X,y = data.iloc[:,:-1] , data.iloc[:,-1:]

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                train_size=0.75, test_size=0.25, random_state=42)
# The target halves come back as one-column frames; flatten them to 1-D arrays.
y_train=y_train.values.ravel()
y_test=y_test.values.ravel()

tpot = TPOTClassifier(
                      generations=3,
                      population_size=5,
                      offspring_size=None,
                      mutation_rate=0.9,
                      crossover_rate=0.1,
                      scoring='neg_log_loss',
                      config_dict=None,
                      cv=5,
                      use_dask=True,
                      n_jobs=1,
                      max_eval_time_mins=2,
                      verbosity=2,
                      random_state=42
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Export under a dataset-specific name: 'tpot_3.py' is the blood-transfusion
# section's export and must not be overwritten by this run.
tpot.export('tpot_6.py')

# 0.8596326115476478

In [None]:
# Scores noted from runs (presumably neg log loss — confirm):
#-0.403765
#-0.419837 
#-0.524874
# Shut down the Dask client first, then the local cluster it was attached to.
client.close()
cluster.close()

In [None]:
# Show TPOT's `evaluated_individuals_` record as a DataFrame for inspection.
pd.DataFrame(tpot.evaluated_individuals_)

# *Fashion-MNIST

In [None]:
# Load the binary Fashion-MNIST dataset from a local CSV file and display it.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
path = '//home//maraks//Desktop//datasets//binary//Fashion-MNIST-binary.csv'
data = pd.read_csv(path)
data

In [None]:
# Parse every column (features and label alike) as numbers, in place.
# pd.to_numeric raises by default if a value cannot be parsed.
for column in data.columns:
        data[column] = pd.to_numeric(data[column])

In [None]:
# Start a local Dask cluster (1 worker, 4 threads, memory_limit='6GB') and
# connect a client to it; the training cell below sets use_dask=True.
cluster = LocalCluster(n_workers=1,
                       threads_per_worker=4,
                       memory_limit='6GB')
client = Client(cluster)

# Echo the client as the cell output.
client

In [None]:
# Features are every column but the last; the last column is the target.
X,y = data.iloc[:,:-1] , data.iloc[:,-1:]

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                train_size=0.75, test_size=0.25, random_state=42)
# The target halves come back as one-column frames; flatten them to 1-D arrays.
y_train=y_train.values.ravel()
y_test=y_test.values.ravel()

tpot = TPOTClassifier(
                      generations=3,
                      population_size=5,
                      offspring_size=None,
                      mutation_rate=0.9,
                      crossover_rate=0.1,
                      scoring='neg_log_loss',
                      config_dict=None,
                      cv=5,
                      use_dask=True,
                      n_jobs=1,
                      max_eval_time_mins=2,
                      verbosity=2,
                      random_state=42
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Export under a dataset-specific name: 'tpot_3.py' is the blood-transfusion
# section's export and must not be overwritten by this run.
tpot.export('tpot_7.py')
#  5/20 [04:01<09:00, 36.01s/pipeline]

In [None]:
# -0.29863017411905346  (score noted from a run; presumably neg log loss — confirm)
# Shut down the Dask client first, then the local cluster it was attached to.
client.close()
cluster.close()

In [None]:
# Show TPOT's `evaluated_individuals_` record as a DataFrame for inspection.
pd.DataFrame(tpot.evaluated_individuals_)

# *Jungle chess

In [None]:
# Load the binary jungle-chess endgame dataset from a local CSV file and display it.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
path = '//home//maraks//Desktop//datasets//binary//jungle_chess_2pcs_raw_endgame_complete-binary.csv'
data = pd.read_csv(path)
data

In [None]:
# Parse every column (features and label alike) as numbers, in place.
# pd.to_numeric raises by default if a value cannot be parsed.
for column in data.columns:
    data[column] = pd.to_numeric(data[column])

In [None]:
# Start a local Dask cluster (1 worker, 4 threads, memory_limit='6GB') and
# connect a client to it; the training cell below sets use_dask=True.
cluster = LocalCluster(n_workers=1,
                       threads_per_worker=4,
                       memory_limit='6GB')
client = Client(cluster)

# Echo the client as the cell output.
client

In [None]:
# Features are every column but the last; the last column is the target.
X,y = data.iloc[:,:-1] , data.iloc[:,-1:]

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                train_size=0.75, test_size=0.25, random_state=42)
# The target halves come back as one-column frames; flatten them to 1-D arrays.
y_train=y_train.values.ravel()
y_test=y_test.values.ravel()

tpot = TPOTClassifier(
                      generations=3,
                      population_size=5,
                      offspring_size=None,
                      mutation_rate=0.9,
                      crossover_rate=0.1,
                      scoring='neg_log_loss',
                      config_dict=None,
                      cv=5,
                      use_dask=True,
                      n_jobs=1,
                      max_eval_time_mins=2,
                      verbosity=2,
                      random_state=42
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Export under a dataset-specific name: 'tpot_3.py' is the blood-transfusion
# section's export and must not be overwritten by this run.
tpot.export('tpot_8.py')

# 0.9058985751493795

In [None]:
#-0.31102109254766236  (score noted from a run; presumably neg log loss — confirm)
# Shut down the Dask client first, then the local cluster it was attached to.
client.close()
cluster.close()

In [None]:
# Show TPOT's `evaluated_individuals_` record as a DataFrame for inspection.
pd.DataFrame(tpot.evaluated_individuals_)

# *kc1

In [None]:
# Load the kc1 dataset from a local CSV file and display it.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
path = '//home//maraks//Desktop//datasets//binary//kc1.csv'
data = pd.read_csv(path)
data

In [None]:
# Parse every column (features and label alike) as numbers, in place.
# pd.to_numeric raises by default if a value cannot be parsed.
for column in data.columns:
    data[column] = pd.to_numeric(data[column])

In [None]:
# Start a local Dask cluster (1 worker, 4 threads, memory_limit='6GB') and
# connect a client to it; the training cell below sets use_dask=True.
cluster = LocalCluster(n_workers=1,
                       threads_per_worker=4,
                       memory_limit='6GB')
client = Client(cluster)

# Echo the client as the cell output.
client

In [None]:
# Features are every column but the last; the last column is the target.
X,y = data.iloc[:,:-1] , data.iloc[:,-1:]

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                train_size=0.75, test_size=0.25, random_state=42)
# The target halves come back as one-column frames; flatten them to 1-D arrays.
y_train=y_train.values.ravel()
y_test=y_test.values.ravel()

tpot = TPOTClassifier(
                      generations=3,
                      population_size=5,
                      offspring_size=None,
                      mutation_rate=0.9,
                      crossover_rate=0.1,
                      scoring='neg_log_loss',
                      config_dict=None,
                      cv=5,
                      use_dask=True,
                      n_jobs=1,
                      max_eval_time_mins=2,
                      verbosity=2,
                      random_state=42
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Export under a dataset-specific name: 'tpot_3.py' is the blood-transfusion
# section's export and must not be overwritten by this run.
tpot.export('tpot_9.py')

# 0.8608533322684983

In [None]:
#-0.3435855  (score noted from a run; presumably neg log loss — confirm)
# Shut down the Dask client first, then the local cluster it was attached to.
client.close()
cluster.close()

In [None]:
# Show TPOT's `evaluated_individuals_` record as a DataFrame for inspection.
pd.DataFrame(tpot.evaluated_individuals_)

# *KDDCup09_appetency

In [None]:
# Load the KDDCup09 appetency dataset from a local CSV file and display it.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
path = '//home//maraks//Desktop//datasets//binary//KDDCup09_appetency.csv'
data = pd.read_csv(path)
data

In [None]:
# Clean the frame in place, column by column. '?' is treated as the
# missing-value marker:
#   * listed categorical columns (and the 'APPETENCY' label): '?' becomes the
#     string 'NaN', then the column is replaced by LabelEncoder integer codes;
#   * every other column: '?' becomes -1, then the column is parsed as numbers.
for column in data.columns:
    if column not in (
        'Var191', 'Var192', 'Var193', 'Var194', 'Var195', 'Var196',
        'Var197', 'Var198', 'Var199', 'Var200', 'Var201', 'Var202',
        'Var203', 'Var204', 'Var205', 'Var206', 'Var207', 'Var208',
        'Var210', 'Var211', 'Var212', 'Var213', 'Var214', 'Var215',
        'Var216', 'Var217', 'Var218', 'Var219', 'Var220', 'Var221',
        'Var222', 'Var223', 'Var224', 'Var225', 'Var226', 'Var227',
        'Var228', 'Var229', 'APPETENCY',
    ):
        data[column] = data[column].replace('?',-1)
        data[column] = pd.to_numeric(data[column])
        continue
    data[column] = data[column].replace('?','NaN')
    le = preprocessing.LabelEncoder()
    data[column] = le.fit(data[column]).transform(data[column])

In [None]:
# Display the frame after the preprocessing cell has run.
data

In [None]:
# Start a local Dask cluster (1 worker, 2 threads, memory_limit='6GB') and
# connect a client to it; the training cell below sets use_dask=True.
cluster = LocalCluster(n_workers=1,
                       threads_per_worker=2,
                       memory_limit='6GB')
client = Client(cluster)

# Echo the client as the cell output.
client

In [None]:
# Features are every column but the last; the last column is the target.
X,y = data.iloc[:,:-1] , data.iloc[:,-1:]

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                train_size=0.75, test_size=0.25, random_state=42)
# The target halves come back as one-column frames; flatten them to 1-D arrays.
y_train=y_train.values.ravel()
y_test=y_test.values.ravel()

tpot = TPOTClassifier(
                      generations=3,
                      population_size=5,
                      offspring_size=None,
                      mutation_rate=0.9,
                      crossover_rate=0.1,
                      scoring='neg_log_loss',
                      config_dict=None,
                      cv=5,
                      use_dask=True,
                      n_jobs=1,
                      max_eval_time_mins=2,
                      verbosity=2,
                      random_state=42
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Export under a dataset-specific name: 'tpot_3.py' is the blood-transfusion
# section's export and must not be overwritten by this run.
tpot.export('tpot_10.py')

# Worker exceeded 95% memory budget

In [None]:
# Shut down the Dask client first, then the local cluster it was attached to.
client.close()
cluster.close()

In [None]:
# Show TPOT's `evaluated_individuals_` record as a DataFrame for inspection.
pd.DataFrame(tpot.evaluated_individuals_)

# *Vehicle

In [None]:
# Load the binary vehicle dataset from a local CSV file and display it.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
path = '//home//maraks//Desktop//datasets//binary//vehicle-binary.csv'
data = pd.read_csv(path)
data

In [None]:
# Encode the string label as 0 ('other') / 1 ('saab'), overwriting the column.
# NOTE(review): Series.map yields NaN for values missing from the dict, so
# running this cell a second time on the already-encoded column would blank
# it — re-load the CSV before re-running; confirm.
data['Class'] = data['Class'].map({'other':0,'saab':1})

In [None]:
# Parse every column (features and label alike) as numbers, in place.
# pd.to_numeric raises by default if a value cannot be parsed.
for column in data.columns:
    data[column] = pd.to_numeric(data[column])            

In [None]:
# Start a local Dask cluster (1 worker, 4 threads, memory_limit='6GB') and
# connect a client to it; the training cell below sets use_dask=True.
cluster = LocalCluster(n_workers=1,
                       threads_per_worker=4,
                       memory_limit='6GB')
client = Client(cluster)

# Echo the client as the cell output.
client

In [None]:
# Features are every column but the last; the last column is the target.
X,y = data.iloc[:,:-1] , data.iloc[:,-1:]

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                train_size=0.75, test_size=0.25, random_state=42)
# The target halves come back as one-column frames; flatten them to 1-D arrays.
y_train=y_train.values.ravel()
y_test=y_test.values.ravel()

tpot = TPOTClassifier(
                      generations=3,
                      population_size=5,
                      offspring_size=None,
                      mutation_rate=0.9,
                      crossover_rate=0.1,
                      scoring='neg_log_loss',
                      config_dict=None,
                      cv=5,
                      use_dask=True,
                      n_jobs=1,
                      max_eval_time_mins=2,
                      verbosity=2,
                      random_state=42
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Export under a dataset-specific name: 'tpot_3.py' is the blood-transfusion
# section's export and must not be overwritten by this run.
tpot.export('tpot_11.py')

# 0.7507692307692307

In [None]:
# -0.515403  (score noted from a run; presumably neg log loss — confirm)
# Shut down the Dask client first, then the local cluster it was attached to.
client.close()
cluster.close()

In [None]:
# Show TPOT's `evaluated_individuals_` record as a DataFrame for inspection.
pd.DataFrame(tpot.evaluated_individuals_)

# Plot

In [None]:
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
from smac.tae import StatusType
import autosklearn.classification


def show_results(automl):
    """Plot best-so-far scores over time and return the underlying tables.

    Two tables are built from the estimator's internals: the ensemble history
    (``automl.automl_.ensemble_performance_history``) and the successful runs
    in ``automl.automl_.runhistory_.data``. Each table is rewritten row by row
    so that every row holds the best row seen so far (ranked by optimization
    score). The tables are then outer-merged on ``Timestamp``, sorted,
    forward-filled and drawn as a line plot.

    Parameters
    ----------
    automl : object
        Estimator exposing ``automl_._metric``, ``automl_.runhistory_.data``
        and ``automl_.ensemble_performance_history`` (presumably a fitted
        auto-sklearn estimator — verify against the caller).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(individual_performance_frame, ensemble_performance_frame)``, both in
        running-best form.
    """
    def get_runhistory_models_performance(automl):
        """Return one row of validation/test/train scores per successful run."""
        # Read the metric from the estimator that was passed in, so the
        # function does not depend on a notebook-global variable.
        metric = automl.automl_._metric
        data = automl.automl_.runhistory_.data
        performance_list = []
        for run_value in data.values():
            if run_value.status != StatusType.SUCCESS:
                # Ignore crashed runs
                continue
            # Alternatively, it is possible to also obtain the start time with ``run_value.starttime``
            endtime = pd.Timestamp(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(run_value.endtime)))
            # Turn the stored costs/losses back into scores via the metric's
            # optimum and sign.
            val_score = metric._optimum - (metric._sign * run_value.cost)
            test_score = metric._optimum - (metric._sign * run_value.additional_info['test_loss'])
            train_score = metric._optimum - (metric._sign * run_value.additional_info['train_loss'])
            performance_list.append({
                'Timestamp': endtime,
                'single_best_optimization_score': val_score,
                'single_best_test_score': test_score,
                'single_best_train_score': train_score,
            })
        return pd.DataFrame(performance_list)

    # Running best for the ensemble: keep the best row seen so far and write
    # it over every row.
    # NOTE(review): the whole row is copied, so the best row's Timestamp also
    # replaces the Timestamp of later, non-improving rows — confirm intended.
    ensemble_performance_frame = pd.DataFrame(automl.automl_.ensemble_performance_history)
    best_values = pd.Series({'ensemble_optimization_score': -np.inf,
                             'ensemble_test_score': -np.inf})
    for idx in ensemble_performance_frame.index:
        if (
            ensemble_performance_frame.loc[idx, 'ensemble_optimization_score']
            > best_values['ensemble_optimization_score']
        ):
            best_values = ensemble_performance_frame.loc[idx]
        ensemble_performance_frame.loc[idx] = best_values

    # Same running-best transformation for the individual models.
    individual_performance_frame = get_runhistory_models_performance(automl)
    best_values = pd.Series({'single_best_optimization_score': -np.inf,
                             'single_best_test_score': -np.inf,
                             'single_best_train_score': -np.inf})
    for idx in individual_performance_frame.index:
        if (
            individual_performance_frame.loc[idx, 'single_best_optimization_score']
            > best_values['single_best_optimization_score']
        ):
            best_values = individual_performance_frame.loc[idx]
        individual_performance_frame.loc[idx] = best_values

    # Align both histories on a common time axis; forward-fill so each curve
    # keeps its last known value where only the other table has a row.
    # `.ffill()` replaces the deprecated `fillna(method='ffill')`.
    pd.merge(
        ensemble_performance_frame,
        individual_performance_frame,
        on="Timestamp", how='outer'
    ).sort_values('Timestamp').ffill().plot(
        x='Timestamp',
        kind='line',
        legend=True,
        title='Auto-sklearn accuracy over time',
        grid=True,
    )
    fig = plt.gcf()
    fig.set_size_inches(15,8)
    plt.show()
    return individual_performance_frame, ensemble_performance_frame

In [None]:
# feature encoding?
#titanic['Sex'] = titanic['Sex'].map({'male':0,'female':1})
#titanic['Embarked'] = titanic['Embarked'].map({'S':0,'C':1,'Q':2})

In [None]:
# NaNs
#titanic = titanic.fillna(-999)

In [None]:
# One Hot
#from sklearn.preprocessing import MultiLabelBinarizer
#mlb = MultiLabelBinarizer()
#CabinTrans = mlb.fit_transform([{str(val)} for val in titanic['Cabin'].values])

In [None]:
#pd.get_dummies(data,dummy_na=False).values.shape