In [1]:
import pandas as pd 

# Load the abalone data into a pandas DataFrame.
# The file is read with header=None, so the column names are assigned
# explicitly right after loading.
column_names = [
    'sex',
    'length',
    'diameter',
    'height',
    'whole_weight',
    'shucked_weight',
    'viscera_weight',
    "shell_weight",
    "rings",
]

dataframe = pd.read_csv("abalone.data", header=None)
dataframe.columns = column_names
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             4177 non-null   object 
 1   length          4177 non-null   float64
 2   diameter        4177 non-null   float64
 3   height          4177 non-null   float64
 4   whole_weight    4177 non-null   float64
 5   shucked_weight  4177 non-null   float64
 6   viscera_weight  4177 non-null   float64
 7   shell_weight    4177 non-null   float64
 8   rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [2]:
# Label -> integer code used for the 'sex' column. Built once instead of on
# every call.
_SEX_CODES = {'M': 0, 'F': 1, 'I': 2}


def replace_sex(sex):
    """Encode a sex label as an integer code.

    Parameters
    ----------
    sex : str or int
        One of the labels 'M', 'F' or 'I', or a value that is already one of
        the integer codes 0, 1 or 2.

    Returns
    -------
    int
        0 for 'M', 1 for 'F', 2 for 'I'. A value that is already a valid code
        is returned unchanged (as a plain int), so applying the encoding a
        second time to an already-encoded column is a no-op instead of an
        error.

    Raises
    ------
    KeyError
        If ``sex`` is neither a known label nor a known code.
    """
    if sex in _SEX_CODES:
        return _SEX_CODES[sex]
    # Already encoded (e.g. the notebook cell was re-run): keep the value.
    if sex in _SEX_CODES.values():
        return int(sex)
    raise KeyError(sex)

# Replace the 'sex' column by the value replace_sex returns for each entry.
sex_codes = dataframe["sex"].map(replace_sex)
dataframe["sex"] = sex_codes

In [3]:
# Re-inspect the frame now that the 'sex' column has been re-assigned.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             4177 non-null   int64  
 1   length          4177 non-null   float64
 2   diameter        4177 non-null   float64
 3   height          4177 non-null   float64
 4   whole_weight    4177 non-null   float64
 5   shucked_weight  4177 non-null   float64
 6   viscera_weight  4177 non-null   float64
 7   shell_weight    4177 non-null   float64
 8   rings           4177 non-null   int64  
dtypes: float64(7), int64(2)
memory usage: 293.8 KB


In [3]:
# Split data into training (80%) and test (20%) datasets
from sklearn.model_selection import train_test_split

# Fixed seed so that the split, and every score computed from it, is the same
# after a kernel restart.
SPLIT_SEED = 42

# The target is the 'sex' column; every other column is used as a feature.
X = dataframe.drop(columns=['sex'])
y = dataframe['sex']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

In [5]:
# Launch TPOT training
from tpot import TPOTClassifier

# max_time_mins=30 caps the search time; verbosity=2 makes TPOT report the
# best internal CV score per generation.
tpot_options = dict(max_time_mins=30, verbosity=2)
clf_tpot = TPOTClassifier(**tpot_options)
clf_tpot.fit(X_train, y_train)




Generation 1 - Current best internal CV score: 0.5576175004251587

Generation 2 - Current best internal CV score: 0.5653911907127449

Generation 3 - Current best internal CV score: 0.5653911907127449

Generation 4 - Current best internal CV score: 0.5674825237417542

30.05 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: RandomForestClassifier(PCA(PolynomialFeatures(input_matrix, degree=2, include_bias=False, interaction_only=False), iterated_power=10, svd_solver=randomized), bootstrap=False, criterion=entropy, max_features=0.5, min_samples_leaf=17, min_samples_split=15, n_estimators=100)


TPOTClassifier(max_time_mins=30, verbosity=2)

In [6]:
# Launch auto-sklearn training
import autosklearn.classification

# time_left_for_this_task=1800 is the overall budget
# (presumably seconds, i.e. 30 minutes, matching the TPOT run; confirm).
automl_class = autosklearn.classification.AutoSklearnClassifier
clf_autosklearn = automl_class(time_left_for_this_task=1800)
clf_autosklearn.fit(X_train, y_train)

AutoSklearnClassifier(per_run_time_limit=180, time_left_for_this_task=1800)

In [7]:
# Launch Oboe
from oboe import AutoLearner, error
import numpy as np

# Oboe is fed plain NumPy arrays rather than the pandas objects.
x_train, yy_train = np.array(X_train), np.array(y_train)

problem_type = 'classification'
method = 'Oboe'  # 'Oboe' or 'TensorOboe'

# NOTE(review): runtime_limit=30 here, while TPOT and auto-sklearn were given
# 30-minute budgets; confirm the unit Oboe expects.
clf_oboe = AutoLearner(
    p_type=problem_type,
    runtime_limit=30,
    method=method,
    verbose=False,
)
clf_oboe.fit(x_train, yy_train)

{'ranks': [8, 9, 9, 9],
 'runtime_limits': [1, 2, 4, 8],
 'validation_loss': [0.5,
  0.3286941699167915,
  0.33691157736931254,
  0.33918316964904455,
  0.33918316964904455],
 'filled_new_row': [array([[ 0.39372691,  0.39501993,  0.50742336,  0.6134224 ,  0.57937344,
           0.3879399 ,  0.37254254,  0.46335424,  0.55098358,  0.52576061,
          -0.05971301, -0.04785633, -0.02686016,  0.00115114,  0.06714865,
           0.15868824,  0.26295012,  0.37418516,  0.36291493,  0.45443982,
          -0.02866512, -0.05850159, -0.05971301, -0.05971301, -0.07198528,
          -0.06827558,  0.34013066, -0.07405274, -0.05971642, -0.05577378,
          -0.03014716, -0.02120869,  0.03331975,  0.03849945,  0.10155652,
           0.10945455,  0.179687  ,  0.19285457,  0.27389476,  0.28385539,
           0.29335054,  0.29797858,  0.37462968,  0.37439359,  0.10108239,
           0.11416551, -0.04654132, -0.04275572,  0.34346831, -0.0708266 ,
          -0.07198528, -0.06827558, -0.07198528, -0.06827

In [8]:
from sklearn.metrics import accuracy_score

# Compute each model's score on the test dataset.
tpot_score = clf_tpot.score(X_test, y_test)
print("tpot score:\t\t ", tpot_score)
autosklearn_score = clf_autosklearn.score(X_test, y_test)
print("autosklern score:\t\t ", autosklearn_score)

# Oboe gets NumPy inputs; its predictions are reshaped to a flat vector with
# one entry per test label before being scored.
x_test = np.array(X_test)
yy_test = np.array(y_test)
y_predicted = clf_oboe.predict(x_test)
y_predicted = y_predicted.reshape((yy_test.size,))
oboe_score = accuracy_score(y_test, y_predicted)
print("oboe score:\t\t ", oboe_score)

tpot score:		  0.5729665071770335
autosklern score:		  0.5729665071770335
oboe score:		  0.5633971291866029


In [12]:
# Save models
import pickle

# Context managers make sure each file is flushed and closed as soon as the
# dump finishes, instead of leaving the handle open until it happens to be
# garbage-collected.
with open('autosklearn_abalone.sav', 'wb') as model_file:
    pickle.dump(clf_autosklearn, model_file)
with open('oboe_abalone.sav', 'wb') as model_file:
    pickle.dump(clf_oboe, model_file)

# The TPOT model is not pickled (the attempt is kept commented out below);
# its export() method is called with the target path instead.
# NOTE(review): export() presumably writes Python source for the best
# pipeline, so the '.sav' extension may be misleading; confirm.
# pickle.dump(clf_tpot, open('tpot_abalone.sav', 'wb'))
clf_tpot.export('tpot_abalone.sav')

In [33]:
# Show what get_models() reports for the fitted Oboe learner.
clf_oboe.get_models() 

{'ensemble method': 'select at most 5 pipelines with smallest cv error',
 'base learners': {'ExtraTrees': [{'min_samples_split': 4,
    'criterion': 'entropy'},
   {'min_samples_split': 4, 'criterion': 'gini'},
   {'min_samples_split': 4, 'criterion': 'gini'},
   {'min_samples_split': 4, 'criterion': 'gini'},
   {'min_samples_split': 4, 'criterion': 'gini'}]}}

In [46]:
# Soft voting ensemble learning (just with tpot and autosklearn)
# For every test sample, keep each model's most probable class and the
# probability assigned to it.


def _top_class_and_prob(probabilities):
    """Return two lists: per row, the argmax index and the value at that index."""
    top_classes = [int(np.argmax(row)) for row in probabilities]
    top_probs = [float(row[cls]) for row, cls in zip(probabilities, top_classes)]
    return top_classes, top_probs


# Get probabilities for ensemble learning
tpot_sklearn_prob = clf_tpot.predict_proba(X_test)
auto_sklearn_prob = clf_autosklearn.predict_proba(X_test)

# Construct dataframe for soft vote
AutoSklearn_Class, AutoSklearn_Prob = _top_class_and_prob(auto_sklearn_prob)
TPOT_Class, TPOT_Prob = _top_class_and_prob(tpot_sklearn_prob)

prob = pd.DataFrame(
    {
        "AutoSklearn_Class": AutoSklearn_Class,
        "AutoSklearn_Prob": AutoSklearn_Prob,
        "TPOT_Class": TPOT_Class,
        "TPOT_Prob": TPOT_Prob,
    }
)

In [87]:
# Display each model's top class and its probability for the test samples.
prob

Unnamed: 0,AutoSklearn_Class,AutoSklearn_Prob,TPOT_Class,TPOT_Prob
0,1,0.352673,1,0.562626
1,2,0.368335,2,0.780194
2,1,0.347295,1,0.524657
3,0,0.337473,0,0.360395
4,0,0.346605,0,0.472771
...,...,...,...,...
831,2,0.375148,2,0.786533
832,1,0.351968,1,0.568500
833,1,0.348782,1,0.535069
834,2,0.374740,2,0.882562


In [91]:
# Add column 'ensemble': "T" when TPOT's top probability is at least as high
# as auto-sklearn's, "A" otherwise.
tpot_at_least_as_sure = prob['AutoSklearn_Prob'] <= prob['TPOT_Prob']
prob['ensemble'] = tpot_at_least_as_sure.map({True: "T", False: "A"})

In [93]:
# Which model wins the confidence comparison across the test rows?
prob["ensemble"].unique()
# NOTE: in the recorded run the only value is 'T', i.e. TPOT's top probability
# was never lower than auto-sklearn's.

array(['T'], dtype=object)

In [95]:
# Add prediction column: the class predicted by the more confident classifier
# (TPOT's class where 'ensemble' is 'T', auto-sklearn's class otherwise).
tpot_selected = prob['ensemble'] == 'T'
prob['pred'] = prob['TPOT_Class'].where(tpot_selected, prob['AutoSklearn_Class'])

In [97]:
# Accuracy of the confidence-based ensemble prediction on the test set.
accuracy_score(y_test, prob['pred'])
# For comparison, the TPOT score printed earlier: 0.5729665071770335

0.5729665071770335