# TPOT Experiments

In [27]:
import pandas as pd
from tpot import TPOTClassifier
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [21]:
# Column names of a hand-picked feature subset: one-hot encoded categorical
# indicators, numeric trip attributes, and the service-rating columns.
# NOTE(review): this list is not referenced by the later cells visible here.
features = [
    # one-hot encoded categorical indicators
    'Gender_Female',
    'Customer Type_Loyal Customer',
    'Type of Travel_Business travel',
    'Type of Travel_Personal Travel',
    'Class_Business',
    'Class_Eco',
    # numeric passenger / trip attributes
    'Age',
    'Flight Distance',
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
    # service ratings
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
]

In [29]:
from tpot import TPOTClassifier

# TPOT on Original Data (without preprocessing or feature selection)

In [52]:
from tpot.config import classifier_config_dict

In [38]:
# Load the raw airline CSVs. Rows containing any missing value are removed
# from the training frame only; the validation frame is kept exactly as read.
df_origin_train = pd.read_csv('../data/airline_train.csv')
df_origin_train = df_origin_train.dropna()

df_origin_val = pd.read_csv('../data/airline_test.csv')

In [40]:
# Display the training frame after NaN rows were dropped (pandas elides the
# middle rows and columns in its repr).
df_origin_train

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,103899,94171,Female,disloyal Customer,23,Business travel,Eco,192,2,1,...,2,3,1,4,2,3,2,3,0.0,neutral or dissatisfied
103900,103900,73097,Male,Loyal Customer,49,Business travel,Business,2347,4,4,...,5,5,5,5,5,5,4,0,0.0,satisfied
103901,103901,68825,Male,disloyal Customer,30,Business travel,Business,1995,1,1,...,4,3,2,4,5,5,4,7,14.0,neutral or dissatisfied
103902,103902,54173,Female,disloyal Customer,22,Business travel,Eco,1000,1,1,...,1,4,5,1,5,4,1,0,0.0,neutral or dissatisfied


In [44]:
# Turn the raw frames into model inputs: one-hot encode the categorical
# columns and integer-encode the `satisfaction` target.
le = LabelEncoder()
categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

# NOTE: this overwrites df_origin_train / df_origin_val with their encoded
# versions, so re-running this cell requires re-running the CSV load first
# (the categorical columns no longer exist after the first run).
df_origin_train_d = pd.get_dummies(df_origin_train[categorical_cols])
df_origin_train = pd.concat([df_origin_train, df_origin_train_d], axis=1).drop(columns=categorical_cols)
df_origin_val_d = pd.get_dummies(df_origin_val[categorical_cols])
df_origin_val = pd.concat([df_origin_val, df_origin_val_d], axis=1).drop(columns=categorical_cols)

X_origin_train = df_origin_train.drop(columns=['satisfaction'])
# Fit the label mapping on the training target only.
y_origin_train = le.fit_transform(df_origin_train['satisfaction'])

# get_dummies ran independently on each frame, so force the validation
# features onto the training columns (same set, same order). A dummy column
# missing from validation is filled with 0; a validation-only column is
# dropped because the model never saw it.
X_origin_val = (
    df_origin_val
    .drop(columns=['satisfaction'])
    .reindex(columns=X_origin_train.columns, fill_value=0)
)
# Reuse the mapping learned on train instead of re-fitting on validation, so
# both targets share one label -> integer encoding. An unseen validation
# label now raises ValueError rather than being silently re-encoded.
y_origin_val = le.transform(df_origin_val['satisfaction'])

X_origin_train.shape, X_origin_val.shape, y_origin_train.shape, y_origin_val.shape

((103594, 38), (25976, 38), (103594,), (25976,))

In [53]:
# Register TPOT's FeatureSetSelector in the (imported, module-level)
# classifier search space. This mutates `classifier_config_dict` in place.
#   subset_list - CSV describing the candidate feature subsets.
#                 NOTE(review): this URL points at TPOT's own test fixture;
#                 confirm its feature names match this dataset.
#   sel_subset  - candidate subset indices the search may choose from;
#                 presumably each value selects a single subset (to allow
#                 pairs of subsets, supply index tuples instead) - TODO confirm.
classifier_config_dict['tpot.builtins.FeatureSetSelector'] = dict(
    subset_list=['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'],
    sel_subset=[0, 1],
)

In [55]:
# Short TPOT search over "Selector -> Classifier" pipelines using the
# customised config dict. Only 2 generations with a population of 10 are run
# here (TPOT's defaults are 100 generations with a population size of 100).
# An alternative template to try: 'FeatureSetSelector-Transformer-Classifier'.
tpot2 = TPOTClassifier(
    generations=2,
    population_size=10,
    verbosity=2,
    template='Selector-Classifier',
    config_dict=classifier_config_dict,
    random_state=42,  # fixed seed so the stochastic search is repeatable on re-run
)
tpot2.fit(X_origin_train, y_origin_train)

Optimization Progress:   0%|          | 0/30 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9619669325623924

Generation 2 - Current best internal CV score: 0.9619669325623924

Best pipeline: ExtraTreesClassifier(VarianceThreshold(input_matrix, threshold=0.0001), bootstrap=False, criterion=entropy, max_features=0.4, min_samples_leaf=10, min_samples_split=18, n_estimators=100)


In [56]:
# Show the best pipeline found by the search. The fitted estimator is bound
# to `tpot2`; the bare name `tpot` is never defined by this notebook's imports.
tpot2.fitted_pipeline_

In [59]:
# Score the fitted TPOT pipeline on the held-out validation data and report
# the result as a percentage.
val_score_pct = tpot2.score(X_origin_val, y_origin_val) * 100
print("Accuracy on validation set is {}%".format(val_score_pct))

Imputing missing values in feature set
Accuracy on validation set is 96.21958731136434%


In [58]:
# Write the best pipeline found by `tpot2` out as a Python script in the
# current working directory.
tpot2.export('tpot_pipeline_origin_dataset.py')