In [1]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [4]:
titanic = pd.read_csv('dataset/train.csv')
titanic.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
titanic.groupby('Sex').Survived.value_counts()

Sex     Survived
female  1           233
        0            81
male    0           468
        1           109
Name: Survived, dtype: int64

In [7]:
titanic.groupby(['Pclass','Sex']).Survived.value_counts()

Pclass  Sex     Survived
1       female  1            91
                0             3
        male    0            77
                1            45
2       female  1            70
                0             6
        male    0            91
                1            17
3       female  0            72
                1            72
        male    0           300
                1            47
Name: Survived, dtype: int64

In [8]:
id = pd.crosstab([titanic.Pclass, titanic.Sex], titanic.Survived.astype(float))
id.div(id.sum(1),astype(float), 0)

NameError: name 'astype' is not defined

In [9]:
titanic.rename(columns={'Survived': 'class'}, inplace=True)

In [10]:
titanic.dtypes

PassengerId      int64
class            int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [11]:
for cat in ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']:
    print("Number of levels in category '{0}': \b {1:2.2f} ".format(cat, titanic[cat].unique().size))

Number of levels in category 'Name':  891.00 
Number of levels in category 'Sex':  2.00 
Number of levels in category 'Ticket':  681.00 
Number of levels in category 'Cabin':  148.00 
Number of levels in category 'Embarked':  4.00 


In [12]:
for cat in ['Sex', 'Embarked']:
    print("Levels for catgeory '{0}': {1}".format(cat, titanic[cat].unique()))

Levels for catgeory 'Sex': ['male' 'female']
Levels for catgeory 'Embarked': ['S' 'C' 'Q' nan]


In [13]:
titanic['Sex'] = titanic['Sex'].map({'male':0,'female':1})
titanic['Embarked'] = titanic['Embarked'].map({'S':0,'C':1,'Q':2})

In [14]:
titanic = titanic.fillna(-999)
pd.isnull(titanic).any()

PassengerId    False
class          False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked       False
dtype: bool

In [15]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
CabinTrans = mlb.fit_transform([{str(val)} for val in titanic['Cabin'].values])

In [16]:
CabinTrans

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ..., 
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [17]:
titanic_new = titanic.drop(['Name','Ticket','Cabin','class'], axis=1)

In [18]:
assert (len(titanic['Cabin'].unique()) == len(mlb.classes_)), "Not Equal" #check correct encoding done

In [19]:
titanic_new = np.hstack((titanic_new.values,CabinTrans))

In [20]:
np.isnan(titanic_new).any()

False

In [21]:
titanic_new[0].size

156

In [22]:
titanic_class = titanic['class'].values

In [23]:
training_indices, validation_indices = training_indices, testing_indices = train_test_split(titanic.index, stratify = titanic_class, train_size=0.75, test_size=0.25)

In [24]:
training_indices.size, validation_indices.size

(668, 223)

In [48]:
tpot = TPOTClassifier(verbosity=2, max_time_mins=10, max_eval_time_mins=0.04, population_size=40)
tpot.fit(titanic_new[training_indices], titanic_class[training_indices])

Optimization Progress: 87pipeline [00:32,  3.10pipeline/s]                  

Generation 1 - Current best internal CV score: 0.8039234071912784


Optimization Progress: 128pipeline [00:51,  1.53pipeline/s]                   

Generation 2 - Current best internal CV score: 0.8039234071912784


Optimization Progress: 171pipeline [01:13,  1.57pipeline/s]                   

Generation 3 - Current best internal CV score: 0.8039234071912784


Optimization Progress: 214pipeline [01:37,  2.74pipeline/s]                   

Generation 4 - Current best internal CV score: 0.8039234071912784


Optimization Progress: 257pipeline [01:58,  2.02pipeline/s]                   

Generation 5 - Current best internal CV score: 0.8039234071912784


Optimization Progress: 298pipeline [02:18,  2.17pipeline/s]                   

Generation 6 - Current best internal CV score: 0.8039234071912784


Optimization Progress: 338pipeline [02:45,  1.38pipeline/s]                   

Generation 7 - Current best internal CV score: 0.8069421480733343


Optimization Progress: 380pipeline [03:12,  1.50pipeline/s]                   

Generation 8 - Current best internal CV score: 0.8069421480733343


Optimization Progress: 422pipeline [03:42,  1.56pipeline/s]                   

Generation 9 - Current best internal CV score: 0.8069421480733343


Optimization Progress: 463pipeline [04:14,  1.47pipeline/s]                   

Generation 10 - Current best internal CV score: 0.8084681853888451


Optimization Progress: 506pipeline [04:48,  1.36pipeline/s]                   

Generation 11 - Current best internal CV score: 0.8084681853888451


Optimization Progress: 548pipeline [05:24,  1.09pipeline/s]                   

Generation 12 - Current best internal CV score: 0.8084681853888451


Optimization Progress: 591pipeline [06:03,  1.42pipeline/s]                   

Generation 13 - Current best internal CV score: 0.809927388953312


Optimization Progress: 632pipeline [06:38,  1.78pipeline/s]                   

Generation 14 - Current best internal CV score: 0.8114422041837596


Optimization Progress: 675pipeline [07:20,  1.23pipeline/s]                   

Generation 15 - Current best internal CV score: 0.8114422041837596


Optimization Progress: 716pipeline [07:56,  1.19pipeline/s]                   

Generation 16 - Current best internal CV score: 0.8143384996487903


Optimization Progress: 758pipeline [08:34,  1.00s/pipeline]                   

Generation 17 - Current best internal CV score: 0.8293085948702602


Optimization Progress: 801pipeline [09:14,  1.27s/pipeline]                   

Generation 18 - Current best internal CV score: 0.8293085948702602


Optimization Progress: 845pipeline [09:52,  1.25pipeline/s]

Generation 19 - Current best internal CV score: 0.8293085948702602


                                                           


10.004594866666668 minutes have elapsed. TPOT will close down.
TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: GradientBoostingClassifier(RandomForestClassifier(SelectFwe(input_matrix, alpha=0.018), bootstrap=True, criterion=gini, max_features=0.6, min_samples_leaf=1, min_samples_split=15, n_estimators=100), learning_rate=0.01, max_depth=8, max_features=0.5, min_samples_leaf=3, min_samples_split=3, n_estimators=100, subsample=0.6)


TPOTClassifier(config_dict={'sklearn.naive_bayes.GaussianNB': {}, 'sklearn.naive_bayes.BernoulliNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.naive_bayes.MultinomialNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.tree.DecisionT....45,
        0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,  0.8 ,  0.85,  0.9 ,
        0.95,  1.  ])}}}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=1000000, max_eval_time_mins=0.04,
        max_time_mins=10, memory=None, mutation_rate=0.9, n_jobs=1,
        offspring_size=40, periodic_checkpoint_folder=None,
        population_size=40, random_state=None, scoring=None, subsample=1.0,
        verbosity=2, warm_start=False)

In [26]:
tpot.score(titanic_new[validation_indices], titanic.loc[validation_indices, 'class'].values)

0.82511210762331844

In [27]:
tpot.export('tpot_titanic_pipeline.py')

True

In [None]:
# %load tpot_titanic_pipeline.py
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8083566295506595
exported_pipeline = RandomForestClassifier(bootstrap=False, criterion="entropy", max_features=0.45, min_samples_leaf=12, min_samples_split=5, n_estimators=100)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)


In [33]:
titanic_sub = pd.read_csv('dataset/test.csv')
titanic_sub.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [35]:
for var in ['Cabin']: #,'Name','Ticket']:
    new = list(set(titanic_sub[var]) - set(titanic[var]))
    titanic_sub.loc[titanic_sub[var].isin(new), var] = -999

In [36]:
titanic_sub['Sex'] = titanic_sub['Sex'].map({'male':0,'female':1})
titanic_sub['Embarked'] = titanic_sub['Embarked'].map({'S':0,'C':1,'Q':2})

In [37]:
titanic_sub = titanic_sub.fillna(-999)
pd.isnull(titanic_sub).any()

PassengerId    False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked       False
dtype: bool

In [38]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
SubCabinTrans = mlb.fit([{str(val)} for val in titanic['Cabin'].values]).transform([{str(val)} for val in titanic_sub['Cabin'].values])
titanic_sub = titanic_sub.drop(['Name','Ticket','Cabin'], axis=1)

In [39]:
# Form the new submission data set
titanic_sub_new = np.hstack((titanic_sub.values,SubCabinTrans))

In [40]:
np.any(np.isnan(titanic_sub_new))

False

In [41]:
assert (titanic_new.shape[1] == titanic_sub_new.shape[1]), "Not Equal"

In [42]:
submission = tpot.predict(titanic_sub_new)

In [44]:
# Create the submission file
final = pd.DataFrame({'PassengerId': titanic_sub['PassengerId'], 'Survived': submission})
final.to_csv('dataset/submission.csv', index = False)

In [45]:
final.shape

(418, 2)

In [46]:
final.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [47]:
titanic_new.head(5)

AttributeError: 'numpy.ndarray' object has no attribute 'head'