# 13. Machine learning techniques

In [1]:
import pandas
import matplotlib.pyplot as plt

In [2]:
import random as rd
rd.seed(0)

## 13.1 Loading and exploring the dataset

First, we use pandas to load the dataset from a csv file.

In [3]:
raw_data = pandas.read_csv('./titanic.csv')
raw_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Next, we can explore the dataset.

In [4]:
# Examining the length of the dataset
print("The dataset has", len(raw_data), "rows")

The dataset has 891 rows


In [5]:
# Examining the columns in the dataset
print("Columns (features of the dataset)")
raw_data.columns

Columns (features of the dataset)


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
# Examining the labels
print("Labels")
raw_data["Survived"]

Labels


0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [7]:
# Examining how many passengers survived
print(sum(raw_data['Survived']),'passengers survived out of',len(raw_data))

342 passengers survived out of 891


In [8]:
# One can look at several columns together
raw_data[["Name", "Age"]]

Unnamed: 0,Name,Age
0,"Braund, Mr. Owen Harris",22.0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
2,"Heikkinen, Miss. Laina",26.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
4,"Allen, Mr. William Henry",35.0
...,...,...
886,"Montvila, Rev. Juozas",27.0
887,"Graham, Miss. Margaret Edith",19.0
888,"Johnston, Miss. Catherine Helen ""Carrie""",
889,"Behr, Mr. Karl Howell",26.0


## 13.2. Cleaning up the data

Now, let's look at how many columns have missing data

In [9]:
raw_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

The Cabin column is missing too many values to be useful. Let's drop it altogether.

In [10]:
raw_data['Cabin']

0       NaN
1       C85
2       NaN
3      C123
4       NaN
       ... 
886     NaN
887     B42
888     NaN
889    C148
890     NaN
Name: Cabin, Length: 891, dtype: object

In [11]:
print("The Cabin column is missing", sum(raw_data['Cabin'].isna()), "values out of",len(raw_data['Cabin']))

The Cabin column is missing 687 values out of 891


In [12]:
clean_data = raw_data.drop('Cabin', axis=1)

In [13]:
clean_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


Other columns such as Age or Embarked are missing some values, but they can still be useful.

For the age column, let's fill in the missing values with the median of all ages.

For the Embarked column, let's make a new category called 'U', for Unknown port of embarkment.

In [14]:
clean_data['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [15]:
median_age = raw_data["Age"].median()
median_age

28.0

In [16]:
clean_data["Age"] = clean_data["Age"].fillna(median_age)

In [17]:
clean_data["Embarked"] = clean_data["Embarked"].fillna('U')

In [18]:
clean_data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [19]:
clean_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


### 12.2.3 Saving our data for the future

In [20]:
clean_data.to_csv('./clean_titanic_data.csv', index=None)

## 12.3 Manipulating the features

- One-hot encoding
- Binning
- Feature selection

### 13.3.1 One-hot encoding

In [21]:
preprocessed_data = pandas.read_csv('clean_titanic_data.csv')
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


One-hot encoding the gender feature

In [22]:
gender_columns = pandas.get_dummies(preprocessed_data['Sex'], prefix='Sex')
print(gender_columns)
embarked_columns = pandas.get_dummies(preprocessed_data["Embarked"], prefix="Embarked")
print(embarked_columns)

     Sex_female  Sex_male
0             0         1
1             1         0
2             1         0
3             1         0
4             0         1
..          ...       ...
886           0         1
887           1         0
888           1         0
889           0         1
890           0         1

[891 rows x 2 columns]
     Pclass_C  Pclass_Q  Pclass_S  Pclass_U
0           0         0         1         0
1           1         0         0         0
2           0         0         1         0
3           0         0         1         0
4           0         0         1         0
..        ...       ...       ...       ...
886         0         0         1         0
887         0         0         1         0
888         0         0         1         0
889         1         0         0         0
890         0         1         0         0

[891 rows x 4 columns]


In [23]:
preprocessed_data = pandas.concat([preprocessed_data, gender_columns], axis=1)
preprocessed_data = pandas.concat([preprocessed_data, embarked_columns], axis=1)

In [24]:
preprocessed_data = preprocessed_data.drop(['Sex', 'Embarked'], axis=1)

In [25]:
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Pclass_C,Pclass_Q,Pclass_S,Pclass_U
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,0,1,0,0,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,0,1,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,1,0,0,0,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,1,0,0,0,1,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,0,1,0,0,1,0


### A rule of thumb for when to one-hot encode or not

In [26]:
class_survived = preprocessed_data[['Pclass', 'Survived']]

first_class = class_survived[class_survived['Pclass'] == 1]
second_class = class_survived[class_survived['Pclass'] == 2]
third_class = class_survived[class_survived['Pclass'] == 3]

print("In first class", sum(first_class['Survived'])/len(first_class)*100, "% of passengers survived")
print("In second class", sum(second_class['Survived'])/len(first_class)*100, "% of passengers survived")
print("In third class", sum(third_class['Survived'])/len(first_class)*100, "% of passengers survived")

In first class 62.96296296296296 % of passengers survived
In second class 40.27777777777778 % of passengers survived
In third class 55.092592592592595 % of passengers survived


In [27]:
categorized_pclass_columns = pandas.get_dummies(preprocessed_data['Pclass'], prefix='Pclass')
preprocessed_data = pandas.concat([preprocessed_data, categorized_pclass_columns], axis=1)
preprocessed_data = preprocessed_data.drop(['Pclass'], axis=1)

In [28]:
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Pclass_C,Pclass_Q,Pclass_S,Pclass_U,Pclass_1,Pclass_2,Pclass_3
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,0,1,0,0,1,0,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,0,1,0,0,0,1,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,1,0,0,0,1,0,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,1,0,0,0,1,0,1,0,0
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,0,1,0,0,1,0,0,0,1


### 13.3.3 Binning

In [29]:
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80]
categorized_age = pandas.cut(preprocessed_data['Age'], bins)
preprocessed_data['Categorized_age'] = categorized_age
preprocessed_data = preprocessed_data.drop(["Age"], axis=1)

In [30]:
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Name,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Pclass_C,Pclass_Q,Pclass_S,Pclass_U,Pclass_1,Pclass_2,Pclass_3,Categorized_age
0,1,0,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,0,1,0,0,1,0,0,0,1,"(20, 30]"
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.2833,1,0,1,0,0,0,1,0,0,"(30, 40]"
2,3,1,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,7.925,1,0,0,0,1,0,0,0,1,"(20, 30]"
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,53.1,1,0,0,0,1,0,1,0,0,"(30, 40]"
4,5,0,"Allen, Mr. William Henry",0,0,373450,8.05,0,1,0,0,1,0,0,0,1,"(30, 40]"


In [31]:
cagegorized_age_columns = pandas.get_dummies(preprocessed_data['Categorized_age'], prefix='Categorized_age')
preprocessed_data = pandas.concat([preprocessed_data, cagegorized_age_columns], axis=1)
preprocessed_data = preprocessed_data.drop(['Categorized_age'], axis=1)

In [32]:
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Name,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Pclass_C,...,Pclass_2,Pclass_3,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,1,0,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,0,1,0,...,0,1,0,0,1,0,0,0,0,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.2833,1,0,1,...,0,0,0,0,0,1,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,7.925,1,0,0,...,0,1,0,0,1,0,0,0,0,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,53.1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,0,"Allen, Mr. William Henry",0,0,373450,8.05,0,1,0,...,0,1,0,0,0,1,0,0,0,0


### 13.3.3 Feature selection

In [33]:
preprocessed_data = preprocessed_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1)

In [34]:
preprocessed_data.head()

Unnamed: 0,Survived,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_C,Pclass_Q,Pclass_S,Pclass_U,...,Pclass_2,Pclass_3,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,0,1,0,7.25,0,1,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
1,1,1,0,71.2833,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,0,0,7.925,1,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
3,1,1,0,53.1,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,8.05,0,1,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0


### 12.3.5 Saving for future use

In [35]:
preprocessed_data.to_csv('preprocessed_titanic_data.csv', index=None)

# 13.4 Training models

In [36]:
data = pandas.read_csv('./preprocessed_titanic_data.csv')
data.head()

Unnamed: 0,Survived,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_C,Pclass_Q,Pclass_S,Pclass_U,...,Pclass_2,Pclass_3,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,0,1,0,7.25,0,1,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
1,1,1,0,71.2833,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,0,0,7.925,1,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
3,1,1,0,53.1,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,8.05,0,1,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0


### 13.4.1 Features-labels split and train-validation split

In [37]:
features = data.drop(["Survived"], axis=1)
labels = data["Survived"]

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
# remark: we fix random_state the end, to make sure we always get the same split
features_train, features_validation_test, labels_train, labels_validation_test = train_test_split(
    features, labels, test_size=0.4, random_state=100)

In [40]:
features_validation, features_test, labels_validation, labels_test = train_test_split(
    features_validation_test, labels_validation_test, test_size=0.5, random_state=100)

In [41]:
print(len(features_train))
print(len(features_validation))
print(len(features_test))
print(len(labels_train))
print(len(labels_validation))
print(len(labels_test))

534
178
179
534
178
179


### 13.4.2 Training different models on our dataset

We'll train four models:
- Logistic regression (perceptron)
- Decision tree
- Naive Bayes
- Support vector machine (SVM)

In [42]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(features_train, labels_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [43]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
dt_model.fit(features_train, labels_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [44]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(features_train, labels_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [45]:
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(features_train, labels_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [46]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(features_train, labels_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [47]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier()
gb_model.fit(features_train, labels_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [48]:
from sklearn.ensemble import AdaBoostClassifier

ab_model = AdaBoostClassifier()
ab_model.fit(features_train, labels_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

### 13.4.3 Evaluating the models

#### Accuracy

In [49]:
print("Scores of the models")
print("Logistic regression:", lr_model.score(features_validation, labels_validation))
print("Decision tree:", dt_model.score(features_validation, labels_validation))
print("Naive Bayes:", nb_model.score(features_validation, labels_validation))
print("SVM:", svm_model.score(features_validation, labels_validation))
print("Random forest:", rf_model.score(features_validation, labels_validation))
print("Gradient boosting:", gb_model.score(features_validation, labels_validation))
print("AdaBoost:", ab_model.score(features_validation, labels_validation))

Scores of the models
Logistic regression: 0.7696629213483146
Decision tree: 0.7865168539325843
Naive Bayes: 0.7471910112359551
SVM: 0.6797752808988764
Random forest: 0.7808988764044944
Gradient boosting: 0.8089887640449438
AdaBoost: 0.7640449438202247


#### F1-score

In [50]:
from sklearn.metrics import f1_score

print("F1-scores of the models:")

lr_predicted_labels = lr_model.predict(features_validation)
print("Logistic regression:", f1_score(labels_validation, lr_predicted_labels))

dt_predicted_labels = dt_model.predict(features_validation)
print("Decision Tree:", f1_score(labels_validation, dt_predicted_labels))

nb_predicted_labels = nb_model.predict(features_validation)
print("Naive Bayes:", f1_score(labels_validation, nb_predicted_labels))

svm_predicted_labels = svm_model.predict(features_validation)
print("Support Vector Machine:", f1_score(labels_validation, svm_predicted_labels))

rf_predicted_labels = rf_model.predict(features_validation)
print("Random Forest:", f1_score(labels_validation, rf_predicted_labels))

gb_predicted_labels = gb_model.predict(features_validation)
print("Gradient boosting:", f1_score(labels_validation, gb_predicted_labels))

ab_predicted_labels = ab_model.predict(features_validation)
print("AdaBoost:", f1_score(labels_validation, ab_predicted_labels))

F1-scores of the models:
Logistic regression: 0.6870229007633588
Decision Tree: 0.7205882352941176
Naive Bayes: 0.6808510638297872
Support Vector Machine: 0.39999999999999997
Random Forest: 0.7153284671532848
Gradient boosting: 0.7384615384615385
AdaBoost: 0.6865671641791045


### 13.4.4 Testing the model

Finding the accuracy and the F1-score of the model in the testing set.

In [51]:
gb_model.score(features_test, labels_test)

0.8324022346368715

In [52]:
gb_predicted_test_labels = gb_model.predict(features_test)
f1_score(labels_test, gb_predicted_test_labels)

0.8026315789473685

# 13.5 Grid search

In [53]:
from sklearn.model_selection import GridSearchCV

In [54]:
# Grid search with an rbf kernel

print("SVM grid search with a radial basis function kernel")

# rbf, C=1, gamma=0.1
svm_1_01 = SVC(kernel='rbf', C=1, gamma=0.1)
svm_1_01.fit(features_train, labels_train)
print("C=1, gamma=0.1", svm_1_01.score(features_validation, labels_validation))

# rbf, C=1, gamma=1
svm_1_1 = SVC(kernel='rbf', C=1, gamma=1)
svm_1_1.fit(features_train, labels_train)
print("C=1, gamma=1", svm_1_1.score(features_validation, labels_validation))

# rbf, C=1, gamma=10
svm_1_10 = SVC(kernel='rbf', C=1, gamma=10)
svm_1_10.fit(features_train, labels_train)
print("C=1, gamma=10", svm_1_10.score(features_validation, labels_validation))

# rbf, C=10, gamma=0.1
svm_10_01 = SVC(kernel='rbf', C=10, gamma=0.1)
svm_10_01.fit(features_train, labels_train)
print("C=10, gamma=0.1", svm_10_01.score(features_validation, labels_validation))

# rbf, C=10, gamma=1
svm_10_1 = SVC(kernel='rbf', C=10, gamma=1)
svm_10_1.fit(features_train, labels_train)
print("C=10, gamma=1", svm_10_1.score(features_validation, labels_validation))

# rbf, C=10, gamma=10
svm_10_10 = SVC(kernel='rbf', C=10, gamma=10)
svm_10_10.fit(features_train, labels_train)
print("C=10, gamma=10", svm_10_10.score(features_validation, labels_validation))

SVM grid search with a radial basis function kernel
C=1, gamma=0.1 0.702247191011236
C=1, gamma=1 0.6966292134831461
C=1, gamma=10 0.6685393258426966
C=10, gamma=0.1 0.7247191011235955
C=10, gamma=1 0.6910112359550562
C=10, gamma=10 0.651685393258427


In [57]:
svm_parameters = {'kernel': ['rbf'],
                  'C': [0.01, 0.1, 1 , 10, 100],
                  'gamma': [0.01, 0.1, 1, 10, 100]
                }
svm = SVC()
svm_gs = GridSearchCV(estimator = svm,
                      param_grid = svm_parameters)
svm_gs.fit(features_train, labels_train)

svm_winner = svm_gs.best_estimator_
svm_winner

svm_winner.score(features_validation, labels_validation)

0.7191011235955056

In [58]:
svm_winner

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# 13.6 Cross validation

In [59]:
svm_gs.cv_results_

{'mean_fit_time': array([0.01245098, 0.01030803, 0.00838737, 0.00820799, 0.00729985,
        0.00674162, 0.00719352, 0.00772071, 0.00829163, 0.00733361,
        0.00742478, 0.00799875, 0.00788393, 0.0083982 , 0.00779495,
        0.00884881, 0.00944495, 0.00974836, 0.01011376, 0.00908861,
        0.01280479, 0.01528444, 0.0100183 , 0.01088009, 0.00922441]),
 'std_fit_time': array([1.88711214e-03, 8.47369147e-04, 3.67374713e-04, 2.82492652e-04,
        5.12734005e-04, 9.25256522e-05, 2.51734588e-04, 1.84984057e-04,
        2.77642775e-04, 2.09819056e-04, 4.43291643e-04, 3.61642128e-04,
        1.16118278e-04, 1.69606935e-04, 1.07591899e-04, 2.96022397e-04,
        7.69834372e-04, 7.44023667e-04, 5.49712353e-04, 3.40014004e-04,
        1.61032865e-03, 2.80923803e-03, 4.40516155e-04, 5.23633501e-04,
        6.33528242e-04]),
 'mean_score_time': array([0.0043118 , 0.00286026, 0.00230074, 0.00219541, 0.00186715,
        0.00189233, 0.00194263, 0.00219169, 0.00228682, 0.00195198,
        0.00

# Exercise 13.1

In [102]:
test_data = pandas.read_csv('test.csv')

In [103]:
test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [104]:
# Cleaning the data
test_data = test_data.drop('Cabin', axis=1)
test_data["Age"] = test_data["Age"].fillna(28.0)

# Catch! The test data has one missing fare. Let's fix that
average_fare = test_data["Fare"].mean()
test_data['Fare'] = test_data['Fare'].fillna(average_fare)

# Preprocessing the data
test_gender_columns = pandas.get_dummies(test_data['Sex'], prefix='Sex')
test_embarked_columns = pandas.get_dummies(test_data["Embarked"], prefix="Embarked")
test_data = pandas.concat([test_data, test_gender_columns], axis=1)
test_data = pandas.concat([test_data, test_embarked_columns], axis=1)
test_data = test_data.drop(['Sex', 'Embarked'], axis=1)

# Another small catch, the test data has no missing 'Embarked' fields. Therefore, the processed test data will not
# have an 'Embarked_Q' column. We need to artificially add one filled with zeros.
test_data['Embarked_U'] = pandas.DataFrame([0 for i in range(len(test_data))])

test_categorized_pclass_columns = pandas.get_dummies(test_data['Pclass'], prefix='Pclass')
test_data = pandas.concat([test_data, test_categorized_pclass_columns], axis=1)
test_data = test_data.drop(['Pclass'], axis=1)

bins = [0, 10, 20, 30, 40, 50, 60, 70, 80]
test_categorized_age = pandas.cut(test_data['Age'], bins)
test_data['Categorized_age'] = categorized_age
test_data = test_data.drop(["Age"], axis=1)

test_cagegorized_age_columns = pandas.get_dummies(test_data['Categorized_age'], prefix='Categorized_age')
test_data = pandas.concat([test_data, test_cagegorized_age_columns], axis=1)
test_data = test_data.drop(['Categorized_age'], axis=1)

test_data = test_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1)
test_data

Unnamed: 0,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,Pclass_1,Pclass_2,Pclass_3,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,0,0,7.8292,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0
1,1,0,7.0000,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0
2,0,0,9.6875,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0
3,0,0,8.6625,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0
4,1,1,12.2875,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0,0,8.0500,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0
414,0,0,108.9000,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0
415,0,0,7.2500,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0
416,0,0,8.0500,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0


Now, to check how many survivors were predicted by each model

In [105]:
# Logistic regression
sum(lr_model.predict(test_data))

153

In [106]:
# Decision tree
sum(dt_model.predict(test_data))

163

In [107]:
# Naive Bayes
sum(nb_model.predict(test_data))

195

In [108]:
# Support vector machine
sum(svm_model.predict(test_data))

61

In [109]:
# Random forest
sum(rf_model.predict(test_data))

154

In [110]:
# Gradient boosting
sum(gb_model.predict(test_data))

156

In [111]:
# AdaBoost
sum(ab_model.predict(test_data))

155

Since the three strongest models in terms of accuracy were random forests, gradient boosting, and adaboost, and they predicted that 154, 156, and 155 passengers survived out of the 418 in the test set, a good estimate for the number of survivors is the average of these three predictions, or 155.