# Titanic survival - optimising models with grid search and random search

In [1]:
# Hide warnings (to keep notebook tidy; do not usually do this)
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
# Import machine learning methods
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [4]:
data = pd.read_csv('C:/t_data/processed_data.csv')
# Make all data 'float' type
data = data.astype(float)

In [5]:
# Drop Passengerid (axis=1 indicates we are removing a column rather than a row)
# We drop passenger ID as it is not original data

data.drop('PassengerId', inplace=True, axis=1)

In [6]:
#Divide into X (features) and y (labels)
X = data.drop('Survived',axis=1) # X = all 'data' except the 'survived' column
y = data['Survived'] # y = 'survived' column from 'data'

## Standardise data

In [7]:
# Initialise a new scaling object for normalising input data
sc = StandardScaler() 

# Set up the scaler just on the training set
sc.fit(X)

# Apply the scaler to the X data
X_std=sc.transform(X)

## Grid search

Grid serach is a good method so long as the number of paramater combinations is not too high

In [8]:
param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.01, 0.1, 1, 10],
              'class_weight': [{0:0.5, 1:0.5},{0:0.38, 1:0.62}]}

# Class weight is defined as a dictionary with class label and weight.

In the above paraemter grid we have 2 * 4 * 2 parameter combinations = 16

### Run grid search with defined parameters

In [9]:
# Import GridSearch
from sklearn.model_selection import GridSearchCV

# Define model
model = LogisticRegression()

# Define grid search to use 5 k-fold validation, and use 'f1' for accuracy
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1')

# Run grid search
grid_search.fit(X_std, y); #';' suppresses printed output

### Show grid search performance

In [10]:
# show best performance and parameters
# If best parameters are at the extremes of the searches then extend the range

print ('Best performance (f1):')
print (grid_search.best_score_)
print ('Best parameters:')
print (grid_search.best_params_)

Best performance (f1):
0.7355099445362381
Best parameters:
{'C': 0.1, 'class_weight': {0: 0.38, 1: 0.62}, 'penalty': 'l2'}


In [11]:
grid_search.best_estimator_

LogisticRegression(C=0.1, class_weight={0: 0.38, 1: 0.62}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
results = pd.DataFrame(grid_search.cv_results_)
cols_to_show = ['param_penalty','param_C', 'param_class_weight',
                'mean_test_score','rank_test_score' ]
print(results[cols_to_show])

   param_penalty param_C  param_class_weight  mean_test_score  rank_test_score
0             l1    0.01    {0: 0.5, 1: 0.5}         0.000000               15
1             l2    0.01    {0: 0.5, 1: 0.5}         0.714713               13
2             l1    0.01  {0: 0.38, 1: 0.62}         0.000000               15
3             l2    0.01  {0: 0.38, 1: 0.62}         0.730961                2
4             l1     0.1    {0: 0.5, 1: 0.5}         0.709479               14
5             l2     0.1    {0: 0.5, 1: 0.5}         0.722657                8
6             l1     0.1  {0: 0.38, 1: 0.62}         0.724712                6
7             l2     0.1  {0: 0.38, 1: 0.62}         0.735510                1
8             l1       1    {0: 0.5, 1: 0.5}         0.721540                9
9             l2       1    {0: 0.5, 1: 0.5}         0.720458               11
10            l1       1  {0: 0.38, 1: 0.62}         0.723316                7
11            l2       1  {0: 0.38, 1: 0.62}        

When looking at the results, it is worth noting the range of results. Tou may then consider whether it is worth refining the grid search to focus on a narrower area.

## Random search

Random search is very similar to grid search, but randomly selects combinations of parameters to test, with the maximum number of tests given by the `n_iter` argument.

As we've been through the process with grid search, we'll put all our code together here, but note the larger number of parameters defined.

In [13]:
# Import GridSearch
from sklearn.model_selection import RandomizedSearchCV

# Define paraemter grid and maximum number of tests

param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10],
              'class_weight': [{0:0.5, 1:0.5},
                               {0:0.38, 1:0.62},
                               {0:0.62, 1:0.38}],
              'max_iter': [30, 100, 300, 1000]}

n_iter_search = 50

# Define model
model = LogisticRegression()

# Set up random search
random_search = RandomizedSearchCV(model, param_grid, cv=5,
                           n_iter=n_iter_search, scoring='f1')

# Run grid search
random_search.fit(X_std, y); #';' suppresses printed output

# Get and print output
print ('Best performance (f1):')
print (random_search.best_score_)
print ('Best parameters:')
print (random_search.best_params_)

Best performance (f1):
0.7355099445362381
Best parameters:
{'penalty': 'l2', 'max_iter': 100, 'class_weight': {0: 0.38, 1: 0.62}, 'C': 0.1}




Print all tests

In [14]:
results = pd.DataFrame(random_search.cv_results_)
cols_to_show = ['param_penalty','param_C', 'param_class_weight',
                'mean_test_score','rank_test_score' ]
print(results[cols_to_show])

   param_penalty param_C  param_class_weight  mean_test_score  rank_test_score
0             l1       3    {0: 0.5, 1: 0.5}         0.721539               13
1             l2    0.03    {0: 0.5, 1: 0.5}         0.731564                2
2             l2     0.3  {0: 0.62, 1: 0.38}         0.714191               21
3             l1       3  {0: 0.62, 1: 0.38}         0.704302               33
4             l1      10    {0: 0.5, 1: 0.5}         0.720458               16
5             l1     0.3    {0: 0.5, 1: 0.5}         0.721433               14
6             l1     0.1  {0: 0.38, 1: 0.62}         0.724712                7
7             l1     0.1  {0: 0.38, 1: 0.62}         0.724712                7
8             l2      10  {0: 0.62, 1: 0.38}         0.703537               36
9             l2    0.01  {0: 0.62, 1: 0.38}         0.650990               43
10            l2      10  {0: 0.38, 1: 0.62}         0.727679                6
11            l2      10    {0: 0.5, 1: 0.5}        