In [2]:
import numpy as np
import pandas as pd

## Reading the data

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train.set_index('PassengerId', inplace = True)
test.set_index('PassengerId', inplace = True)

In [6]:
train.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [7]:
features_to_drop = ['Name', 'Ticket', 'Cabin']
train.drop(columns = features_to_drop, inplace = True)
test.drop(columns = features_to_drop, inplace = True)

In [8]:
train.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


In [9]:
train.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [10]:
test.isna().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

## Taking a look at the features

TODO

## Creating a function to estimate different models. 
Because the score in the competition is calculated on the percentage of passengers that were correctly predicted, we need to use accuracy_score metric

In [16]:
def model_estimation(y_true, y_pred):
    from sklearn.metrics import accuracy_score
    return accuracy_score(y_true, y_pred)

## Creating a pipeline

In [15]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

train.dropna(subset = ['Embarked'], inplace = True)

num_features = ['Age', 'Fare']
num_pipe = Pipeline(steps = [
    ('imputer', SimpleImputer())
])

cat_features = ['Sex', 'Embarked']
cat_pipe = Pipeline(steps = [
    ('cat_encoder', OneHotEncoder())
])

column_transformer = ColumnTransformer(
    transformers = [
        ('num_transform', num_pipe, num_features), 
        ('cat_transform', cat_pipe, cat_features)
    ]
)

model = Pipeline(
    steps = [
        ('preprocessor', column_transformer), 
        ('model', RandomForestClassifier())
    ]
)

## Testing different models on the pipeline

In [98]:
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

models = {
    'linearSVC': LinearSVC(dual = 'auto', random_state = 0),
    'KNeighbors': KNeighborsClassifier(),
    'SVC': SVC(random_state = 0),
    'GradientBoosting': GradientBoostingClassifier(random_state = 1), 
    'RandomForest': RandomForestClassifier(random_state = 1)
}


models_metrics = {}

for model in models:
    clf = Pipeline(
        steps = [
            ('preprocessor', column_transformer), 
            ('model', models[model])
        ])

    clf.fit(X_train, y_train)
    y_preds = clf.predict(X_test)

    models_metrics[model] = round(model_estimation(y_test, y_preds), 4)

models_metrics

{'linearSVC': 0.8146,
 'KNeighbors': 0.7135,
 'SVC': 0.736,
 'GradientBoosting': 0.8427,
 'RandomForest': 0.8258}

### Scores on different models
1. linearSVC: 0.8146,
2. KNeighbors: 0.7135
3. SVC: 0.736,
4. GradientBoosting: `0.8427`
5. RandomForest: `0.8258`

As we can see, ensemble classifiers (GradientBoosting and RandomForest) appear to have bigger accuracy score. These are the models that we'll be trying to further improve. 
- We have to be carefull though, because the high score can possibly mean that the model is overtrained

### Improving models

In [52]:
def create_pipeline_model(clf):
    model = Pipeline(
        steps = [
            ('preprocessor', column_transformer), 
            ('model', clf)
        ])
    return model

Improving RandomForest

In [53]:
from sklearn.model_selection import GridSearchCV

model = create_pipeline_model(RandomForestClassifier(random_state = 1))

pipe_grid = {
    'preprocessor__num_transform__imputer__strategy': ['mean', 'median'],
    'model__n_estimators': [100, 250, 300, 400, 1000],
    'model__max_depth': [None, 5],
    'model__max_features': [None], 
    'model__min_samples_split': [2, 4]
}

gs_model = GridSearchCV(model, pipe_grid, cv = 4, verbose = 2)
y_preds = gs_model.fit(X_train, y_train).predict(X_test)

gs_RandomForest_metrics = model_estimation(y_test, y_preds)

Fitting 4 folds for each of 40 candidates, totalling 160 fits
[CV] END model__max_depth=None, model__max_features=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num_transform__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num_transform__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num_transform__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num_transform__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num_transform__imputer__strategy=median; total time=   0.2s
[CV] END model__max_d

In [101]:
# gs_model.score(X_test, y_test)
gs_RandomForest_metrics

0.8426966292134831

Improving GradientBoosting model

In [86]:
from sklearn.model_selection import RandomizedSearchCV

model = create_pipeline_model(GradientBoostingClassifier(random_state = 1))

gb_grid_search_pipe = {
    'preprocessor__num_transform__imputer__strategy': ['mean', 'median'],
    'model__n_estimators': [100, 200, 300, 400, 500], 
    'model__loss': ['log_loss', 'exponential'],
    'model__max_features': [None, 'sqrt', 'log2'],
    "model__learning_rate": [0.01, .2, 0.075, 0.1],
    'model__min_samples_split': [2, 4, 6]
}

gb_model = GridSearchCV(model, gb_grid_search_pipe, verbose = 4, cv = 2)
y_preds = gb_model.fit(X_train, y_train).predict(X_test)

gs_GradientBoosting_metrics = model_estimation(y_test, y_preds)

Fitting 2 folds for each of 720 candidates, totalling 1440 fits
[CV 1/2] END model__learning_rate=0.01, model__loss=log_loss, model__max_features=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num_transform__imputer__strategy=mean;, score=0.753 total time=   0.1s
[CV 2/2] END model__learning_rate=0.01, model__loss=log_loss, model__max_features=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num_transform__imputer__strategy=mean;, score=0.766 total time=   0.0s
[CV 1/2] END model__learning_rate=0.01, model__loss=log_loss, model__max_features=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num_transform__imputer__strategy=median;, score=0.753 total time=   0.0s
[CV 2/2] END model__learning_rate=0.01, model__loss=log_loss, model__max_features=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num_transform__imputer__strategy=median;, score=0.766 total time=   0.0s
[CV 1/2] END model__learning

In [89]:
gs_GradientBoosting_metrics

0.8258426966292135

In [100]:
# y_preds = gs_model.predict(test)
# y_preds

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [111]:
# solution = pd.DataFrame(test.index, y_preds)

In [116]:
# solution = pd.DataFrame({'PassengerId': test.index, 'Transported': y_preds})
# solution.to_csv('solution.csv')