In [1]:
import pandas as pd

#Preprocessing 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

#Models
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

#Train-Test Split and Grid search 
from sklearn.model_selection import train_test_split, GridSearchCV

# Pipeline
from sklearn.pipeline import Pipeline   # Sequentially apply a list of transformations
from sklearn.compose import ColumnTransformer # Applies in parallel transformations to columns
from sklearn.preprocessing import FunctionTransformer # it makes functions compatible with scikit-learn pipelines

#Model Accuracy
from sklearn.metrics import confusion_matrix, accuracy_score


# The Titanic dataset

**Goal** is the predict the whether the passenger survived or not.

**Numerical features**:
- Age
- Fare

**Categorical features**:
- Sex
- Ticket
- Cabin
- Embarked (Port of Embarkation: C, Q, and S)

**Ordinal features**:
- Pclass (passenger class)
- SibSp (number of siblings / spouses aboard the Titanic)
- Parch (number of parents / children aboard the Titanic)

In [2]:
#Reading data and viewing top 5 rows
path = '/Users/liamhettinger/Documents/Portfolio_work/Data/titanic.csv'
titanic = pd.read_csv(path)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#values counts for rows
for col in titanic.columns.drop(['PassengerId','Name']):
    print(titanic[col].value_counts())
    print('_' * 50)

Survived
0    549
1    342
Name: count, dtype: int64
__________________________________________________
Pclass
3    491
1    216
2    184
Name: count, dtype: int64
__________________________________________________
Sex
male      577
female    314
Name: count, dtype: int64
__________________________________________________
Age
24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: count, Length: 88, dtype: int64
__________________________________________________
SibSp
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: count, dtype: int64
__________________________________________________
Parch
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: count, dtype: int64
__________________________________________________
Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      

## Creating functions that create new features

Creating functions `get_title` and `get_family_size` to get Title and Family size for each passengers

**Title feature:** A function that extract the title (Mr, Mrs, Miss, Dr, etc) from the name

In [4]:
titanic.Name.head()

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

In [5]:
titanic.Name[0].split(",")[1].split('.')[0].strip()

'Mr'

In [6]:
# Title feature
title = titanic.Name.apply(lambda x: x.split(",")[1].split(".")[0].strip())

In [7]:
title.value_counts()

Name
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: count, dtype: int64

In [8]:
# replace the less common titles with 'Other'
less_common_titles = title.value_counts().tail(11).index.to_list()
title[title.isin(less_common_titles)] = 'Other'

In [9]:
title.value_counts()

Name
Mr        517
Miss      182
Mrs       125
Master     40
Other      14
Dr          7
Rev         6
Name: count, dtype: int64

In [10]:
def get_title(dataframe):
    df = dataframe.copy()
    df['Title'] = df.Name.apply(lambda x: x.split(",")[1].split(".")[0].strip())
    df.loc[df.Title.isin(less_common_titles),'Title'] = 'Other'
    return df

In [11]:
# check that it works
get_title(titanic).Title.value_counts()

Title
Mr        517
Miss      182
Mrs       125
Master     40
Other      14
Dr          7
Rev         6
Name: count, dtype: int64

**Family size feature:** A function that computes the size of a family.

In [12]:
# family size
family_size = titanic.SibSp+titanic.Parch+1
family_size

0      2
1      2
2      1
3      2
4      1
      ..
886    1
887    1
888    4
889    1
890    1
Length: 891, dtype: int64

In [13]:
def get_family_size(dataframe):
    df = dataframe.copy()
    df['Family_size'] = df.SibSp + df.Parch + 1
    return df

In [14]:
# check that it works
get_family_size(titanic).Family_size.head()

0    2
1    2
2    1
3    2
4    1
Name: Family_size, dtype: int64

Build a transformer that applies our two functions `get_title` and `get_family_size` in a a pipeline, use `FunctionTransformer`.

In [15]:
# Functions for the pipeline
family_size_processor = FunctionTransformer(get_family_size)
title_processor = FunctionTransformer(get_title)

 ## Column transformers and pipelines

In [16]:
# numerical features pipeline: impute+scale
numeric_features = ['Age', 'Fare']
numeric_processor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
    ])

In [17]:
# categorical_features pipeline: impute+encode
categorical_features = ['Embarked', 'Sex','Title']
categorical_processor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder())])

In [18]:
# ordinal_features pipeline: impute
ordinal_features = ['Family_size', 'Pclass']
ordinal_processor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

In [19]:
feature_processor = ColumnTransformer(
    transformers=[
        ('num', numeric_processor, numeric_features),
        ('cat', categorical_processor, categorical_features),
        ('ord', ordinal_processor, ordinal_features)],
         remainder='drop') # drop 'Name', 'SibSp', 'Parch' columns

## KNN model 

`get_title` function, `get_family_size` function, `preprocessor`, `poly_features` and `model`

In [20]:
# Defining X and Y
feature_cols = ['Name','Age','Fare','Sex','Embarked','Pclass','SibSp','Parch']
X = titanic[feature_cols] 
y = titanic.Survived

In [21]:
# train/test split
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [22]:
#defining model
knn_clf = KNeighborsClassifier()

In [23]:
pipe = Pipeline(steps=[('get family_size', family_size_processor), #function created above
                           ('get title', title_processor), #function created above
                           ('preprocessor', feature_processor), #Feature engineering
                           ('poly_features', PolynomialFeatures(degree=2)), # add polynomial combinations of the features
                           ('clf',knn_clf)   #model
                          ])

In [24]:
#establishing parameters
param_grid = { 
    'clf__n_neighbors': list(range(1,21)), #Number of neighbors in the KNN model.
}

In [25]:
# instantiate and fit the grid
grid = GridSearchCV(pipe, param_grid, cv=10, scoring='accuracy', n_jobs=-1, verbose=1)

In [26]:
#fitting grid search
grid.fit(X_train, y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


In [27]:
# view the results
pd.DataFrame(grid.cv_results_).sort_values(by='mean_test_score', ascending = False)[['mean_test_score', 'params']].head()

Unnamed: 0,mean_test_score,params
16,0.820488,{'clf__n_neighbors': 17}
14,0.819019,{'clf__n_neighbors': 15}
15,0.818996,{'clf__n_neighbors': 16}
13,0.818973,{'clf__n_neighbors': 14}
4,0.817481,{'clf__n_neighbors': 5}


In [28]:
# best hyper-parameters
# 2 params have same mean_test_score. The params with the least n_neighbors was used
grid.best_params_

{'clf__n_neighbors': 17}

In [29]:
# best accuracy
grid.best_score_

0.8204884667571234

In [30]:
# best predictor
best_clf = grid.best_estimator_

### Test the model

In [31]:
y_test_pred = best_clf.predict(X_test)

In [32]:
clf_confusion = confusion_matrix(y_test,y_test_pred)
clf_confusion

array([[133,   9],
       [ 26,  55]])

In [33]:
clf_accuracy = accuracy_score(y_test,y_test_pred)
clf_accuracy

0.8430493273542601

## XGBoost

In [34]:
#Defining model
xgb = XGBClassifier()

In [35]:
#Replacing KNN model with XGB model
pipe.steps.pop(4)
pipe.steps.append(['XGB',xgb])
pipe

- **learning_rate** Boosting assigns feature weights. The learning rate shrinks those weights to prevent overfitting. Step size shrinkage. Prevents overfitting. range: [0,1]
- **max_depth** Maximum depth of a tree. Larger number of trees can cause overfitting. 0 means no limit. range: [0,∞]

In [36]:
#estabilishing parameters
params = {
    "XGB__learning_rate": [.1, .2, .3], 
    "XGB__max_depth": [2, 5, 10, 15]
    }

In [37]:
# instantiate and fit the grid
grid = GridSearchCV(pipe, params, cv=10, scoring='accuracy', n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


In [38]:
# view the results
pd.DataFrame(grid.cv_results_).sort_values(by='mean_test_score', ascending = False)[['mean_test_score', 'params']].head()

Unnamed: 0,mean_test_score,params
1,0.818951,"{'XGB__learning_rate': 0.1, 'XGB__max_depth': 5}"
8,0.812913,"{'XGB__learning_rate': 0.3, 'XGB__max_depth': 2}"
0,0.811578,"{'XGB__learning_rate': 0.1, 'XGB__max_depth': 2}"
4,0.807033,"{'XGB__learning_rate': 0.2, 'XGB__max_depth': 2}"
10,0.793419,"{'XGB__learning_rate': 0.3, 'XGB__max_depth': 10}"


In [39]:
# best hyper-parameters
# 2 params have same mean_test_score. The params with the least n_neighbors was used
grid.best_params_

{'XGB__learning_rate': 0.1, 'XGB__max_depth': 5}

In [40]:
# best accuracy
grid.best_score_

0.8189507010402532

In [41]:
# best predictor
best_xgb = grid.best_estimator_

In [42]:
#predicting y_test data
y_test_pred = best_xgb.predict(X_test)

In [43]:
#KNN confusion matrix
clf_confusion

array([[133,   9],
       [ 26,  55]])

In [44]:
#XGB confusion matrix
confusion_matrix(y_test,y_test_pred)

array([[131,  11],
       [ 21,  60]])

In [45]:
#KNN accuracy score
clf_accuracy

0.8430493273542601

In [46]:
#XGB accuracy score
accuracy_score(y_test,y_test_pred)

0.8565022421524664