### Packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

# preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# model selection
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV

# models
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

### Data

Reading in data, dummifying variables, and splitting into training set and validation set

In [2]:
all_train = pd.read_csv("datasets/train.csv").drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
all_train['Pclass'] = all_train['Pclass'].astype(str)
X = all_train.drop('Survived', axis=1)
y = all_train['Survived']

Defining data transformations

In [3]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_features = all_train.select_dtypes(include=['int64', 'float64']).drop('Survived', axis=1).columns

categorical_features = all_train.select_dtypes(include='object').columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cate', categorical_transformer, categorical_features)
    ])

### Models

In [11]:
classifiers = [
    KNeighborsClassifier(3),
    LinearSVC(C=0.025), # I get convergence issues if I use default C = 1
    SVC(),
    NuSVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LinearDiscriminantAnalysis(),
    ]


rows = []
for classifier in classifiers:
    this_dict = dict()
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
    scores = cross_val_score(pipe, X, y, cv=5)  
    this_dict['classifier'] = str(classifier)
    this_dict['CV accuracy'] = round(scores.mean(), 4)
    this_dict['sd'] = round(scores.std(), 4)
    rows.append(this_dict)
    
out_df = pd.DataFrame(rows).sort_values(by='CV accuracy', ascending=False)

out_df

Unnamed: 0,classifier,CV accuracy,sd
7,GradientBoostingClassifier(),0.8306,0.0194
2,SVC(),0.826,0.0178
3,NuSVC(),0.8227,0.0132
6,AdaBoostClassifier(),0.807,0.0335
5,RandomForestClassifier(),0.8059,0.0278
1,LinearSVC(C=0.025),0.7912,0.0244
8,LinearDiscriminantAnalysis(),0.7912,0.0202
0,KNeighborsClassifier(n_neighbors=3),0.7901,0.0213
4,DecisionTreeClassifier(),0.7812,0.0289


Hyperparameter Tuning

In [None]:
gbtree = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', GradientBoostingClassifier())])

param_grid = {
    'classifier__loss': ['deviance', 'exponential'],
    'classifier__learning_rate': [0.1, 0.001, 0.0001],
    'classifier__n_estimators': [100, 200, 300, 400, 500],
    'classifier__criterion': ['friedman_mse', 'mse', 'mae'],
    'classifier__max_depth': [2, 3, 4, 5, 6],
    'classifier__max_features': ['auto', 'sqrt', 'log2']
}


gs = GridSearchCV(gbtree, param_grid, n_jobs= 1)
                  
gs.fit(X, y)  
print(CV.best_params_)    
print(CV.best_score_)