In [3]:
import pandas as pd
import numpy as np

# import classes for imputation
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# import extra classes for modelling
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

np.random.seed(0)

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

### Filter Variables and store Names

In [5]:
# categorical variables
# those of type 'Object' in the dataset
features_categorical = [c for c in train.columns if train[c].dtypes=='O' and c !='Name' and c !='Ticket' and c !='Cabin']
features_categorical

['Sex', 'Embarked']

In [6]:
# numerical variables
# those different from object and also excluding the target SalePrice
features_numerical = [c for c in train.columns if train[c].dtypes!='O' and c !='PassengerId' and c !='Survived']
features_numerical

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

### Divide Dataset into Train and Test

In [8]:
#create X and y datasets for splitting 
X = train.drop(['Survived', 'Name', 'PassengerId', 'Ticket', 'Cabin'], axis=1)
y = train['Survived']

In [9]:
#import train_test_split library
from sklearn.model_selection import train_test_split

In [10]:
# create train test split
X_train, X_test, y_train, y_test = train_test_split( X,  y, test_size=0.3, random_state=0)  

### Build Numerical and Categorical Pipeline

In [11]:
#Create numeric pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'median')),
])

In [12]:
#Create categorical pipeline
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

### Combine both tranformations

In [14]:
#create preprocessor 
preprocessor = ColumnTransformer(
transformers=[
    ('numerical', numeric_transformer, features_numerical),
    ('categorical', categorical_transformer, features_categorical)
])

### Add Logistic Regression Model to the Pipeline

In [15]:
#build pipeline to build model
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='liblinear'))
])

In [16]:
#parameter grid
param_grid = {
    'preprocessor__numerical__imputer__strategy': ['mean', 'median'],
    'preprocessor__categorical__imputer__strategy': ['most_frequent', 'constant'],
    'classifier__max_iter': [100,200,300],
}

grid_search = GridSearchCV(clf, param_grid, cv = 5, iid = False, n_jobs = -1, scoring = 'r2')

In [17]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
857,1,male,51.0,0,0,26.55,S
52,1,female,49.0,1,0,76.7292,C
386,3,male,1.0,5,2,46.9,S
124,1,male,54.0,0,1,77.2875,S
578,3,female,,1,0,14.4583,C


In [18]:
#fit model
grid_search.fit(X_train, y_train)

print(("best model: %.3f" % grid_search.score(X_train, y_train)))

best model: 0.176


In [19]:
grid_search.best_params_

{'classifier__max_iter': 100,
 'preprocessor__categorical__imputer__strategy': 'most_frequent',
 'preprocessor__numerical__imputer__strategy': 'mean'}

In [20]:
grid_search.cv_results_['mean_test_score']

array([0.15585311, 0.15585311, 0.15585311, 0.15585311, 0.15585311,
       0.15585311, 0.15585311, 0.15585311, 0.15585311, 0.15585311,
       0.15585311, 0.15585311])

In [21]:
# and finally let's check the performance over the test set
print(("Test Score: %.3f" % grid_search.score(X_test, y_test)))

Test Score: 0.155


In [22]:
sub_test_pred = grid_search.predict(test)

In [23]:
AllSub = pd.DataFrame({ 'PassengerId': test['PassengerId'],
                       'Survived' : sub_test_pred
    
})

AllSub.to_csv("Pipeline_LR_Sel_Var1.csv", index = False)