# Titanic Solution with Pipeline 
- Pipeline (make_pipeline)
- Column Transformers (make_column_transformer)
- Logistic Model
- Basic Transformers (SimpleImputer, KBinsDiscretizer, OneHotEncoder)
- Performance Measure - accuracy_score, log_loss

## Import Libraries

In [19]:
import pandas as pd

import sklearn.model_selection as model_selection

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, FunctionTransformer

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import cross_val_score

## Import Data

In [8]:
# Load the Titanic training and test CSVs from the current working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

## Check Information of Columns

In [9]:
# Summarise each column's dtype and non-null count to spot missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Divide Data into Target and X Variables

In [10]:
# Predictors: everything except the label and the PassengerId column
# (NOTE(review): presumably dropped because it is only a row identifier — confirm).
X = train.drop(columns=['Survived', 'PassengerId'])

# Label to predict.
y = train['Survived']

## Split Titanic Train data into Train and Test Dataset

In [11]:
# Hold out 20% of the labelled rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=200,
)

## Make Feature Classification

In [12]:
# Columns that are excluded from the automatic dtype-based grouping below.
numeric_spl_features = ['Pclass', 'SibSp', 'Parch']
object_spl_features = ['Name', 'Ticket', 'Cabin']

# Integer/float columns (dtype kind 'i' or 'f') that are not listed as special.
numerical_features = [
    column
    for column, column_dtype in X.dtypes.items()
    if column_dtype.kind in ('i', 'f') and column not in numeric_spl_features
]

# All remaining non-numeric columns that are not listed as special.
categorical_features = [
    column
    for column, column_dtype in X.dtypes.items()
    if column_dtype.kind not in ('i', 'f') and column not in object_spl_features
]

# Single-column groups; not referenced elsewhere in the visible notebook.
Pclass_spl_feature = ['Pclass']
cabin_spl_feature = ['Cabin']
name_spl_feature = ['Name']
ticket_spl_feature = ['Ticket']

In [13]:
# Show which columns landed in each feature group.
print(f'Numerical : {numerical_features}')
print(f'Categorical : {categorical_features}')
print(f'Numeric Special : {numeric_spl_features}')

Numerical : ['Age', 'Fare']
Categorical : ['Sex', 'Embarked']
Numeric Special : ['Pclass', 'SibSp', 'Parch']


## Make Column Transformer with Multiple Pipelines

In [22]:
# Continuous columns: fill gaps with the median, then bucket into 4 bins.
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    KBinsDiscretizer(n_bins=4),
)

# Integer-coded columns: one-hot encode each distinct value; categories unseen
# during fit are ignored at transform time instead of raising.
numeric_special_pipeline = make_pipeline(
    OneHotEncoder(categories='auto', handle_unknown='ignore'),
)

# String columns: fill gaps with the most frequent value, then one-hot encode.
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(categories='auto', handle_unknown='ignore'),
)

# Route each feature group through its own pipeline. Columns not listed in any
# group are not passed to a transformer here.
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features),
    (numeric_special_pipeline, numeric_spl_features),
    (categorical_pipeline, categorical_features),
)

## Import Different Models

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

### Store Models in a List

In [16]:
# Candidate estimators to compare. The order matters: this list is paired
# positionally with `classifiers_names` when the models are evaluated.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    SVC(C=0.025, kernel='rbf', probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
]

In [38]:
# Display labels for the candidate estimators, one per line, in the same order
# as the `classifiers` list they are zipped with.
classifiers_names = """\
KNeighborsClassifier(3)
SVC(kernel="rbf", C=0.025, probability=True)
NuSVC(probability=True)
DecisionTreeClassifier()
RandomForestClassifier()
AdaBoostClassifier()
GradientBoostingClassifier()
""".splitlines()

## Check Train and Test Scores

In [30]:
# Baseline model: logistic regression behind the shared preprocessing.
# `logModel` was never defined anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel (Restart & Run All); define it explicitly here.
logModel = make_pipeline(preprocessor, LogisticRegression())

# 7-fold cross-validated accuracy within the training split.
train_scores = cross_val_score(logModel, X_train, y_train, cv = 7)

# 7-fold cross-validated accuracy within the held-out split.
# NOTE(review): this refits the model on folds of X_test instead of scoring a
# model fitted on X_train — confirm this is the intended "test" measure.
test_scores = cross_val_score(logModel, X_test, y_test, cv = 7)

# Report mean accuracy and its spread across the folds.
print(f'Train Accuracy : {train_scores.mean():.3f} +/- {train_scores.std():.2f}')
print(f'Test Accuracy : {test_scores.mean():.3f} +/- {test_scores.std():.2f}')

Train Accuracy : 0.774 +/- 0.03
Test Accuracy : 0.686 +/- 0.06


In [None]:
# Two independent, initially empty lists reserved for per-model scores.
col_train_scores, col_test_scores = [], []

In [42]:
# Cross-validate every candidate behind the same preprocessing and print its
# mean accuracy (with spread) on each split, plus the train-minus-test gap.
# NOTE(review): the "Test" figure is a 7-fold CV run inside X_test, not the
# score of a model fitted on X_train — confirm this is intended.
for clf, clf_label in zip(classifiers, classifiers_names):
    pipe = make_pipeline(preprocessor, clf)
    train_scores = cross_val_score(pipe, X_train, y_train, cv=7)
    test_scores = cross_val_score(pipe, X_test, y_test, cv=7)

    train_mean = train_scores.mean()
    test_mean = test_scores.mean()

    print(clf_label)
    print(f'Train Accuracy : {train_mean:.3f} +/- {train_scores.std():.2f}')
    print(f'Test Accuracy : {test_mean:.3f} +/- {test_scores.std():.2f}')
    print(f'Diff : {train_mean - test_mean:.3f} ')

KNeighborsClassifier(3)
Train Accuracy : 0.806 +/- 0.03
Test Accuracy : 0.704 +/- 0.06
Diff : 0.102 
SVC(kernel="rbf", C=0.025, probability=True)
Train Accuracy : 0.625 +/- 0.00
Test Accuracy : 0.581 +/- 0.01
Diff : 0.044 
NuSVC(probability=True)
Train Accuracy : 0.822 +/- 0.03
Test Accuracy : 0.754 +/- 0.06
Diff : 0.068 
DecisionTreeClassifier()
Train Accuracy : 0.774 +/- 0.03
Test Accuracy : 0.681 +/- 0.06
Diff : 0.093 
RandomForestClassifier()
Train Accuracy : 0.809 +/- 0.03
Test Accuracy : 0.720 +/- 0.05
Diff : 0.089 
AdaBoostClassifier()
Train Accuracy : 0.800 +/- 0.02
Test Accuracy : 0.754 +/- 0.06
Diff : 0.046 
GradientBoostingClassifier()
Train Accuracy : 0.813 +/- 0.03
Test Accuracy : 0.721 +/- 0.06
Diff : 0.092 


In [43]:
# Final candidate: NuSVC behind the shared preprocessing. In the recorded
# comparison run it tied for the highest "Test Accuracy" (0.754).
bestModel = make_pipeline(preprocessor,  NuSVC(probability=True))

In [44]:
#Train_Score
train_scores = cross_val_score(bestModel, X_train, y_train, cv = 7)

#Test Score
test_scores = cross_val_score(bestModel, X_test, y_test, cv = 7)

#Print Train and Test Score
print(f'Train Accuracy : {train_scores.mean():.3f} +/- {train_scores.std():.2f}')
print(f'Test Accuracy : {test_scores.mean():.3f} +/- {test_scores.std():.2f}')

Train Accuracy : 0.822 +/- 0.03
Test Accuracy : 0.754 +/- 0.06


## Fit Model 

In [45]:
# Fit the chosen pipeline on the training split; as the cell's last expression,
# the fitted pipeline's repr is displayed.
bestModel.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                           

## Extract only X Variables for Prediction

In [46]:
# Remove only the identifier column from the test file before predicting,
# then display the remaining column names.
X_submission = test.drop(columns=['PassengerId'])
X_submission.columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')

## Make Prediction and Export CSV

In [47]:
# Predict a label for every submission row, then cast the labels to int.
predicted_labels = bestModel.predict(X_submission)
prediction = predicted_labels.astype(int)

In [48]:
#Create Submission dataframe 
submission_df = pd.DataFrame({'PassengerId' : test['PassengerId'], 'Survived' : prediction})

submission_df.to_csv('NuSVC_Model_Pipeline.csv', index = False)

In [None]:
## Scored 0.77033 when submitted to Kaggle