In [1]:
from sklearn import svm, datasets
from sklearn.linear_model import LogisticRegression, Lasso, RidgeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn import metrics

import re

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Load training and test data.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

In [3]:
#Feature Engineering; after EDA, extract title from feature 'name', and created new feature.
def extract_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ''
train_data['Title'] = train_data['Name'].apply(extract_title)
test_data['Title'] = test_data['Name'].apply(extract_title)
title_mapping = {
    'Capt': 'Officer', 'Col': 'Officer', 'Major': 'Officer', 
    'Dr': 'Rare', 'Rev': 'Rare', 'Don': 'Rare', 'Sir': 'Noble', 
    'Mme': 'Mrs', 'Mlle': 'Miss', 'Ms': 'Miss', 'Lady': 'Noble',
    'Jonkheer': 'Noble', 'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Miss', 'Master': 'Master'
}

train_data['Title'] = train_data['Title'].map(title_mapping)
test_data['Title'] = test_data['Title'].map(title_mapping)



In [4]:
#Separated categorical and numerical values.
y = train_data.Survived
X = train_data.drop(columns=['Survived','PassengerId','Name','Ticket','Cabin'])
X_test = test_data.drop(columns=['PassengerId','Name','Ticket','Cabin'])
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include=['object']).columns

In [5]:
#Preprocessed variables
for i in range(1000):
    X.loc[np.random.choice(X.index),np.random.choice(X.columns)] = np.nan

x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.25)

cat_vals = Pipeline([("imputer",SimpleImputer(strategy='most_frequent')), ("ohe",OneHotEncoder(sparse_output = False, drop='first', handle_unknown='ignore'))])
num_vals = Pipeline([("imputer",SimpleImputer(strategy='mean')), ("scale",StandardScaler())])

preprocess = ColumnTransformer(
    transformers=[
        ("cat_preprocess", cat_vals, cat_cols),
        ("num_preprocess", num_vals, num_cols)
    ]
)

In [6]:
#Created a range of 400 evenly spaced values between 1 and 200 for alpha in certain regression models.
alpha_range = np.linspace(1, 200, num=400)

In [7]:
#For practice, created robust Pipeline using various models and hyperparameters as a search space.

pipeline = Pipeline([("preprocess",preprocess), 
                     ("regr",LogisticRegression())])
search_space = [{'regr': [LogisticRegression(solver='liblinear',max_iter=2000)], 'regr__penalty': ['l1', 'l2'], 'regr__C': [0.01,0.1,1,10,100], 'regr__fit_intercept': [True,False]},
                {'regr':[RidgeClassifier()],
                     'regr__alpha': alpha_range},
               {'regr': [RandomForestClassifier()], 
                 'regr__n_estimators': [50, 100, 200], 
                 'regr__max_depth': [None, 10, 20, 30, 50, 70], 
                 'regr__min_samples_split': [2, 5, 10], 
                 'regr__min_samples_leaf': [1, 2, 4],
                 'regr__bootstrap': [True, False]}, 
                {'regr': [AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1))], 
                 'regr__n_estimators': [50, 100, 200],
                 'regr__learning_rate': [0.01, 0.1, 1, 10]}]

In [8]:
gs = GridSearchCV(pipeline, search_space, scoring='accuracy', cv=5)


In [9]:
gs.fit(x_train, y_train)

In [10]:
best_pipeline = gs.best_estimator_

In [11]:
best_regression_model = best_pipeline.named_steps['regr']
print('The best regression model is:')
print(best_regression_model)

The best regression model is:
RandomForestClassifier(bootstrap=False, max_depth=70, min_samples_leaf=4,
                       n_estimators=50)


In [12]:
best_model_hyperparameters = best_regression_model.get_params()
print('The hyperparameters of the regression model are:')
print(best_model_hyperparameters)

The hyperparameters of the regression model are:
{'bootstrap': False, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 70, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [13]:
score_model = best_pipeline.score(x_test,y_test)
print('The score:')
print(score_model)

The score:
0.8161434977578476


In [14]:
predictions = best_pipeline.predict(X_test)

In [15]:
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
