# <center>Titanic: Machine Learning from Disaster<center>

## <center>Best Score : 0.78468<center>

**<font size="5"><center>17 Jul 2019</center></font>**


**Author:** João António - joaoantonio@ua.pt \& github.com/JoaoAnt/.


**The ipybn version and pdf can be found in:** the Github in the WaddlePortfolio/Projects.

**Made in:** Python 3 in the Jupyter Notebook software.

# Importing Data

In [1]:
import pandas as pd
import warnings
import numpy as np
warnings.filterwarnings('ignore')

train = pd.read_csv('train.csv')
holdout = pd.read_csv('test.csv')

In [2]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Defining Functions

In [3]:
# %load functions.py
def process_missing(df):
    """Handle various missing values from the data set
    Usage
    ------
    holdout = process_missing(holdout)
    """
    df["Fare"] = df["Fare"].fillna(train["Fare"].mean())
    df["Embarked"] = df["Embarked"].fillna("S")
    return df

def process_age(df):
    """Process the Age column into pre-defined 'bins' 
    Usage
    ------
    train = process_age(train)
    """
    df["Age"] = df["Age"].fillna(-0.5)
    cut_points = [-1,0,5,12,18,35,60,100]
    label_names = ["Missing","Infant","Child","Teenager","Young Adult","Adult","Senior"]
    df["Age_categories"] = pd.cut(df["Age"],cut_points,labels=label_names)
    return df

def process_fare(df):
    """Process the Fare column into pre-defined 'bins' 
    Usage
    ------
    train = process_fare(train)
    """
    cut_points = [-1,12,50,100,1000]
    label_names = ["0-12","12-50","50-100","100+"]
    df["Fare_categories"] = pd.cut(df["Fare"],cut_points,labels=label_names)
    return df

def process_cabin(df):
    """Process the Cabin column into pre-defined 'bins' 
    Usage
    ------
    train process_cabin(train)
    """
    df["Cabin_type"] = df["Cabin"].str[0]
    df["Cabin_type"] = df["Cabin_type"].fillna("Unknown")
    df = df.drop('Cabin',axis=1)
    return df

def process_titles(df):
    """Extract and categorize the title from the name column 
    Usage
    ------
    train = process_titles(train)
    """
    titles = {
        "Mr" :         "Mr",
        "Mme":         "Mrs",
        "Ms":          "Mrs",
        "Mrs" :        "Mrs",
        "Master" :     "Master",
        "Mlle":        "Miss",
        "Miss" :       "Miss",
        "Capt":        "Officer",
        "Col":         "Officer",
        "Major":       "Officer",
        "Dr":          "Officer",
        "Rev":         "Officer",
        "Jonkheer":    "Royalty",
        "Don":         "Royalty",
        "Sir" :        "Royalty",
        "Countess":    "Royalty",
        "Dona":        "Royalty",
        "Lady" :       "Royalty"
    }
    extracted_titles = df["Name"].str.extract(' ([A-Za-z]+)\.',expand=False)
    df["Title"] = extracted_titles.map(titles)
    return df

def process_fam(df):
    """Process the Family column into pre-defined 'bins' 
    Usage
    ------
    train = process_fam(train)
    """
    df["familysize"] = df[["SibSp","Parch"]].sum(axis=1)
    cut_points = [0,1,2,3,4,5,6,7,8,9,10,20]
    label_names = ["0fam","1fam","2fam","3fam","4fam","5fam","6fam","7fam","8fam","9fam","Manyfam"]
    df["Fam"] = pd.cut(df["familysize"],cut_points,labels=label_names)
    df = df.drop(columns=["familysize","SibSp","Parch"])
    return df

def create_dummies(df,column_name):
    """Create Dummy Columns (One Hot Encoding) from a single Column
    Usage
    ------
    train = create_dummies(train,"Age")
    """
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    df = df.drop(columns=[column_name])
    return df

def PrepareDF(df):
    df = process_missing(df)
    df = process_age(df)
    df = process_fare(df)
    df = process_titles(df)
    df = process_cabin(df)
    df = process_fam(df)
    
    df = create_dummies(df,'Age_categories')
    df = create_dummies(df,'Fare_categories')
    df = create_dummies(df,'Title')
    df = create_dummies(df,'Cabin_type')
    df = create_dummies(df,'Sex')
    df = create_dummies(df,'Fam')
    return df

def allDone():
  display(Audio(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav', autoplay=True))   

# Apply Functions

In [None]:
train = PrepareDF(train)
holdout = PrepareDF(holdout)

In [None]:
for col in train.columns: 
    print(col) 

PassengerId
Survived
Pclass
Name
Age
Ticket
Fare
Embarked
Age_categories_Missing
Age_categories_Infant
Age_categories_Child
Age_categories_Teenager
Age_categories_Young Adult
Age_categories_Adult
Age_categories_Senior
Fare_categories_0-12
Fare_categories_12-50
Fare_categories_50-100
Fare_categories_100+
Title_Master
Title_Miss
Title_Mr
Title_Mrs
Title_Officer
Title_Royalty
Cabin_type_A
Cabin_type_B
Cabin_type_C
Cabin_type_D
Cabin_type_E
Cabin_type_F
Cabin_type_G
Cabin_type_T
Cabin_type_Unknown
Sex_female
Sex_male
Fam_0fam
Fam_1fam
Fam_2fam
Fam_3fam
Fam_4fam
Fam_5fam
Fam_6fam
Fam_7fam
Fam_8fam
Fam_9fam
Fam_Manyfam


# Selecting the Best-Performing Features

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

def select_features(df):
    # Remove non-numeric columns, columns that have null values
    df = df.select_dtypes([np.number]).dropna(axis=1)
    all_X = df.drop(["Survived","PassengerId"],axis=1)
    all_y = df["Survived"]
    
    clf = RandomForestClassifier(random_state=1)
    selector = RFECV(clf,cv=10)
    selector.fit(all_X,all_y)
    
    best_columns = list(all_X.columns[selector.support_])
    print("Best Columns \n"+"-"*12+"\n{}\n".format(best_columns))
    
    return best_columns

cols = select_features(train)

Best Columns 
------------
['Pclass', 'Age', 'Fare', 'Age_categories_Missing', 'Age_categories_Infant', 'Age_categories_Child', 'Age_categories_Teenager', 'Age_categories_Young Adult', 'Age_categories_Adult', 'Age_categories_Senior', 'Fare_categories_0-12', 'Fare_categories_12-50', 'Fare_categories_50-100', 'Fare_categories_100+', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Cabin_type_A', 'Cabin_type_B', 'Cabin_type_C', 'Cabin_type_D', 'Cabin_type_E', 'Cabin_type_F', 'Cabin_type_G', 'Cabin_type_Unknown', 'Sex_female', 'Sex_male', 'Fam_0fam', 'Fam_1fam', 'Fam_2fam', 'Fam_3fam', 'Fam_4fam', 'Fam_5fam', 'Fam_6fam', 'Fam_9fam']



# Selecting and Tuning Different Algorithms

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

def select_model(df,features):
    
    all_X = df[features]
    all_y = df["Survived"]

    # List of dictionaries, each containing a model name,
    # it's estimator and a dict of hyperparameters
    models = [
        {
            "name": "LogisticRegression",
            "estimator": LogisticRegression(),
            "hyperparameters":
                {
                    "solver": ["newton-cg", "lbfgs", "liblinear"]
                }
        },
        {
            "name": "KNeighborsClassifier",
            "estimator": KNeighborsClassifier(),
            "hyperparameters":
                {
                    "n_neighbors": range(1,20,1),
                    "weights": ["distance", "uniform"],
                    "algorithm": ["ball_tree", "kd_tree", "brute"],
                    "p": [1,2]
                }
        },
        {
            "name": "RandomForestClassifier",
            "estimator": RandomForestClassifier(random_state=1),
            "hyperparameters":
                {
                    "n_estimators": range(1,25,1),
                    "criterion": ["entropy", "gini"],
                    "max_depth": range(1,20,1),
                    "max_features": ["log2", "sqrt"],
                    "min_samples_leaf": [1, 5, 8],
                    "min_samples_split": [2, 3, 5]
                }
        }
    ]

    for model in models:
        print(model['name'])
        print('-'*len(model['name']))

        grid = GridSearchCV(model["estimator"],
                            param_grid=model["hyperparameters"],
                            cv=10)
        grid.fit(all_X,all_y)
        model["best_params"] = grid.best_params_
        model["best_score"] = grid.best_score_
        model["best_model"] = grid.best_estimator_

        print("Best Score: {}".format(model["best_score"]))
        print("Best Parameters: {}\n".format(model["best_params"]))

    return models

result = select_model(train,cols)

LogisticRegression
------------------
Best Score: 0.8204264870931538
Best Parameters: {'solver': 'liblinear'}

KNeighborsClassifier
--------------------




Best Score: 0.7755331088664422
Best Parameters: {'algorithm': 'ball_tree', 'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}

RandomForestClassifier
----------------------


# Making a Submission to Kaggle

In [None]:
def save_submission_file(model,cols,filename="submission.csv"):
    holdout_data = holdout[cols]
    predictions = model.predict(holdout_data)
    
    holdout_ids = holdout["PassengerId"]
    submission_df = {"PassengerId": holdout_ids,
                 "Survived": predictions}
    submission = pd.DataFrame(submission_df)

    submission.to_csv(filename,index=False)

best_rf_model = result[2]["best_model"]
save_submission_file(best_rf_model,cols)

allDone()