In [1]:
import os
from pathlib import Path

import pandas as pd

#Load files
# Folder holding train.csv / test.csv. It defaults to the original local folder and can be
# redirected with the TITANIC_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = Path(os.environ.get("TITANIC_DATA_DIR", "/home/manu/Documents/Data Science/Projects/Titanic"))

X_full = pd.read_csv(DATA_DIR / "train.csv", index_col = "PassengerId")
X_test_full = pd.read_csv(DATA_DIR / "test.csv", index_col = "PassengerId")


Data analysis and feature engineering is based on Kaggle Notebook: 
https://www.kaggle.com/zlatankr/titanic-random-forest-82-78

# General analysis

In [2]:
# Column dtypes and non-null counts of the training data
X_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [3]:
# Column dtypes and non-null counts of the test data
X_test_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


In [4]:
# Number of rows per value of the target column
X_full['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [5]:
# Mean of the target column, i.e. the overall survival rate
X_full['Survived'].mean()

0.3838383838383838

In [6]:
# First 10 rows of the training data
X_full.head(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


### General analysis conclusions

1) Survival rate: 38.38% (342/891)<br>
2) Columns with missing values in training data: Age, Cabin, Embarked / in test data: Age, Fare, Cabin<br>
3) Categorical Columns: Name, Sex, Ticket, Cabin, Embarked<br>

# Data Analysis + feature engineering + Imputation

1) Feature by feature, the impact of each one on the target will be analyzed (Data Analysis)
2) Meaningful new features will be created where possible (Feature engineering)
3) Missing values will be imputed where needed

### Pclass

In [7]:
# Survival rate per passenger class
X_full.groupby('Pclass')['Survived'].mean()

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

Pclass conclusions:
Pclass is a strong predictor as first and second class survival rate is way above General survival rate
and there is survival rate decrement in every class.

### Name

From the name, two new features will be created and validated as predictors: Name length (more
important people had longer names and better survival chances) and Name title (the title also tells about marriage/family status and social standing)

In [8]:
#Name length
X_full['Name_length'] = X_full['Name'].map(len)

# Quartile bin of every passenger's name length, computed once and reused below
length_bins = pd.qcut(X_full['Name_length'],4)
survived_by_bin = X_full['Survived'].groupby(length_bins)

# One row per length bin, in ascending bin order
df_aux = pd.DataFrame(index = length_bins.unique().sort_values())
df_aux.index.name = 'Length group'
df_aux['Cases in group'] = survived_by_bin.count()
df_aux['Survival rate'] = survived_by_bin.mean()

df_aux

Unnamed: 0_level_0,Cases in group,Survival rate
Length group,Unnamed: 1_level_1,Unnamed: 2_level_1
"(11.999, 20.0]",243,0.230453
"(20.0, 25.0]",215,0.325581
"(25.0, 30.0]",211,0.364929
"(30.0, 82.0]",222,0.626126


There is a clear relation: the longer the name, the higher the survival rate, which proves Name_length to
be a useful feature/predictor

In [9]:
#Name title

# 'Surname, Title. Given names' -> 'Title'
X_full['Name_title'] = X_full['Name'].map(lambda full_name: full_name.split(',')[1].split('.')[0].strip())

survived_by_title = X_full['Survived'].groupby(X_full['Name_title'])

# One row per distinct title, ordered from highest to lowest survival rate
df_aux = pd.DataFrame(index = X_full['Name_title'].unique())
df_aux.index.name = "Name_Tittle"
df_aux['Qty of cases'] = survived_by_title.count()
df_aux['Survival rate'] = survived_by_title.mean()
df_aux = df_aux.sort_values(by = 'Survival rate', ascending = False)

df_aux

Unnamed: 0_level_0,Qty of cases,Survival rate
Name_Tittle,Unnamed: 1_level_1,Unnamed: 2_level_1
Ms,1,1.0
Mlle,2,1.0
the Countess,1,1.0
Mme,1,1.0
Lady,1,1.0
Sir,1,1.0
Mrs,125,0.792
Miss,182,0.697802
Master,40,0.575
Col,2,0.5


There is a clearly high survival rate for titles with many cases, like 'Mrs' or 'Miss', and a low
survival rate for titles like 'Mr'; therefore Name title proves to be a good predictor.
Titles with fewer than 5 cases could be grouped in an 'Other' group and a flag column 'Other title' could be added.
For the moment the 'Other' approach won't be taken

In [10]:
def Name_Engineering (train, test):
    """Replace the 'Name' column with two derived columns, in both DataFrames.

    New columns:
    1) Name_length: number of characters in the full name
    2) Name_title: the text between the first comma and the following period, stripped

    Both DataFrames are modified in place ('Name' is dropped) and returned as (train, test).

    >>> Name: 'Braund, Mr. Owen Harris' --> Name_length: 23 , Name_title: 'Mr'"""

    def _title_of(full_name):
        # 'Surname, Title. Given names' -> 'Title'
        after_surname = full_name.split(',')[1]
        return after_surname.split('.')[0].strip()

    for frame in (train, test):
        names = frame['Name']
        frame['Name_length'] = names.map(len)
        frame['Name_title'] = names.map(_title_of)
        frame.drop(columns='Name', inplace=True)
    return train, test

### Sex

In [11]:
# Survival rate per sex
X_full.groupby('Sex')['Survived'].mean()

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

There is a high survival rate for females over males which makes Sex a good predictor as it is

### Age

In [12]:
# Survival rate per age quintile (5 equal-frequency bins)
age_quintile = pd.qcut(X_full['Age'], 5)
X_full['Survived'].groupby(age_quintile).mean()

Age
(0.419, 19.0]    0.481707
(19.0, 25.0]     0.328467
(25.0, 31.8]     0.393701
(31.8, 41.0]     0.437500
(41.0, 80.0]     0.373239
Name: Survived, dtype: float64

There isn't a clear trend of survival rate per age as I would expect. I will combine the age with other
features like Sex, SibSp and Parch to see if there is a relationship at a more detailed level

In [13]:
#Checking Age survival rate per sex
DF_males = X_full.loc[X_full['Sex'] == 'male']
DF_females = X_full.loc[X_full['Sex'] == 'female']

# Age quintiles are computed separately inside each sex
male_rates = DF_males['Survived'].groupby(pd.qcut(DF_males['Age'],5)).mean()
print('Males survival rate\n', male_rates)

female_rates = DF_females['Survived'].groupby(pd.qcut(DF_females['Age'],5)).mean()
print('Females survival rate \n', female_rates)

Males survival rate
 Age
(0.419, 20.0]    0.284314
(20.0, 26.0]     0.122222
(26.0, 32.0]     0.258824
(32.0, 42.0]     0.181818
(42.0, 80.0]     0.170455
Name: Survived, dtype: float64
Females survival rate 
 Age
(0.749, 17.0]    0.690909
(17.0, 24.0]     0.758065
(24.0, 30.0]     0.707317
(30.0, 40.0]     0.836364
(40.0, 63.0]     0.770833
Name: Survived, dtype: float64


Still no clear relation between age - Sex - Survival rate is noticed. Will be kept as it might be useful in
deeper levels in relations with other features but a side test will be performed dropping it to see how 
it performs.

In [14]:
#Age missing values imputing
def Age_impute (train, test):
    """Fill missing 'Age' values with the train median age of the row's ('Pclass', 'Sex') group.

    A flag column 'Age_Null_Flag' is added first (1 = the age was missing, 0 = original value).
    Both DataFrames are modified in place and returned as (train, test).
    The medians always come from ``train``, so every ('Pclass', 'Sex') combination that needs
    imputing must exist in ``train``; otherwise a KeyError is raised.

    Example >>> if the median age for 'male' and Pclass 2 in train is 26, then a male/Pclass 2 row
    with NaN age gets Age = 26 and Age_Null_Flag = 1"""
    #Median Age per (Pclass, Sex) group in train, as a {(Pclass, Sex): median} lookup
    medians = train.groupby(['Pclass', 'Sex'])['Age'].median().to_dict()
    for frame in [train, test]:
        #Plain numpy mask: rows are selected by position, not by index label, so the
        #imputation stays correct even when the index contains duplicated labels
        missing = frame['Age'].isnull().to_numpy()
        #Add null flag column
        frame['Age_Null_Flag'] = missing.astype(int)
        if missing.any():
            #One (Pclass, Sex) key per row that needs a value, in row order
            groups = zip(frame.loc[missing, 'Pclass'], frame.loc[missing, 'Sex'])
            frame.loc[missing, 'Age'] = [medians[group] for group in groups]
    return train, test

### SibSp

In [15]:
#SibSp

# Passenger count and survival rate per SibSp value, built straight from one groupby
# aggregation (groupby returns the groups sorted by key). A pre-built sorted index is not
# used because ndarray.sort() sorts in place and returns None.
DF_aux = (
    X_full.groupby('SibSp')['Survived']
    .agg(['count', 'mean'])
    .rename(columns = {'count': 'Qty', 'mean': 'Survival rate'})
)
DF_aux.index.name = 'SibSp'
DF_aux

Unnamed: 0_level_0,Qty,Survival rate
SibSp,Unnamed: 1_level_1,Unnamed: 2_level_1
0,608,0.345395
1,209,0.535885
2,28,0.464286
3,16,0.25
4,18,0.166667
5,5,0.0
8,7,0.0


### Parch

In [16]:
# Passenger count and survival rate per Parch value, built straight from one groupby
# aggregation (groupby returns the groups sorted by key). A pre-built sorted index is not
# used because ndarray.sort() sorts in place and returns None.
DF_aux = (
    X_full.groupby('Parch')['Survived']
    .agg(['count', 'mean'])
    .rename(columns = {'count': 'Qty', 'mean': 'Survival rate'})
)
DF_aux.index.name = 'Parch'
DF_aux

Unnamed: 0_level_0,Qty,Survival rate
Parch,Unnamed: 1_level_1,Unnamed: 2_level_1
0,678,0.343658
1,118,0.550847
2,80,0.5
3,5,0.6
4,4,0.0
5,5,0.2
6,1,0.0


Those with 1 or 2 Parents/Children or Siblings/Spouses have a higher survival rate than those with a higher number.
These features will be left as they are to be used in the model

### Ticket

Two features will be created from the ticket:
1) Ticket Length: Ticket length might indicate the type of ticket, and therefore the position of the passenger and how 
                  close to the ship surface they were, which might have impacted the survival rate
2) Ticket first letter: Might also indicate the ticket type

In [17]:
#Ticket Length

# Number of characters of every ticket, computed once and used as the grouping key
ticket_length = X_full['Ticket'].str.len()

# Count and survival rate per ticket length, built from one groupby aggregation
# (already sorted by key); ndarray.sort() returns None, so it cannot supply the index.
DF_aux = (
    X_full.groupby(ticket_length)['Survived']
    .agg(['count', 'mean'])
    .rename(columns = {'count': 'Qty of cases', 'mean': 'Survival rate'})
)
DF_aux.index.name = 'Length'

DF_aux

Unnamed: 0_level_0,Qty of cases,Survival rate
Length,Unnamed: 1_level_1,Unnamed: 2_level_1
3,2,0.0
4,101,0.366337
5,131,0.618321
6,419,0.319809
7,27,0.296296
8,76,0.539474
9,26,0.192308
10,41,0.341463
11,8,0.25
12,10,0.4


In [18]:
#Ticket first letter

# First character of every ticket, computed once and used as the grouping key
ticket_first_letter = X_full['Ticket'].str[0]

# Count and survival rate per first character, built from one groupby aggregation
# (already sorted by key); ndarray.sort() returns None, so it cannot supply the index.
DF_aux = (
    X_full.groupby(ticket_first_letter)['Survived']
    .agg(['count', 'mean'])
    .rename(columns = {'count': 'Qty of cases', 'mean': 'Survival rate'})
)
DF_aux.index.name = 'First Letter'

DF_aux

Unnamed: 0_level_0,Qty of cases,Survival rate
First Letter,Unnamed: 1_level_1,Unnamed: 2_level_1
1,146,0.630137
2,183,0.464481
3,301,0.239203
4,10,0.2
5,3,0.0
6,6,0.166667
7,9,0.111111
8,2,0.0
9,1,1.0
A,29,0.068966


In both cases (length and first letter) there are certain values which have a higher or lower survival rates than 
the average population therefore there is some predictive power from these new features

In [19]:
def Ticket_engineering (train, test):
    """Replace the 'Ticket' column with 'Ticket_length' and 'Ticket_1Letter' in both DataFrames.

    Ticket_length is the number of characters of the ticket and Ticket_1Letter is its first
    character. Both DataFrames are modified in place and returned as (train, test).
    Ticket example: 'A/5 21171' >>> Ticket_length = 9 and Ticket_1Letter = 'A' """
    for frame in (train, test):
        tickets = frame['Ticket']
        frame['Ticket_length'] = tickets.map(len)
        frame['Ticket_1Letter'] = tickets.map(lambda code: code[0])
        frame.drop(columns='Ticket', inplace=True)
    return train, test

### Fare

In [20]:
def Fare_imputing (train, test):
    """Fill missing 'Fare' values with the train median fare of the row's ('Pclass', 'Sex') group.

    No flag column is added. Both DataFrames are modified in place and returned as (train, test).
    The medians always come from ``train``, so every ('Pclass', 'Sex') combination that needs
    imputing must exist in ``train``; otherwise a KeyError is raised.

    Example >>> if the median Fare for 'male' and Pclass 2 in train is 13, then a male/Pclass 2 row
    with NaN Fare will be imputed with 13"""
    #Median Fare per (Pclass, Sex) group in train, as a {(Pclass, Sex): median} lookup
    medians = train.groupby(['Pclass', 'Sex'])['Fare'].median().to_dict()
    for frame in [train, test]:
        #Plain numpy mask: rows are selected by position, not by index label, so the
        #imputation stays correct even when the index contains duplicated labels
        missing = frame['Fare'].isnull().to_numpy()
        if missing.any():
            #One (Pclass, Sex) key per row that needs a value, in row order
            groups = zip(frame.loc[missing, 'Pclass'], frame.loc[missing, 'Sex'])
            frame.loc[missing, 'Fare'] = [medians[group] for group in groups]
    return train, test

### Cabin

In [21]:
# Missing vs total Cabin entries in the training data
cabin_col = X_full['Cabin']
print("Null values: ", cabin_col.isna().sum())
print("Total values: ", len(cabin_col))

Null values:  687
Total values:  891


In [22]:
# Survival rate of passengers without / with a recorded cabin
has_cabin = X_full['Cabin'].notnull()
print("Cabin Nan survival rate: ",round(X_full.loc[~has_cabin, 'Survived'].mean(),2))
print("Cabin Not Nan survival rate: ", round(X_full.loc[has_cabin, 'Survived'].mean(),2))

Cabin Nan survival rate:  0.3
Cabin Not Nan survival rate:  0.67


The survival rate for passengers with a Cabin number is much higher than for those without one, so maybe the cabins
on lower levels had no numbers and the survival chance on lower levels was lower, making the cabin a possibly useful feature.<br> <br>

Two features will be created: <br>
1) Cabin first letter<br>
2) Cabin Number (With number intervals or 0 for those without data)

In [23]:
def cabin_1letter (train, test):
    """Add a 'Cabin_1letter' column holding the first character of 'Cabin' ('0' when Cabin is NaN).

    When a passenger has several cabins in the same string, the first character of the whole
    string (i.e. of the first cabin) is kept. Both DataFrames are modified in place and returned."""
    def _first_letter(cabin):
        if pd.isnull(cabin):
            return '0'
        return cabin[0]

    for frame in (train, test):
        frame['Cabin_1letter'] = frame['Cabin'].map(_first_letter)
    return train, test

In [24]:
def cabin_number (train, test):
    """Add a 'Cabin_number' column: the number of the last listed cabin, binned into intervals.

    The number is whatever follows the first character of the last space-separated cabin.
    NaN cabins and cabins without a number count as 0 and fall in the special interval (-0.1, 0.1].
    Both DataFrames are modified in place and returned as (train, test)."""
    def _number_of(cabin):
        if pd.isnull(cabin):
            return 0
        # Last cabin of the string, without its leading letter
        digits = cabin.split(' ')[-1][1:]
        return int(digits) if digits != '' else 0

    #Intervals are defined manually so the NaN (0) values get their own interval (-0.1,0.1)
    aux_bins = pd.IntervalIndex.from_tuples([(-0.1,0.1),(0.99, 29), (29, 66), (66, 148)])
    for frame in (train, test):
        frame['Cabin_number'] = pd.cut(frame['Cabin'].map(_number_of), bins = aux_bins)
    return train, test

In [25]:
def cabin_prep (train, test):
    """Run cabin_1letter and then cabin_number on both DataFrames, then drop the 'Cabin' column.

    Returns the (train, test) pair produced by those two steps, without 'Cabin'."""
    train, test = cabin_number(*cabin_1letter(train, test))
    for frame in (train, test):
        frame.drop(columns='Cabin', inplace=True)
    return train, test

### Embarked

In [26]:
# Survival rate per port of embarkation
X_full.groupby('Embarked')['Survived'].mean()

Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64

In [27]:
# Missing 'Embarked' values in each data set (the second line reports the test data)
print("Embarked missing values in train data: ", X_full['Embarked'].isnull().sum())
print("Embarked missing values in test data: ", X_test_full['Embarked'].isnull().sum())

Embarked missing values in train data:  2
Embarked missing values in train data:  0


People who embarked in Cherbourg (C) had a higher survival rate, so this feature is a useful predictor.
Due to the few missing values (only two), most-frequent imputation will be used

In [28]:
from sklearn.impute import SimpleImputer
def embarked_prep (train,test):
    """Fill missing 'Embarked' values in both DataFrames with a most-frequent imputer.

    The imputer is fitted on train only and then applied to train and test.
    Both DataFrames are modified in place and returned as (train, test)."""
    port_col = ['Embarked']
    mode_filler = SimpleImputer(strategy = 'most_frequent').fit(train[port_col])
    for frame in (train, test):
        frame[port_col] = mode_filler.transform(frame[port_col])
    return train, test
        

# Encoding

Encoding all categorical columns with the One-Hot Encoding method

In [29]:
from sklearn.preprocessing import OneHotEncoder

def One_Hot_cat (train, test):
    """One-hot encode all the categorical variables and remove the original columns.

    Categorical columns are the 'object' and 'category' dtype columns of ``train``; the same
    columns must exist in ``test``. The encoder is fitted on ``train`` only and categories unseen
    during fitting are ignored at transform time (handle_unknown = 'ignore').
    Blank spaces and ']' are stripped from the generated column names.

    Returns new (train, test) DataFrames: the non-categorical columns in their original order,
    followed by the one-hot columns."""
    cat_cols = train.select_dtypes(include=['object','category']).columns

    # Newer scikit-learn releases renamed `sparse` to `sparse_output`; fall back to the old
    # keyword when the installed version does not know the new one
    try:
        OH_encoder = OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)
    except TypeError:
        OH_encoder = OneHotEncoder(handle_unknown = 'ignore', sparse = False)
    OH_encoder.fit(train[cat_cols])

    # Likewise `get_feature_names` was replaced by `get_feature_names_out`
    if hasattr(OH_encoder, 'get_feature_names_out'):
        raw_names = OH_encoder.get_feature_names_out(cat_cols)
    else:
        raw_names = OH_encoder.get_feature_names(cat_cols)

    #Removing the blank spaces and ']' in column names as they cause problems to XGBoost
    OH_names = [cname.replace(" ","").replace("]","") for cname in raw_names]

    def _encode(frame):
        # The encoder returns a bare array: restore the row index and the column names,
        # then swap the original categorical columns for the encoded ones
        encoded = pd.DataFrame(OH_encoder.transform(frame[cat_cols]), index = frame.index, columns = OH_names)
        return pd.concat([frame.drop(columns = cat_cols), encoded], axis = 1)

    return _encode(train), _encode(test)


# Pre-processing 

Preprocessing the DataFrames

In [30]:
# Apply every preparation step in order; each step takes and returns the (train, test) pair
preprocessing_steps = [
    Name_Engineering,
    Age_impute,
    Ticket_engineering,
    Fare_imputing,
    cabin_prep,
    embarked_prep,
    One_Hot_cat,
]

for prep_step in preprocessing_steps:
    X_full, X_test_full = prep_step(X_full, X_test_full)



# Hyperparameter Tuning

In [31]:
#XGBClassifier

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Split the preprocessed training frame into target and features
y = X_full['Survived']
X = X_full.drop(columns = 'Survived')

# Hyperparameter values explored by the grid search
param_grid = {
    "learning_rate" : [0.01, 0.05, 0.1, 0.15],
    "n_estimators" : [1, 5, 7, 10 , 25 , 50],
}

My_model = XGBClassifier(use_label_encoder=False)

# 5-fold cross-validated accuracy for every parameter combination, using all CPU cores
GS = GridSearchCV(estimator = My_model, param_grid = param_grid, scoring = 'accuracy', cv = 5, n_jobs = -1)

results = GS.fit(X, y)



In [32]:
# Best cross-validated accuracy of the grid search and the parameters that achieved it
print("Best score: " , results.best_score_)
print("Best Parameters: ",results.best_params_)


Best score:  0.8338899001945892
Best Parameters:  {'learning_rate': 0.05, 'n_estimators': 25}


#Best result so far:

My_model = XGBClassifier(n_estimators=25, learning_rate=0.05)
0.8338899001945892

In [33]:
#RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Split the preprocessed training frame into target and features
y = X_full['Survived']
X = X_full.drop(columns = 'Survived')

# Hyperparameter values explored by the grid search
param_grid = {
    "criterion" : ["gini", "entropy"],
    "min_samples_leaf" : [1, 5, 10],
    "min_samples_split" : [2, 4, 10, 12, 16],
    "n_estimators": [50, 100, 400, 700, 1000],
}

My_model = RandomForestClassifier(random_state = 0)

# 5-fold cross-validated accuracy for every parameter combination, using all CPU cores
GS = GridSearchCV(estimator = My_model, param_grid = param_grid, scoring = 'accuracy', cv = 5, n_jobs = -1)

results = GS.fit(X, y)





In [34]:
# Best cross-validated accuracy of the grid search and the parameters that achieved it
print("Best score: " , results.best_score_)
print("Best Parameters: ",results.best_params_)


Best score:  0.8384219446362439
Best Parameters:  {'criterion': 'gini', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 50}


# Getting predictions

In [40]:
#fit model and get predictions

# Refit the configuration selected by the RandomForest grid search
# (criterion='gini', min_samples_leaf=1, min_samples_split=4, n_estimators=50) with the same
# random_state used during tuning, so the submission matches the reported score and is reproducible
My_model = RandomForestClassifier(criterion = 'gini', min_samples_leaf= 1, min_samples_split= 4,
                                  n_estimators= 50, random_state = 0)

My_model.fit(X, y )

#Get prediction for test
Test_prediction = My_model.predict(X_test_full)

#Prepare data to be saved in a CSV (one 'Survived' column, indexed like the test data)

Test_prediction = pd.DataFrame({'Survived':Test_prediction}, index = X_test_full.index)

#save data to CSV 
Test_prediction.to_csv ("/home/manu/Documents/Data Science/Projects/Titanic/result_featureEng.csv")

