# **Acquiring data**

In [1]:
import pandas as pd
import numpy as np

# acquire train data and test data
train_df = pd.read_csv('../input/titanic/train.csv')
test_df = pd.read_csv('../input/titanic/test.csv')
full_data = [train_df, test_df]

# **Processing and analyzing data**

In [2]:
# check data types
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
# preview the data
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**1. Pclass**

There is no missing value in this feature and we can observe significant correlation between the Pclass and the Survived, so we should include the Pclass feature in our model.

In [4]:
train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


**2. Name**

Names are unique and may not directly contribute to the Survival, but the different titles in the names may be classified and checked on the correlation with the survival rate.

In [5]:
# extract title from name
for dataset in full_data:
    dataset['Title'] = dataset.Name.str.extract('([A-Za-z]+)\.', expand=False)

pd.crosstab(train_df['Title'], train_df['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [6]:
# classify the titles and check the correlation with survival rate
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Capt', 'Col', 'Countess', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Rev', 'Sir'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


**3. Sex**

The number of males is much more than the number of females both in the train data and the test data, but it can be confirmed that females have very high survival rate.

In [7]:
train_df[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


**4. Age**

There are about 20% missing values in the Age data, so we may generate random data ranged between (mean-std) and (mean+std), then try to categorize the Age into 4 ranges and check the survival rate of each range in the train data.

In [8]:
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
    
train_df['Age_range'] = pd.cut(train_df['Age'], 4)
train_df[['Age_range', 'Survived']].groupby(['Age_range'], as_index=False).mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,Age_range,Survived
0,"(-0.08, 20.0]",0.423963
1,"(20.0, 40.0]",0.369352
2,"(40.0, 60.0]",0.398601
3,"(60.0, 80.0]",0.227273


**5. SibSp and Parch**

Based on the SibSp which indicates the number of Siblings/Spouses aboard and the Parch which indicates the number of Parents/Children aboard, we can create a new feature called FamilySize which combines the SibSp and the Parch.

In [9]:
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean()

Unnamed: 0,FamilySize,Survived
0,1,0.303538
1,2,0.552795
2,3,0.578431
3,4,0.724138
4,5,0.2
5,6,0.136364
6,7,0.333333
7,8,0.0
8,11,0.0


We may try to create another feature called IsAlone and check the corresponding survival rate.

In [10]:
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    
train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.303538


**6. Ticket**

Ticket has high ratio of duplicates and is little correlated with the survival rate, so it may be not included in our analysis.

**7. Fare**

As Fare has one missing value in the test data, first we try to generate a median number to replace the missing one, then categorize the ticket data in the train data into 4 ranges to check the survival rate.

In [11]:
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)

train_df['Fare_range'] = pd.qcut(train_df['Fare'], 4)
train_df[['Fare_range', 'Survived']].groupby(['Fare_range'], as_index=False).mean()

Unnamed: 0,Fare_range,Survived
0,"(-0.001, 7.91]",0.197309
1,"(7.91, 14.454]",0.303571
2,"(14.454, 31.0]",0.454955
3,"(31.0, 512.329]",0.581081


**8. Cabin**

As Cabin feature has more than 70% missing values both in train data and test data, and one cabin is shared by many passengers, it is considered to be excluded from our analysis.

**9. Embarked**

Embarked feature takes S, C, Q values based on port of embarkation. It has 2% missing values in the train data, and we can try to fill the missing values with the most occurred value S (72% and 65% in train data and test data respectively).

In [12]:
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    
train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


Now we try to covert categorical values into numerical values, and then drop unnecessary features.

In [13]:
for dataset in full_data:
    
    # mapping Title
    dataset['Title'] = dataset['Title'].map({"Master": 1, "Miss": 2, "Mr": 3, "Mrs": 4, "Rare": 5})
    dataset['Title'] = dataset['Title'].fillna(0)
    
    # mapping Sex
    dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)
    
    # mapping Age
    dataset.loc[dataset['Age'] <= 20, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 20) & (dataset['Age'] <= 40), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 40) & (dataset['Age'] <= 60), 'Age'] = 2
    dataset.loc[dataset['Age'] > 60, 'Age'] = 3
    
    # mapping Fare
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31.0), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 31.0, 'Fare'] = 3
    
    # mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
    
drop_list = ['Name', 'SibSp', 'Parch', 'FamilySize', 'Ticket', 'Cabin']
train_df = train_df.drop(drop_list, axis = 1)
train_df = train_df.drop(['PassengerId', 'Age_range', 'Fare_range'], axis = 1)

test_df = test_df.drop(drop_list, axis = 1)

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone
0,0,3,1,1,0.0,0,3,0
1,1,1,0,1,3.0,1,4,0
2,1,3,0,1,1.0,0,2,1
3,1,1,0,1,3.0,0,4,0
4,0,3,1,1,1.0,0,3,1


# **Model and prediction**

Now we are ready to build and train a model to perform prediction. As we aim at identifying a relationship between multiple inputs (such as Pclass, Sex, Age...) and the output (i.e., survival), we'll try a few common used machine learning models such as Logistic Regression, SVM, Random Forest, KNN, and Decision Tree.

In [14]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# prepare train data and test data for model
X_train = train_df.drop('Survived', axis = 1)
Y_train = train_df['Survived']
X_test = test_df.drop('PassengerId', axis = 1).copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 7), (891,), (418, 7))

In [15]:
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred_logreg = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train)*100, 2)

# SVM
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred_svm = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train)*100, 2)

# Random Forest
random_forest = RandomForestClassifier(n_estimators = 100)
random_forest.fit(X_train, Y_train)
Y_pred_random_forest = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_train, Y_train)*100, 2)

# KNN
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred_KNN = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train)*100, 2)

# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred_decision_tree = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train)*100, 2)

# model evaluation
models = pd.DataFrame({'Model': ['Logistic Regression', 'SVM', 'Random Forest', 'KNN', 'Decision Tree'], 'Score': [acc_log, acc_svc, acc_random_forest, acc_knn, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
2,Random Forest,87.09
4,Decision Tree,87.09
3,KNN,83.5
1,SVM,82.04
0,Logistic Regression,78.68


According to the above comparison, we may choose Random Forest or Decision Tree Classifier to predict the data.

In [16]:
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': Y_pred_decision_tree})
submission.to_csv('submission.csv', index=False)
print('Your submission was successfully saved!')

Your submission was successfully saved!
