In [2]:
#Packages
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import re #Regular Expressions
%matplotlib inline

In [24]:
#import
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [25]:
#Combine train & test
titanic = pd.concat([train,test], sort = False)
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [26]:
#Check for missing values
titanic.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [27]:
#Treat Numericals
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic['Fare'] = titanic['Fare'].fillna(titanic['Fare'].median())

In [28]:
#Treat Embarked
#titanic['Embarked'].value_counts().idxmax()
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].value_counts().idxmax())

In [29]:
#Treat Cabin
titanic['Cabin'] = titanic['Cabin'].fillna('Missing')

In [30]:
titanic['Cabin'] = titanic['Cabin'].str[0]
titanic['Cabin'].value_counts()

M    1014
C      94
B      65
D      46
E      41
A      22
F      21
G       5
T       1
Name: Cabin, dtype: int64

In [31]:
#Encode Sex 
titanic['Sex'] = titanic['Sex'].map({'male': 1, 'female': 0})
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    1309 non-null int64
Survived       891 non-null float64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null int64
Age            1309 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1309 non-null float64
Cabin          1309 non-null object
Embarked       1309 non-null object
dtypes: float64(3), int64(5), object(4)
memory usage: 132.9+ KB


### New Features

In [18]:
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1

titanic['IsAlone'] = 0
titanic.loc[titanic['FamilySize'] == 1, 'IsAlone'] = 1

In [33]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    1309 non-null int64
Survived       891 non-null float64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null int64
Age            1309 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1309 non-null float64
Cabin          1309 non-null object
Embarked       1309 non-null object
dtypes: float64(3), int64(5), object(4)
memory usage: 132.9+ KB


In [35]:
#Extract Title
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

In [36]:
titanic['Title'] = titanic['Name'].apply(get_title)

In [38]:
titanic.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0.0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,M,S,Mr


In [39]:
#Reduce Categories
titanic['Title'] = titanic['Title'].replace(['Capt', 'Col','Dr', 'Major', 'Rev'], 'Officer')
titanic['Title'] = titanic['Title'].replace(['Lady', 'Countess','Don', 'Sir', 'Jonkheer', 'Dona'], 'Royalty')
titanic['Title'] = titanic['Title'].replace(['Mlle', 'Ms'], 'Miss')
titanic['Title'] = titanic['Title'].replace('Mme', 'Mrs')

In [40]:
titanic['Title'].value_counts()

Mr         757
Miss       264
Mrs        198
Master      61
Officer     23
Royalty      6
Name: Title, dtype: int64

In [42]:
#Extract Ticket
def get_ticket(ticket):
    txt = ticket.replace("/","")
    txt = txt.replace(".","")
    txt = txt.upper()
    
    if txt.isdigit():
        return 'xxx'
    else:
        title_search = re.search(r'[A-Z]+[A-Z0-9]+|[A-Z]|[A-Za-z]', txt)
        # If the title exists, extract and return it.
        if title_search:
            return title_search.group(0)
        return ""

In [43]:
titanic['Ticket'] = titanic['Ticket'].apply(get_ticket)

In [44]:
titanic['Ticket'].value_counts()

xxx        957
PC          92
CA          68
A5          28
SOTONOQ     24
SCPARIS     19
WC          15
STONO       14
A4          10
FCC          9
C            8
SOC          8
SOPP         7
STONO2       7
SCAH         5
WEP          4
LINE         4
PP           4
SOTONO2      3
FC           3
PPP          2
SC           2
SCA4         2
SWPP         2
AQ3          1
AS           1
CASOTON      1
SCA3         1
A            1
AQ4          1
LP           1
STONOQ       1
SOP          1
SCOW         1
FA           1
SP           1
Name: Ticket, dtype: int64

### Prepare Data for Modelling

In [45]:
titanic_all = titanic.drop(['PassengerId','Name','SibSp','Parch'],axis=1)

In [46]:
titanic_dummies=pd.get_dummies(titanic_all, drop_first=True)
titanic_dummies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 55 columns):
Survived          891 non-null float64
Pclass            1309 non-null int64
Sex               1309 non-null int64
Age               1309 non-null float64
Fare              1309 non-null float64
Ticket_A4         1309 non-null uint8
Ticket_A5         1309 non-null uint8
Ticket_AQ3        1309 non-null uint8
Ticket_AQ4        1309 non-null uint8
Ticket_AS         1309 non-null uint8
Ticket_C          1309 non-null uint8
Ticket_CA         1309 non-null uint8
Ticket_CASOTON    1309 non-null uint8
Ticket_FA         1309 non-null uint8
Ticket_FC         1309 non-null uint8
Ticket_FCC        1309 non-null uint8
Ticket_LINE       1309 non-null uint8
Ticket_LP         1309 non-null uint8
Ticket_PC         1309 non-null uint8
Ticket_PP         1309 non-null uint8
Ticket_PPP        1309 non-null uint8
Ticket_SC         1309 non-null uint8
Ticket_SCA3       1309 non-null uint8
Ticket_SCA4    