In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [27]:
# Load the passenger table from the CSV next to the notebook and preview
# its first five rows.
titanic = pd.read_csv(filepath_or_buffer="titanic_data.csv")
titanic.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [28]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [29]:
# Treat passenger class as an ordered category. The category list runs from
# lowest to highest rank, so the resulting order is 3 < 2 < 1.
class_category = pd.CategoricalDtype(categories=[3, 2, 1], ordered=True)
titanic['Pclass'] = titanic['Pclass'].astype(class_category)

In [30]:
# Frequency of each SibSp value, counting missing values as well.
titanic['SibSp'].value_counts(dropna=False)

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [31]:
# Ordered category covering every integer from 0 through 8 (0 < 1 < ... < 8),
# including counts that do not occur in the data.
sibling_levels = list(range(9))
sibling_category = pd.CategoricalDtype(categories=sibling_levels, ordered=True)
titanic['SibSp'] = titanic['SibSp'].astype(sibling_category)

In [32]:
# Sex becomes an unordered two-level category.
sex_category = pd.CategoricalDtype(categories=['male', 'female'])
titanic['Sex'] = titanic['Sex'].astype(sex_category)

In [53]:
# Inspect the converted SibSp column and its ordered categories.
titanic['SibSp']

0      1
1      1
2      0
3      1
4      0
      ..
886    0
887    0
888    1
889    0
890    0
Name: SibSp, Length: 891, dtype: category
Categories (9, int64): [0 < 1 < 2 < 3 ... 5 < 6 < 7 < 8]

In [34]:
# Check column dtypes after the categorical conversions above.
titanic.dtypes

PassengerId       int64
Survived          int64
Pclass         category
Name             object
Sex            category
Age             float64
SibSp          category
Parch             int64
Ticket           object
Fare            float64
Cabin            object
Embarked         object
dtype: object

In [35]:
# Pclass is an ordered category ranked 3 < 2 < 1, so "greater than 3" means
# "ranked above 3rd class": this selects the 2nd- and 1st-class passengers.
titanic.loc[titanic['Pclass'] > 3]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


In [36]:
# Bucket passengers into coarse age groups. pd.cut uses right-closed
# intervals by default, so the buckets are (0, 18], (18, 40], (40, 65] and
# (65, 120]; a missing Age yields a missing Age Range.
age_bins = [0, 18, 40, 65, 120]
age_bin_labels = ['Child', 'Adult', 'Middle Aged', 'Senior']

age_range = pd.cut(titanic['Age'], bins=age_bins, labels=age_bin_labels)

# DataFrame.insert raises ValueError when the column already exists, which
# made this cell fail when run a second time. Insert the column right after
# 'Age' the first time; on later runs just refresh its values in place.
if 'Age Range' in titanic.columns:
    titanic['Age Range'] = age_range
else:
    titanic.insert(loc=titanic.columns.get_loc('Age') + 1, column='Age Range', value=age_range)

In [37]:
# Confirm the new 'Age Range' column is present and categorical.
titanic.dtypes

PassengerId       int64
Survived          int64
Pclass         category
Name             object
Sex            category
Age             float64
Age Range      category
SibSp          category
Parch             int64
Ticket           object
Fare            float64
Cabin            object
Embarked         object
dtype: object

In [38]:
# Spot-check the binning on a few random rows. A fixed random_state keeps the
# preview identical across "Restart & Run All".
titanic[['Age', 'Age Range']].sample(5, random_state=0)

Unnamed: 0,Age,Age Range
549,8.0,Child
246,25.0,Adult
540,36.0,Adult
722,34.0,Adult
826,,


In [39]:
# Survival rate of passengers whose age group ranks above 'Middle Aged'
# (the labels produced by pd.cut are ordered, so this is the 'Senior' group).
older_than_middle_aged = titanic['Age Range'] > 'Middle Aged'
titanic.loc[older_than_middle_aged, 'Survived'].mean()

0.125

In [40]:
# Survival statistics for every (age group, class, sex) combination.
#
# Changes from the previous version:
# * 'Survived' is selected *before* aggregating, so text columns such as Name
#   and Ticket are never fed to mean/sum (pandas >= 2.0 raises on that instead
#   of silently dropping them).
# * The aggregations are given by name rather than as numpy callables.
# * observed=False is explicit so that every combination of the categorical
#   keys is kept, including combinations with no passengers.
# * The group keys stay in the index here; a later cell moves them to columns.
survivability_by_age_class_sex = (
    titanic
    .groupby(['Age Range', 'Pclass', 'Sex'], observed=False)['Survived']
    .agg(['mean', 'count', 'sum'])
    .sort_values(by='mean', ascending=False)
    .rename(columns={'mean': 'pct survived', 'sum': 'no of survivors', 'count': 'total no of people'})
)

In [41]:
# Number of (group, statistic) cells: rows are group combinations, columns are the three statistics.
survivability_by_age_class_sex.shape

(24, 3)

In [42]:
# Move the group keys out of the index into ordinary columns. The guard makes
# the cell safe to re-run: once the keys are columns the index has a single
# level, and another reset_index would add a spurious 'index' column.
if survivability_by_age_class_sex.index.nlevels > 1:
    survivability_by_age_class_sex = survivability_by_age_class_sex.reset_index()

In [43]:
# Display the summary, highest survival rate first.
survivability_by_age_class_sex

Unnamed: 0,Age Range,Pclass,Sex,pct survived,total no of people,no of survivors
0,Child,2,female,1.0,14,14
1,Adult,1,female,0.979167,48,47
2,Middle Aged,1,female,0.961538,26,25
3,Adult,2,female,0.914894,47,43
4,Child,1,female,0.909091,11,10
5,Middle Aged,2,female,0.846154,13,11
6,Child,1,male,0.8,5,4
7,Child,2,male,0.6,15,9
8,Child,3,female,0.511628,43,22
9,Adult,3,female,0.48,50,24


In [44]:
# Correlation (Series.corr, default method) between the class label and the
# per-group survival rate.
group_pclass = survivability_by_age_class_sex['Pclass']
group_pct_survived = survivability_by_age_class_sex['pct survived']
group_pclass.corr(group_pct_survived)

-0.5064272698396611

In [45]:
# Frequency of each embarkation code, counting missing values as well.
titanic['Embarked'].value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

In [46]:
# Expand the one-letter port codes to full port names.
# Series.replace leaves values that are not keys of the mapping untouched, so
# re-running this cell is harmless. Series.map with a dict would turn every
# already-expanded name into NaN on a second run.
titanic['Embarked'] = titanic['Embarked'].replace(
    {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'}
)

In [47]:
# Preview the expanded port names.
titanic['Embarked'].head()

0    Southampton
1      Cherbourg
2    Southampton
3    Southampton
4    Southampton
Name: Embarked, dtype: object

In [48]:
# Group passengers by port of embarkation; as_index=False keeps 'Embarked'
# as a regular column in aggregated results.
embark_group = titanic.groupby(by=['Embarked'], as_index=False)

In [49]:
# Mean of the 0/1 Survived flag per port, i.e. the survival rate per port.
embark_group['Survived'].mean()

Unnamed: 0,Embarked,Survived
0,Cherbourg,0.553571
1,Queenstown,0.38961
2,Southampton,0.336957
