# Task 2 example solution

In [136]:
import os

# get current working directory
cwd = os.getcwd()

# Import pandas library with alias pd
import pandas as pd

# Load up our dataset, due to solution file location we need to go one directory up
df = pd.read_csv('{}/../datasets/titanic.csv'.format(cwd))

In [137]:
# Look at a chunk of data
df[100:200]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
100,101,0,3,"Petranec, Miss. Matilda",female,28.0,0,0,349245,7.8958,,S
101,102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S
102,103,0,1,"White, Mr. Richard Frasar",male,21.0,0,1,35281,77.2875,D26,S
103,104,0,3,"Johansson, Mr. Gustaf Joel",male,33.0,0,0,7540,8.6542,,S
104,105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37.0,2,0,3101276,7.9250,,S
105,106,0,3,"Mionoff, Mr. Stoytcho",male,28.0,0,0,349207,7.8958,,S
106,107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S
107,108,1,3,"Moss, Mr. Albert Johan",male,,0,0,312991,7.7750,,S
108,109,0,3,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S
109,110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.1500,,Q


In [138]:
# Check gender distribution on the ship, add a label "count" for the result
df.groupby('Sex').size().reset_index(name='count')

Unnamed: 0,Sex,count
0,female,314
1,male,579


## Hypothesis - people residing in 1st class cabin had higher survival chance

In [139]:
# We want to first check for duplicates

# Let's look at our duplicate values
df[df.duplicated(['Name']) == True]

# Drop duplicate rows without creating a copy of our DataFrame (inplace=True)
df = df.drop_duplicates('Name')

# Check if any duplicates are left
df[df.duplicated(['Name']) == True]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [140]:
# For missing data relevant columns for the current task seem to be: Survived, Pclass, Age
filter_missing = df['Survived'].isnull() | df['Pclass'].isnull() | df['Age'].isnull()

df[filter_missing]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
29,30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C
32,33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.7500,,Q
36,37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C
42,43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C


In [141]:
# It seems that the only missing data is for Age
# We will create another DataFrame where these rows are removed to do age based computations
df_age = df[~df['Age'].isnull()]

df_age

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S


In [142]:
# Oh wait? Age is a float, we really don't like that for visibility
# Tip: it's a common occurence when you are dealing with missing values, values in the column are not the same type.

# Let's reset field type to int
df_age['Age'] = df_age['Age'].astype('int64')

df_age

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.0500,,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7000,G6,S


In [143]:
# Calculate average age of passengers, output it with 2 decimal places
round(float(df_age['Age'].mean()), 2)

29.68

In [151]:
# Remember to use df again, we don't need to filter missing Age data for the following tasks
filtered_3rd_class_survivors = (df['Survived'] == 1) & (df['Pclass'] == 3)

len(df[filtered_3rd_class_survivors])

119

In [186]:
survivors_by_class = df[df['Survived'] == 1].groupby('Pclass').count()['Survived']

In [187]:
total_passengers_by_class = df.groupby('Pclass').count()['Survived']

In [188]:
round(survivors_by_class / total_passengers_by_class * 100, 2)

Pclass
1    62.96
2    47.28
3    24.24
Name: Survived, dtype: float64

## Our hypothesis has been confirmed with the previous calculation

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 893 entries, 0 to 892
Data columns (total 12 columns):
PassengerId    893 non-null int64
Survived       893 non-null int64
Pclass         893 non-null int64
Name           893 non-null object
Sex            893 non-null object
Age            716 non-null float64
SibSp          893 non-null int64
Parch          893 non-null int64
Ticket         893 non-null object
Fare           893 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.8+ KB
