In [1]:
import pandas as pd

In [25]:
# Load train.csv from the working directory into a DataFrame
data = pd.read_csv(filepath_or_buffer="train.csv")

# Exploratory Data Analysis (EDA)

In [26]:
# Preview the first five rows of the loaded data
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [27]:
# Summary statistics for the numeric columns; compare the "count" row across
# columns to spot columns with missing values.
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# Data Munging

**Notice that the Age count (714) is lower than the count of the other columns (891), which means that some Age values are missing.**


What should we do about this? We have a few options:

1. Listwise deletion: delete every observation in which any variable is missing.
2. Pairwise deletion: include all cases in which the variables of interest are present.
3. Mean/median/mode substitution: replace missing values with the mean, median, or mode of the other values in that column.


We'll use option 3 here, filling the missing ages with the median age.


In [28]:
# Fill every missing Age with the median of the observed ages
# (the right-hand side is computed before any value is written).
data.loc[data["Age"].isnull(), "Age"] = data["Age"].median()

**We can use describe again to verify that Age no longer has missing values: its count should now match the other columns (891).**

In [29]:
# Re-run the summary statistics to check that the Age count now equals the
# count of the other numeric columns.
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


The numeric columns above now look ready to analyze, but what about the remaining columns in the data? We need to convert these to numeric equivalents so that our model can analyze them properly. We will not be using the Ticket and Name columns because they most likely will not have an impact on our model.

Let's start with Sex. We first need to see how many unique values the column contains, and then come up with a numbering scheme to replace them.

In [30]:
# List the distinct labels present in the Sex column
data["Sex"].unique()

array(['male', 'female'], dtype=object)

Now that we know the different sexes listed, we can replace them. It's easiest just to use 0 and 1 in this case.

In [31]:
# Encode Sex as integers: male -> 0, female -> 1.
# The whole column is converted in one pass and cast to int64, so the result
# is a real numeric column. Assigning the integers through boolean masks with
# .loc would leave the column with dtype "object" even after every label has
# been replaced.
# Values that are not keys of the mapping pass through unchanged, which keeps
# the cell safe to re-run once the column is already 0/1; astype then raises
# if anything other than integers remains (an unexpected label or a NaN).
sex_codes = {"male": 0, "female": 1}
data["Sex"] = data["Sex"].map(lambda label: sex_codes.get(label, label)).astype("int64")

In [32]:
# Preview the first five rows to inspect the re-coded Sex column
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.05,,S


Now let's do the same thing to the embarked column.

In [33]:
# Check whether the Embarked column contains any missing values
pd.isnull(data["Embarked"]).values.any()

True

In [34]:
# Frequency of each Embarked value, most common first; missing entries are
# excluded by default (dropna=True), so the counts sum to less than the row count.
data["Embarked"].value_counts()

S    644
C    168
Q     77
dtype: int64

In [35]:
# "S" is by far the most frequent value, so use it for the missing entries
data.loc[data["Embarked"].isnull(), "Embarked"] = "S"

In [36]:
# Re-check for missing values in Embarked; this should now evaluate to False
pd.isnull(data["Embarked"]).values.any()

False

In [37]:
# Encode Embarked as integers: S -> 0, C -> 1, Q -> 2.
# The whole column is converted in one pass and cast to int64, so the result
# is a real numeric column. Assigning the integers through boolean masks with
# .loc would leave the column with dtype "object" and, between the individual
# assignments, holding a mix of strings and integers.
# Values that are not keys of the mapping pass through unchanged, which keeps
# the cell safe to re-run once the column is already encoded; astype then
# raises if anything other than integers remains (an unexpected label or a
# NaN that was not filled beforehand).
embarked_codes = {"S": 0, "C": 1, "Q": 2}
data["Embarked"] = (
    data["Embarked"]
    .map(lambda label: embarked_codes.get(label, label))
    .astype("int64")
)