In [91]:
## Essential Imports
import numpy as np
import pandas as pd

In [92]:
# Load the Titanic passenger data; the relative path means 'titanic.csv' must sit in the
# notebook's working directory.
df = pd.read_csv('titanic.csv')
# Preview the first rows to sanity-check the column names and value formats.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [93]:
# (rows, columns) of the loaded frame.
df.shape

(891, 12)

In [94]:
# Per-column non-null counts and dtypes; columns whose count is below the row total contain nulls.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


#### Observation: null values are present in Age (177 missing), Cabin (687 missing) and Embarked (2 missing); all other columns are complete.

In [95]:
# Total number of cells in the frame (rows x columns).
df.size

10692

In [96]:
# Partition the column names by dtype so that numeric and text columns
# can be summarised separately in the cells that follow.
numColumns = df.select_dtypes(include='number').columns.tolist()
print(numColumns)

categoryColumns = df.select_dtypes(include='object').columns.tolist()
print(categoryColumns)


['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [97]:
# Summary statistics. With no arguments describe() covers only the numerical columns,
# so for this frame the result matches df[numColumns].describe().
df.describe()  

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [98]:
# For the object (text) columns describe() reports count, number of unique values,
# the most frequent value (top) and how often it occurs (freq).
df[categoryColumns].describe()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


# Data Cleanup

Questions to be asked [VERY  IMPORTANT]: 
- What is the problem statement ? Example: What factors influenced survival of the passengers. 
- Do we need all the variables ?  Are these variables relevant to our analysis ? 
- Even if we need a variable, are there enough data points for it to be relevant?
- Do we need to transform any variables ... convert to int, combine strings, extract piece of data, convert to 0/1 etc



### Cleaning up NaN values
- Age: [mean]
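- Cabin: [drop] - Most cabin info is unknown (only 204 of 891 values are present). It does not make sense to assign a mode value (same cabin number to ~700 passengers), and a placeholder such as 'U' would carry almost no information, so the column is dropped.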
- Embarked:[mode] 

In [99]:
# Age clean up.

# First find the mean of all known ages (Series.mean() skips NaN by default).
meanAge = df['Age'].mean()
print(meanAge)

# Replace all null "Age" values with the mean.
# Assign the filled column back instead of calling df.Age.fillna(..., inplace=True):
# that form mutates an intermediate object obtained through attribute access, which
# raises a FutureWarning on pandas 2.x and leaves df unchanged under Copy-on-Write
# (the default behaviour from pandas 3.0).
df['Age'] = df['Age'].fillna(meanAge)

# Sanity check: this should display no rows, since every Age now has a value.
df[df['Age'].isnull()]

29.69911764705882


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [100]:
# Cabin clean up: remove the column from the working frame altogether.
df = df.drop(columns=['Cabin'])
# Show two rows to confirm the column is gone.
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C


In [101]:
# Embarked clean up

# List the passengers whose port of embarkation is missing (NaN).
embarkedMissing = df['Embarked'].isnull()
df[embarkedMissing]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,


In [105]:
# Calculate the mode.
# NOTE: Series.mode() returns a Series (there can be several equally frequent values),
# not a scalar. Calling str() on it would write the Series' printed representation
# (e.g. "0    S\ndtype: object") into the column instead of the port code.
EmbarkMode = df['Embarked'].mode()

# Replace null values with the most frequent port: the first entry of the mode Series.
# NOTE: use isnull() to find missing values; an "== np.nan" comparison never matches
# because NaN is not equal to anything, including itself.
# df.loc[ condition-to-select-rows,  column-name-to-replace] = replacement-value
df.loc[df['Embarked'].isnull(), 'Embarked'] = EmbarkMode.iloc[0]

# Check whether any null values remain in the Embarked column (should display no rows).
df.loc[df['Embarked'].isnull()]



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked


In [107]:
# Preview the frame after the NaN clean-up steps.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


#### Passenger ID & Name - Do we need this ? 

In [116]:
# Count the distinct passenger ids (NaN, if present, counts as a value, as with unique()).
df['PassengerId'].nunique(dropna=False)

891

Since every passenger id is unique, it does not give us any information that can be used to categorize survivors. 
It is not useful for this analysis, so we delete it. 

Similarly, the "name" field also is unique just like passenger id ... gives no information on survival rate. So we delete this also. 

In [119]:
# Both columns are unique per passenger, so remove them from the working frame.
df = df.drop(columns=['PassengerId', 'Name'])


#### Ticket number ? Do we need this ? 

In [121]:
# Count the distinct ticket numbers (NaN, if present, counts as a value, as with unique()).
df['Ticket'].nunique(dropna=False)

681

There are 681 unique values out of 891 total, so roughly 200 passengers share a ticket number with someone else ... which might be the result of families traveling on the same ticket #.  It does not seem to add any value to this analysis, so we drop it. 


In [122]:
# Remove the ticket number column from the working frame.
df = df.drop(columns=['Ticket'])

In [123]:
# Preview the columns that remain after the clean-up.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
