In [1]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Directory holding the Titanic CSV files. Defaults to the original location;
# set the TITANIC_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = Path(os.environ.get("TITANIC_DATA_DIR", "/Users/joshndirangu/Documents/titanic-eda/data"))

# Load the raw passenger data
df = pd.read_csv(DATA_DIR / "titanic.csv")

In [3]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Checking data in random rows.
# random_state pins the draw so re-running the notebook shows the same 10 rows.
df.sample(10, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
783,784,0,3,"Johnston, Mr. Andrew G",male,,1,2,W./C. 6607,23.45,,S
814,815,0,3,"Tomlin, Mr. Ernest Portage",male,30.5,0,0,364499,8.05,,S
148,149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26.0,F2,S
284,285,0,1,"Smith, Mr. Richard William",male,,0,0,113056,26.0,A19,S
338,339,1,3,"Dahl, Mr. Karl Edwart",male,45.0,0,0,7598,8.05,,S
44,45,1,3,"Devaney, Miss. Margaret Delia",female,19.0,0,0,330958,7.8792,,Q
156,157,1,3,"Gilnagh, Miss. Katherine ""Katie""",female,16.0,0,0,35851,7.7333,,Q
703,704,0,3,"Gallagher, Mr. Martin",male,25.0,0,0,36864,7.7417,,Q
612,613,1,3,"Murphy, Miss. Margaret Jane",female,,1,0,367230,15.5,,Q
384,385,0,3,"Plotcharsky, Mr. Vasil",male,,0,0,349227,7.8958,,S


In [5]:
# Checking the structure of the data: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Checking summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
# Checking for null values.
# df.isnull() on its own returns a boolean value for each entry;
# summing that mask gives the number of nulls in each column.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

DATA CLEANING

In [8]:
# Fill missing values in Age with the median (since it's robust to outliers).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['Age']: that chained in-place call operates on a
# temporary Series, raises a FutureWarning in recent pandas 2.x releases and
# leaves df unchanged once Copy-on-Write is the default (pandas 3.0).
df['Age'] = df['Age'].fillna(df['Age'].median())

# Spot-check the filled column; random_state keeps the sampled rows stable on re-run
df['Age'].sample(10, random_state=42)

475    28.0
653    28.0
688    18.0
580    25.0
222    51.0
741    36.0
43      3.0
110    47.0
352    15.0
666    25.0
Name: Age, dtype: float64

In [9]:
# Re-check the null counts per column after filling Age
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [10]:
# Remove the Cabin column (too many missing values).
# errors='ignore' makes this a no-op when Cabin is already gone, so the cell can be re-run safely.
df = df.drop(columns=['Cabin'], errors='ignore')

In [11]:
# List the columns that remain after the drop
list(df.columns)

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Embarked']

In [12]:
# Most frequent embarkation port.
# mode() returns a Series (several rows if there is a tie), so the value itself is at index [0].
df['Embarked'].mode()

0    S
Name: Embarked, dtype: object

In [13]:
# Convert the low-cardinality text columns to the category dtype in one pass
df = df.astype({'Sex': 'category', 'Embarked': 'category'})

In [14]:
# Fill the missing Embarked rows with the most frequent port.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['Embarked']: that chained in-place call operates
# on a temporary Series, raises a FutureWarning in recent pandas 2.x releases
# and leaves df unchanged once Copy-on-Write is the default (pandas 3.0).
most_common_port = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_common_port)

# Spot-check the frame; random_state keeps the sampled rows stable on re-run
df.sample(10, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
793,794,0,1,"Hoyt, Mr. William Fisher",male,28.0,0,0,PC 17600,30.6958,C
284,285,0,1,"Smith, Mr. Richard William",male,28.0,0,0,113056,26.0,S
335,336,0,3,"Denkoff, Mr. Mitto",male,28.0,0,0,349225,7.8958,S
721,722,0,3,"Jensen, Mr. Svend Lauritz",male,17.0,1,0,350048,7.0542,S
309,310,1,1,"Francatelli, Miss. Laura Mabel",female,30.0,0,0,PC 17485,56.9292,C
590,591,0,3,"Rintamaki, Mr. Matti",male,35.0,0,0,STON/O 2. 3101273,7.125,S
622,623,1,3,"Nakid, Mr. Sahid",male,20.0,1,1,2653,15.7417,C
494,495,0,3,"Stanley, Mr. Edward Roland",male,21.0,0,0,A/4 45380,8.05,S
214,215,0,3,"Kiernan, Mr. Philip",male,28.0,1,0,367229,7.75,Q
246,247,0,3,"Lindahl, Miss. Agda Thorilda Viktoria",female,25.0,0,0,347071,7.775,S


In [15]:
# Remove the Ticket column, which is not needed for the analysis.
# errors='ignore' makes this a no-op when Ticket is already gone.
df = df.drop(columns=['Ticket'], errors='ignore')

In [16]:
# Remove fully duplicated rows so they cannot skew the analysis
df = df.drop_duplicates()

In [17]:
# Confirm the cleaned frame: remaining columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    int64   
 2   Pclass       891 non-null    int64   
 3   Name         891 non-null    object  
 4   Sex          891 non-null    category
 5   Age          891 non-null    float64 
 6   SibSp        891 non-null    int64   
 7   Parch        891 non-null    int64   
 8   Fare         891 non-null    float64 
 9   Embarked     891 non-null    category
dtypes: category(2), float64(2), int64(5), object(1)
memory usage: 57.8+ KB


In [18]:
# Final check that no missing values remain in any column
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

DATA CLEANED SUCCESSFULLY

In [19]:
# Save the cleaned data to a new CSV file in the data directory.
# The directory defaults to the original location; set the TITANIC_DATA_DIR
# environment variable to write elsewhere without editing this cell.
# index=False keeps the row index out of the file.
cleaned_dir = Path(os.environ.get("TITANIC_DATA_DIR", "/Users/joshndirangu/Documents/titanic-eda/data"))
df.to_csv(cleaned_dir / "cleaned_titanic.csv", index=False)