# **Initialize**

In [2]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn import preprocessing
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

%matplotlib inline

**Initialize the data to use**

**Please note that you will need to change the path so that it points to the location of the file on your machine**

In [3]:
# Location of the Titanic CSV. Edit this single constant if the file lives elsewhere.
DATA_PATH = "titanic_data.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**Let's see how big the data is**

In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(891, 12)

**It tells us the data has 891 rows and 12 columns**

# **Describing the data**

**With a single function call you can get the mean, standard deviation, min, max, and other summary statistics of the numeric columns!**

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


**You can use these functions to find out how many unique values each column has**

In [7]:
# Number of distinct values in every column.
df.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [8]:
# Distinct values taken by the Survived column.
df["Survived"].unique()

array([0, 1], dtype=int64)

In [9]:
# Number of rows for each value of the Survived column.
df["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

**Check null values**

In [10]:
# Count the missing entries in every column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# **Working with null values**

**We already know that we need to fill the null values before going further**

**We'll use two approaches to fill in the null values: random values drawn around the mean (for `Age`) and the mode (for `Embarked`)**

### **Mean**

In [11]:
# Mean of the recorded ages; pandas skips missing entries by default.
mean_age = df["Age"].mean()
mean_age

29.69911764705882

In [12]:
# Standard deviation of the recorded ages (pandas default: sample std, ddof=1).
std_age = df["Age"].std()
std_age

14.526497332334044

In [13]:
# Number of passengers whose age is missing; this many fill values are needed.
is_null_age = df["Age"].isna().sum()
is_null_age

177

In [14]:
# Draw one random integer age per missing value, uniformly from the half-open
# range [mean - std, mean + std). The float bounds are truncated to integers
# explicitly, and the generator is seeded so that a fresh "Restart & Run All"
# reproduces exactly the same fill values.
AGE_IMPUTE_SEED = 42
age_low = int(mean_age - std_age)
age_high = int(mean_age + std_age)
rand_age = np.random.default_rng(AGE_IMPUTE_SEED).integers(age_low, age_high, size=is_null_age)
rand_age

array([25, 17, 42, 39, 16, 41, 34, 41, 31, 24, 36, 30, 20, 29, 38, 23, 16,
       34, 36, 32, 19, 19, 33, 22, 35, 16, 42, 34, 40, 19, 24, 34, 37, 15,
       27, 25, 24, 32, 28, 32, 33, 31, 15, 35, 18, 19, 43, 37, 34, 35, 18,
       30, 25, 38, 38, 33, 31, 20, 17, 35, 27, 43, 40, 31, 29, 34, 28, 28,
       22, 36, 27, 36, 21, 18, 42, 32, 23, 19, 26, 40, 35, 18, 27, 16, 17,
       15, 34, 26, 36, 21, 28, 18, 30, 30, 24, 32, 38, 27, 34, 19, 32, 35,
       35, 43, 36, 22, 40, 15, 31, 23, 24, 42, 27, 34, 41, 36, 23, 34, 22,
       42, 34, 30, 30, 27, 23, 25, 35, 23, 27, 39, 33, 32, 29, 41, 16, 36,
       33, 28, 25, 15, 31, 29, 33, 25, 41, 24, 28, 34, 18, 17, 36, 24, 25,
       41, 36, 16, 25, 26, 32, 27, 41, 18, 27, 38, 42, 36, 32, 26, 25, 30,
       37, 38, 20, 26, 21, 32, 29])

In [15]:
# Fill the missing ages with the pre-drawn random values, then store Age as integers.
# Writing through .loc on a boolean mask keeps this cell safe to re-run: once no
# NaN is left, nothing is assigned, instead of trying to push all of rand_age
# into an empty selection.
missing_age = df["Age"].isna()
n_missing = int(missing_age.sum())
if n_missing:
    df.loc[missing_age, "Age"] = rand_age[:n_missing]
# astype(int) truncates fractional ages (the raw data contains values such as 0.42).
df["Age"] = df["Age"].astype(int)

df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


### **Mode**

In [16]:
# Re-check the missing entries per column now that Age has been filled.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [17]:
# Most frequent embarkation port(s); mode() returns a Series because ties are possible.
modus = df["Embarked"].mode()
modus

0    S
dtype: object

In [18]:
# Replace the missing ports with the first entry of the mode Series,
# then confirm that Embarked has no nulls left.
df["Embarked"] = df["Embarked"].fillna(modus.iloc[0])
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [19]:
# Encode the deck (the first run of letters in the cabin code) as an integer feature.
# Rows without a recorded cabin receive the placeholder "U0", i.e. deck "U" -> 9.
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "T": 8, "U": 9}

df["Cabin"] = df["Cabin"].fillna("U0")
# Vectorised extraction of the leading letters. A cabin value without any letter
# yields NaN here, whereas calling re.search(...).group() per row would raise
# AttributeError. The fillna(0) below then covers both that case and letters
# that are absent from the mapping.
df["Deck"] = (
    df["Cabin"]
    .str.extract(r"([a-zA-Z]+)", expand=False)
    .map(deck)
    .fillna(0)
    .astype(int)
)

In [20]:
# Preview the first five rows, now including the new Deck column.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,U0,S,9
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,U0,S,9
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,3
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,U0,S,9


In [21]:
# Final check: every column should now report zero missing entries.
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Deck           0
dtype: int64