In [206]:
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

In [207]:
# Loading the dataset
ds = pd.read_csv(r'C:\Users\jayak\OneDrive\Desktop\Data Analytics\titanic.csv')

In [208]:
ds.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [209]:
# checking how many missing values are there in the dataset
ds.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [210]:
# Replace the missing values in the Age column with the mean value.
new_ds = ds
new_ds['Age'] = new_ds['Age'].fillna(ds['Age'].mean())

In [211]:
new_ds.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [212]:
# Creating new dataset with only the columns 'Cabin' and 'Embarked'
new_ds1 = ['Cabin','Embarked']
col = ds[new_ds1]

In [213]:
col.info

<bound method DataFrame.info of     Cabin Embarked
0     NaN        S
1     C85        C
2     NaN        S
3    C123        S
4     NaN        S
..    ...      ...
886   NaN        S
887   B42        S
888   NaN        S
889  C148        C
890   NaN        Q

[891 rows x 2 columns]>

In [214]:
# checking how many missing values are there in these specific columns
col.isnull().sum()

Cabin       687
Embarked      2
dtype: int64

In [215]:
# Using SimpleImputer to replace the missing values in the 'Cabin' column with the most frequent value
# This is because we cant use mean or median as the column is of object type
imputer = SimpleImputer(strategy='most_frequent')
new_col = ds
new_col = pd.DataFrame(imputer.fit_transform(col))

In [216]:
new_col.isnull().sum()

0    0
1    0
dtype: int64

In [217]:
new_col.info

<bound method DataFrame.info of            0  1
0    B96 B98  S
1        C85  C
2    B96 B98  S
3       C123  S
4    B96 B98  S
..       ... ..
886  B96 B98  S
887      B42  S
888  B96 B98  S
889     C148  C
890  B96 B98  Q

[891 rows x 2 columns]>

In [218]:
# Replacing the missing value colums with filled values columns
ds['Cabin']= new_col[0]
ds['Embarked']= new_col[1]

In [219]:
ds.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,S
5,6,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,B96 B98,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,B96 B98,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,B96 B98,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,B96 B98,C


In [220]:
# using label encoder to convert the categorical values to numerical values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit_transform(['male','female'])

array([1, 0], dtype=int64)

In [221]:
# Assigning the numerical
ds['Sex']= ds['Sex'].map({'male':1,
                               'female':0})

In [222]:
ds.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,B96 B98,S
5,6,0,3,"Moran, Mr. James",1,29.699118,0,0,330877,8.4583,B96 B98,Q
6,7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,349909,21.075,B96 B98,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0,27.0,0,2,347742,11.1333,B96 B98,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",0,14.0,1,0,237736,30.0708,B96 B98,C


In [223]:
# checking the unique values in the 'Cabin' column
ds['Cabin'].value_counts()

Cabin
B96 B98        691
G6               4
C23 C25 C27      4
C22 C26          3
F33              3
              ... 
E34              1
C7               1
C54              1
E36              1
C148             1
Name: count, Length: 147, dtype: int64

In [224]:
# Using LabelEncoder to convert the 'Cabin' column to numerical values
ds['Encoded_Cabin'] = le.fit_transform(ds['Cabin'])

In [225]:
# Using LabelEncoder to convert the 'Embarked' column to numerical values
ds['Encoded_Embarked'] = le.fit_transform(ds['Embarked'])

In [226]:
# Not a necessary step, but just converting the 'Age' column to integer type
ds['Age']= ds['Age'].astype(int)

In [227]:
ds.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Encoded_Cabin,Encoded_Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22,1,0,A/5 21171,7.25,B96 B98,S,47,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38,1,0,PC 17599,71.2833,C85,C,81,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26,0,0,STON/O2. 3101282,7.925,B96 B98,S,47,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35,1,0,113803,53.1,C123,S,55,2
4,5,0,3,"Allen, Mr. William Henry",1,35,0,0,373450,8.05,B96 B98,S,47,2
5,6,0,3,"Moran, Mr. James",1,29,0,0,330877,8.4583,B96 B98,Q,47,1
6,7,0,1,"McCarthy, Mr. Timothy J",1,54,0,0,17463,51.8625,E46,S,129,2
7,8,0,3,"Palsson, Master. Gosta Leonard",1,2,3,1,349909,21.075,B96 B98,S,47,2
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0,27,0,2,347742,11.1333,B96 B98,S,47,2
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",0,14,1,0,237736,30.0708,B96 B98,C,47,0


In [228]:
# Checking the datatypes of the columns
ds.dtypes

PassengerId           int64
Survived              int64
Pclass                int64
Name                 object
Sex                   int64
Age                   int32
SibSp                 int64
Parch                 int64
Ticket               object
Fare                float64
Cabin                object
Embarked             object
Encoded_Cabin         int32
Encoded_Embarked      int32
dtype: object

In [229]:
# Using label encoder to convert the 'Ticket' column to numerical values
ds['Encoded_Ticket']= le.fit_transform(ds['Ticket'])

In [230]:
# Converting the 'Fare' column to integer type to make all coumns of same datatype
ds['Fare'] = ds['Fare'].astype(int)

In [231]:
ds.drop(['Ticket'], axis=1, inplace=True)

In [232]:
ds.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Encoded_Cabin,Encoded_Embarked,Encoded_Ticket
0,1,0,3,"Braund, Mr. Owen Harris",1,22,1,0,7,B96 B98,S,47,2,523
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38,1,0,71,C85,C,81,0,596
2,3,1,3,"Heikkinen, Miss. Laina",0,26,0,0,7,B96 B98,S,47,2,669
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35,1,0,53,C123,S,55,2,49
4,5,0,3,"Allen, Mr. William Henry",1,35,0,0,8,B96 B98,S,47,2,472
5,6,0,3,"Moran, Mr. James",1,29,0,0,8,B96 B98,Q,47,1,275
6,7,0,1,"McCarthy, Mr. Timothy J",1,54,0,0,51,E46,S,129,2,85
7,8,0,3,"Palsson, Master. Gosta Leonard",1,2,3,1,21,B96 B98,S,47,2,395
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0,27,0,2,11,B96 B98,S,47,2,344
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",0,14,1,0,30,B96 B98,C,47,0,132


In [235]:
rand_ds = ds.drop(['Name','Cabin','Embarked'], axis=1)

In [236]:
# This will be the dataset with all data types converted to integer
rand_ds.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Encoded_Cabin,Encoded_Embarked,Encoded_Ticket
0,1,0,3,1,22,1,0,7,47,2,523
1,2,1,1,0,38,1,0,71,81,0,596
2,3,1,3,0,26,0,0,7,47,2,669
3,4,1,1,0,35,1,0,53,55,2,49
4,5,0,3,1,35,0,0,8,47,2,472
5,6,0,3,1,29,0,0,8,47,1,275
6,7,0,1,1,54,0,0,51,129,2,85
7,8,0,3,1,2,3,1,21,47,2,395
8,9,1,3,0,27,0,2,11,47,2,344
9,10,1,2,0,14,1,0,30,47,0,132


In [237]:
# We cant use the MinMaxScaler on the dataset as it contains string values
# We can use MinMaxScaler on a specific integer column. Just for the sake i converted all columns to integer so that i can use MinMaxScaler entire dataset
scaler = MinMaxScaler()
scaler.fit_transform(rand_ds)

array([[0.        , 0.        , 1.        , ..., 0.32191781, 1.        ,
        0.76911765],
       [0.0011236 , 1.        , 0.        , ..., 0.55479452, 0.        ,
        0.87647059],
       [0.00224719, 1.        , 1.        , ..., 0.32191781, 1.        ,
        0.98382353],
       ...,
       [0.99775281, 0.        , 1.        , ..., 0.32191781, 1.        ,
        0.99264706],
       [0.9988764 , 1.        , 0.        , ..., 0.4109589 , 0.        ,
        0.01176471],
       [1.        , 0.        , 1.        , ..., 0.32191781, 0.5       ,
        0.68529412]])

In [238]:
# This is similar to MinMaxScaler. We cant use StandardScaler on the dataset as it contains string values.
scalar = StandardScaler()
scalar.fit_transform(rand_ds)

array([[-1.73010796, -0.78927234,  0.82737724, ..., -0.28188124,
         0.58595414,  0.91896631],
       [-1.72622007,  1.2669898 , -1.56610693, ...,  1.16154512,
        -1.9423032 ,  1.28262456],
       [-1.72233219,  1.2669898 ,  0.82737724, ..., -0.28188124,
         0.58595414,  1.64628282],
       ...,
       [ 1.72233219, -0.78927234,  0.82737724, ..., -0.28188124,
         0.58595414,  1.67617254],
       [ 1.72622007,  1.2669898 , -1.56610693, ...,  0.27001707,
        -1.9423032 , -1.64656796],
       [ 1.73010796, -0.78927234,  0.82737724, ..., -0.28188124,
        -0.67817453,  0.63501397]])