In [1]:
# Python Library Imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Reading Labelled Data:

In [2]:
# Location of the labelled Titanic training CSV under the Kaggle input directory.
train_csv_path = '/kaggle/input/titanic/train.csv'
data = pd.read_csv(train_csv_path)

# Examining Data:

In [3]:
# Preview the first five rows to see the columns and some typical values.
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preview the last five rows as a check on the end of the file.
data.tail(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [5]:
# (rows, columns): rows are observations, columns are attributes.
rows_and_columns = data.shape
# True if at least one row is an exact repeat of an earlier row.
has_duplicate_rows = data.duplicated().any()

print(rows_and_columns)
print(has_duplicate_rows)

(891, 12)
False


**The data contains 891 unique observations (samples) and 12 columns, one per attribute. The Survived column is the target variable, i.e. the variable whose value is to be predicted. The model is trained on this data so that it can predict the value of Survived for any new observation.**

In [6]:
# Show the storage dtype pandas inferred for each column when reading the CSV.
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

**Excluding the target variable "Survived", the data contains 11 fields. Going by the dtypes above, 5 of them are categorical (object) features and 6 are numerical (int64/float64) features.**

**The Categorical Features are:**
1. Name
2. Sex
3. Ticket
4. Cabin
5. Embarked

**The Numerical Features are:**
1. PassengerId
2. Pclass
3. Age
4. SibSp
5. Parch
6. Fare

In [7]:
# PassengerId and Name would usually be expected to differ for every observation,
# so count how many distinct values each of them holds.
# Note: the number printed is the count of distinct values (nunique), which can be
# compared against the number of rows in the data.
passenger_id_unique = data['PassengerId'].nunique()
name_unique = data['Name'].nunique()

print(f"PassengerId attribute unique values and their counts in the data. {passenger_id_unique}")
print(f"Name attribute unique values and their counts in the data. {name_unique}")

PassengerId attribute unique values and their counts in the data. 891
Name attribute unique values and their counts in the data. 891


**As the output above shows, neither PassengerId nor Name contains duplicates: every entry in these two attributes is unique. These fields can therefore be removed from the data, as they would not add any value to the model's learning.**

In [8]:
# SibSp represents the number of accompanying siblings or spouses of the passenger.
# Parch represents the number of accompanying children or parents of the passenger.
#
# For each of the two features, print the survival rate (the mean of the 0/1
# Survived column) for every distinct value of the feature, in ascending order
# of that value.
for feature in ['SibSp', 'Parch']:
    print("\n", feature)
    # One pass over the data: groupby sorts by the feature value by default, and
    # the mean of a 0/1 flag is the fraction of passengers in the group who survived.
    survival_rate = data.groupby(feature)["Survived"].mean()
    # Rebuild as a frame of (value, rate) pairs so the printed table keeps the
    # positional 0 / 1 column headers.
    print(pd.DataFrame(list(survival_rate.items())))


 SibSp
   0         1
0  0  0.345395
1  1  0.535885
2  2  0.464286
3  3  0.250000
4  4  0.166667
5  5  0.000000
6  8  0.000000

 Parch
   0         1
0  0  0.343658
1  1  0.550847
2  2  0.500000
3  3  0.600000
4  4  0.000000
5  5  0.200000
6  6  0.000000


Both the SibSp and Parch features essentially represent the number of people accompanying the passenger. Since groups of people with family ties would be expected to stay together on the Titanic, or to leave together on the lifeboats, the two fields can be combined into one. The survival rates support a similar hypothesis: passengers with 1 or 2 companions survived at a higher rate than those who had no companions or a large number of companions.

In [9]:
# Companions = accompanying siblings/spouses + accompanying parents/children.
sibling_spouse_count = data['SibSp']
parent_child_count = data['Parch']
data['Companions'] = sibling_spouse_count + parent_child_count

# Working copy without the two columns that Companions now summarises.
# `data` itself still holds SibSp, Parch and the new Companions column.
_data = data.drop(columns=['SibSp', 'Parch'])

# Next: Discretize Fare