# TITANIC DATA SET - EDA PROCESS

In [14]:
import os

import kagglehub
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Fetch the Kaggle Titanic dataset; the returned value is used below as the
# local directory that contains the downloaded files.
path = kagglehub.dataset_download("yasserh/titanic-dataset")
print("Path to Data set  files : ", path)

# Show what was downloaded so the CSV file name used below can be checked.
downloaded_files = os.listdir(path)
print(downloaded_files)

# Read the CSV into the working DataFrame used by the rest of the notebook.
csv_file_path = os.path.join(path, 'Titanic-Dataset.csv')
data = pd.read_csv(csv_file_path)

# Preview the first five rows.
print(data.head())

Path to Data set  files :  C:\Users\mdurg\.cache\kagglehub\datasets\yasserh\titanic-dataset\versions\1
['Titanic-Dataset.csv']
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.925

In [15]:
# Column dtypes first, then a five-row preview restricted to the
# object-typed (text) columns.
column_types = data.dtypes
print(column_types)

object_columns = data.select_dtypes(include=['object'])
print(object_columns.head())

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
                                                Name     Sex  \
0                            Braund, Mr. Owen Harris    male   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
2                             Heikkinen, Miss. Laina  female   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   
4                           Allen, Mr. William Henry    male   

             Ticket Cabin Embarked  
0         A/5 21171   NaN        S  
1          PC 17599   C85        C  
2  STON/O2. 3101282   NaN        S  
3            113803  C123        S  
4            373450   NaN        S  


## DESCRIBING THE DATA

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. The result is left as the cell's last expression so it is displayed.
numeric_summary = data.describe()
numeric_summary

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## LABEL ENCODING

In [17]:
# Label-encode Sex as 0 (male) / 1 (female).
# The values are normalised first (cast to text, surrounding whitespace
# stripped, lower-cased) so that variants such as ' Male' still match.
# The '0' and '1' keys make the cell idempotent: when it is run a second time
# the column already holds 0/1, which stringify to '0'/'1' and map back to
# themselves instead of turning the whole column into NaN.
# Any other value (including missing ones) still becomes NaN.
sex_codes = {'male': 0, 'female': 1, '0': 0, '1': 1}
normalised_sex = data['Sex'].astype(str).str.strip().str.lower()
data['Sex'] = normalised_sex.map(sex_codes)
print(data['Sex'].head())

0    0
1    1
2    1
3    1
4    0
Name: Sex, dtype: int64


## ONE HOT ENCODING

In [18]:
# Replace missing Embarked values with the column's most frequent value,
# then list the distinct values that remain.
most_frequent_embarked = data['Embarked'].mode()[0]
data['Embarked'] = data['Embarked'].fillna(most_frequent_embarked)
print(data['Embarked'].unique())

['S' 'C' 'Q']


In [19]:
# One-hot encode Embarked: the column is replaced by one indicator column per
# distinct value, each named with the 'Embarked' prefix.
# NOTE(review): this rebinds `data` and removes the Embarked column, so the
# cell cannot be run a second time without reloading the data.
data = pd.get_dummies(
    data,
    prefix='Embarked',
    columns=['Embarked'],
)
print(data.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. Laina    1  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0      1      0   
4                           Allen, Mr. William Henry    0  35.0      0      0   

             Ticket     Fare Cabin  Embarked_C  Embarked_Q  Embarked_S  
0         A/5 21171   7.2500   NaN       False       False        True  
1          PC 17599  71.2833   C85        True       False       False  
2  STON/O2. 3101282   7.9250   NaN       False       False        True  
3   

In [20]:
# Missing cabins get the placeholder string 'Unknown'; show the first five
# distinct values afterwards as a sanity check.
cabin_filled = data['Cabin'].fillna('Unknown')
data['Cabin'] = cabin_filled
print(data['Cabin'].unique()[:5])

['Unknown' 'C85' 'C123' 'E46' 'G6']


In [21]:
# Deck is the first character of the Cabin value. Rows holding the 'Unknown'
# placeholder (written when missing cabins were filled) are given the explicit
# code 'U'. The comparison must use the exact placeholder spelling: with a
# misspelled sentinel the check never matches and 'U' would only be produced
# by the coincidence that 'Unknown' itself starts with 'U'.
cabin_is_unknown = data['Cabin'] == 'Unknown'
data['Deck'] = data['Cabin'].str[0].mask(cabin_is_unknown, 'U')
print(data['Deck'].head())

0    U
1    C
2    U
3    C
4    U
Name: Deck, dtype: object


## GROUP BY TICKETS

In [23]:
# How many rows carry each ticket value; value_counts() lists the most
# frequent tickets first.
ticket_column = data['Ticket']
ticket_count = ticket_column.value_counts()
print(ticket_count)

Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 681, dtype: int64


In [25]:
# Look up, for every row, how many rows share its ticket (counts come from
# `ticket_count`) and store that as TicketGroupSize.
size_per_row = data['Ticket'].map(ticket_count)
data['TicketGroupSize'] = size_per_row

# Show the ticket next to its group size for the first ten rows.
preview_columns = ['Ticket', 'TicketGroupSize']
print(data[preview_columns].head(10))

             Ticket  TicketGroupSize
0         A/5 21171                1
1          PC 17599                1
2  STON/O2. 3101282                1
3            113803                2
4            373450                1
5            330877                1
6             17463                1
7            349909                4
8            347742                3
9            237736                2


In [26]:
# Distinct group sizes present in the data, in order of first appearance.
distinct_group_sizes = data['TicketGroupSize'].unique()
print(distinct_group_sizes)

[1 2 4 3 7 5 6]


In [28]:
def group_type(size):
  """Turn a ticket group size into a category label.

  Args:
    size: number of rows sharing one ticket.

  Returns:
    'sINGLE' when size equals 1, 'LOVERS' when size equals 2, and 'GANG'
    for every other value.
  """
  # NOTE(review): 'sINGLE' is cased differently from the other two labels,
  # which looks like a typo. It is kept unchanged here because the label is a
  # data value that other cells may compare against.
  if size == 1:
    return 'sINGLE'
  return 'LOVERS' if size == 2 else 'GANG'

# Label every row by passing its TicketGroupSize through group_type().
group_labels = data['TicketGroupSize'].apply(group_type)
data['GroupType'] = group_labels

# Show ticket, group size and the derived label for the first five rows.
preview_columns = ['Ticket', 'TicketGroupSize', 'GroupType']
print(data[preview_columns].head())

             Ticket  TicketGroupSize GroupType
0         A/5 21171                1    sINGLE
1          PC 17599                1    sINGLE
2  STON/O2. 3101282                1    sINGLE
3            113803                2    LOVERS
4            373450                1    sINGLE


## STANDARD SCALER

In [29]:

# Columns to standardise. The raw values in these columns are overwritten by
# their scaled versions below.
scale_col = ['Age', 'Fare', 'TicketGroupSize']

# fit_transform() learns each column's statistics and applies the scaling in
# one step; `scaler` is kept so the fitted parameters stay available.
# NOTE(review): Age still has missing values at this point (describe()
# counted 714 of 891 rows) and they come out of the scaler as NaN. Impute
# Age before this step if the features are fed to a model.
scaler = StandardScaler()
data[scale_col] = scaler.fit_transform(data[scale_col])

print("Scaler Features")
print(data[scale_col])

Scaler Features
          Age      Fare  TicketGroupSize
0   -0.530377 -0.502445        -0.579162
1    0.571831  0.786845        -0.579162
2   -0.254825 -0.488854        -0.579162
3    0.365167  0.420730         0.155928
4    0.365167 -0.486337        -0.579162
..        ...       ...              ...
886 -0.185937 -0.386671        -0.579162
887 -0.737041 -0.044381        -0.579162
888       NaN -0.176263         0.155928
889 -0.254825 -0.044381        -0.579162
890  0.158503 -0.492378        -0.579162

[891 rows x 3 columns]


## STATISTICAL METHODS

In [30]:
# Sanity-check the scaled columns: the means should be approximately 0.
# The standard deviations print slightly above 1 (e.g. sqrt(891/890) is about
# 1.000562) because pandas' std() defaults to the sample estimator (ddof=1),
# whereas StandardScaler divides by the population standard deviation.
scaled_columns = data[scale_col]

column_means = scaled_columns.mean()
print('Mean:', column_means)

column_stds = scaled_columns.std()
print('Standard deviation:', column_stds)

Mean: Age                2.388379e-16
Fare               3.987333e-18
TicketGroupSize    6.180366e-17
dtype: float64
Standard deviation: Age                1.000701
Fare               1.000562
TicketGroupSize    1.000562
dtype: float64
