# Titanic

In [212]:
import numpy as np
import pandas as pd
import seaborn as sns

## Load Dataset Titanic and check NaN values

In [213]:
# seaborn's example dataset is registered under the lowercase name 'titanic';
# the capitalised spelling is not one of the published dataset names.
df = sns.load_dataset('titanic')
# Missing values per column, used to decide how each column is cleaned below.
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

### Drop column deck since most of the values are NaN

In [214]:
# 'deck' is missing in 688 of the 891 rows, so remove the column instead of imputing it.
df = df.drop('deck', axis='columns')

### Fill NaN values in age with median

In [215]:
# Impute the missing ages with the median of the observed ages.
median_age = df['age'].median()
df = df.assign(age=df['age'].fillna(median_age))

### Check NaN values in embark_town

In [216]:
# Show the rows whose embark_town is missing.
df.loc[df['embark_town'].isnull()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
61,1,1,female,38.0,0,0,80.0,,First,woman,False,,yes,True
829,1,1,female,62.0,0,0,80.0,,First,woman,False,,yes,True


### Check duplicates

In [217]:
# Flag every row that repeats an earlier row in all columns (the first occurrence is not flagged).
df.duplicated(keep='first')

0      False
1      False
2      False
3      False
4      False
       ...  
886     True
887    False
888    False
889    False
890    False
Length: 891, dtype: bool

### Drop Duplicates

In [218]:
# Keep only the first occurrence of each fully identical row.
# NOTE(review): the frame has no passenger identifier column, so rows that match
# on every column may be distinct passengers - confirm that dropping them is intended.
df = df.drop_duplicates(subset=None, keep='first')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,Queenstown,no,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


### Rename columns

In [236]:
# Give the abbreviated count columns readable names. rename() returns a new
# frame, so it is reassigned instead of mutating the existing one in place.
# NOTE(review): the recorded output of this cell shows a 'relatives' column and
# an out-of-sequence execution count, so it looks stale - re-run to confirm.
df = df.rename(
    columns={
        'sibsp': 'Siblings and Spouses',
        'parch': 'Parents or children',
    }
)
df

Unnamed: 0,survived,pclass,sex,age,relatives,Parents or children,fare,embarked,class,who,adult_male,embark_town,alive,alone,fare_percentage,total_fare
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False,0.026821,27030.7623
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False,0.263712,27030.7623
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True,0.029318,27030.7623
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False,0.196443,27030.7623
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True,0.029781,27030.7623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,Queenstown,no,False,0.107748,27030.7623
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True,0.110985,27030.7623
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,False,Southampton,no,False,0.086753,27030.7623
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True,0.110985,27030.7623


### Summary statistics with `describe()`

In [220]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,survived,pclass,age,relatives,parch,fare
count,775.0,775.0,775.0,775.0,775.0,775.0
mean,0.412903,2.246452,29.581187,0.529032,0.420645,34.878403
std,0.492674,0.853574,13.766359,0.990326,0.840565,52.408474
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,1.0,21.0,0.0,0.0,8.05
50%,0.0,3.0,28.0,0.0,0.0,15.9
75%,1.0,3.0,36.0,1.0,1.0,34.1979
max,1.0,3.0,80.0,8.0,6.0,512.3292


### Average fare and age per class

In [221]:
# Mean fare and mean age for each passenger class.
average_fare_age_per_class = (
    df[['pclass', 'fare', 'age']]
    .groupby('pclass')
    .mean()
)
average_fare_age_per_class

Unnamed: 0_level_0,fare,age
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,85.675199,36.921048
2,21.889279,29.791646
3,13.58885,25.651297


### New column showing the fare as a percentage of total

In [222]:
# Express each fare as a percentage of the sum of all fares, and store that
# sum alongside it (the same value is repeated in every row).
total_fare = df['fare'].sum()
df = df.assign(
    fare_percentage=(df['fare'] / total_fare) * 100,
    total_fare=total_fare,
)
df

Unnamed: 0,survived,pclass,sex,age,relatives,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone,fare_percentage,total_fare
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False,0.026821,27030.7623
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False,0.263712,27030.7623
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True,0.029318,27030.7623
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False,0.196443,27030.7623
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True,0.029781,27030.7623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,Queenstown,no,False,0.107748,27030.7623
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True,0.110985,27030.7623
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,False,Southampton,no,False,0.086753,27030.7623
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True,0.110985,27030.7623


### How many survivors were male?

In [223]:
# Number of rows that are both male and survivors.
len(df.loc[(df['sex'] == 'male') & (df['survived'] == 1)])

104

In [224]:
# Same count obtained by summing the boolean mask (each True counts as 1).
((df['sex'] == 'male') & (df['survived'] == 1)).sum()

104

In [225]:
# Per-column count of non-null values among male survivors.
df.loc[(df['sex'] == 'male') & (df['survived'] == 1)].count()

survived           104
pclass             104
sex                104
age                104
relatives          104
parch              104
fare               104
embarked           104
class              104
who                104
adult_male         104
embark_town        104
alive              104
alone              104
fare_percentage    104
total_fare         104
dtype: int64

### How many survivors were female?

In [226]:
# Number of rows that are both female and survivors.
len(df.loc[(df['sex'] == 'female') & (df['survived'] == 1)])

216

### How many survivors were female compared to male?

In [227]:
# Count survivors of each sex and how many more women than men survived,
# then lay the three numbers out as a one-row table.
male_survivors = len(df.loc[(df['sex'] == 'male') & (df['survived'] == 1)])
female_survivors = len(df.loc[(df['sex'] == 'female') & (df['survived'] == 1)])
difference = female_survivors - male_survivors
survivors = pd.DataFrame(
    [[male_survivors, female_survivors, difference]],
    columns=['Male Survivors', 'Female Survivors', 'Difference'],
    index=['Counts'],
)
survivors

Unnamed: 0,Male Survivors,Female Survivors,Difference
Counts,104,216,112


### Survivors per class

In [228]:
# Number of survivors whose class is 'First'.
len(df.loc[(df['class'] == 'First') & (df['survived'] == 1)])

133

In [229]:
# Number of survivors whose class is 'Second'.
len(df.loc[(df['class'] == 'Second') & (df['survived'] == 1)])

83

In [230]:
# Number of survivors whose class is 'Third'.
len(df.loc[(df['class'] == 'Third') & (df['survived'] == 1)])

104

In [231]:
# Total number of survivors across all classes.
total_survivors = len(df.loc[df['survived'] == 1])
total_survivors

320

In [232]:
# Survivor counts per ticket class, selected through the text 'class' column,
# shown as a one-row table. The first label previously read 'Firs_Class' (typo).
first_class_survivors = df[(df['survived'] == 1) & (df['class'] == 'First')].shape[0]
second_class_survivors = df[(df['survived'] == 1) & (df['class'] == 'Second')].shape[0]
third_class_survivors = df[(df['survived'] == 1) & (df['class'] == 'Third')].shape[0]
survivors_class = pd.DataFrame({'First_Class': first_class_survivors,
                                'Second_Class': second_class_survivors,
                                'Third_Class': third_class_survivors},
                               index=['Counts'])
survivors_class

Unnamed: 0,Firs_Class,Second_Class,Third_Class
Counts,133,83,104


### Survivors per pclass

In [233]:
# Survivor counts per ticket class, selected through the numeric 'pclass'
# column (1, 2, 3), shown as a one-row table. The first label previously read
# 'Firs_Class' (typo).
first_class_survivors = df[(df['survived'] == 1) & (df['pclass'] == 1)].shape[0]
second_class_survivors = df[(df['survived'] == 1) & (df['pclass'] == 2)].shape[0]
third_class_survivors = df[(df['survived'] == 1) & (df['pclass'] == 3)].shape[0]
survivors_class = pd.DataFrame({'First_Class': first_class_survivors,
                                'Second_Class': second_class_survivors,
                                'Third_Class': third_class_survivors},
                               index=['Counts'])
survivors_class

Unnamed: 0,Firs_Class,Second_Class,Third_Class
Counts,133,83,104


### Survivors per age

In [234]:
# Survivor counts in four age bands that together cover every age.
# The second band ends at "< 31" rather than "<= 30" so that fractional ages
# between 30 and 31 are counted; with "<= 30" followed by ">= 31" they fell
# into no band at all.
child_survivors = df[(df['survived'] == 1) & (df['age'] < 18)].shape[0]
young_adult_survivors = df[(df['survived'] == 1) & (df['age'] >= 18) & (df['age'] < 31)].shape[0]
mid_adult_survivors = df[(df['survived'] == 1) & (df['age'] >= 31) & (df['age'] <= 60)].shape[0]
old_adult_survivors = df[(df['survived'] == 1) & (df['age'] > 60)].shape[0]
# Labels state the comparison direction explicitly: the under-18 band was
# previously labelled '18<', which reads as "greater than 18".
survivors_age = pd.DataFrame({'<18': child_survivors,
                              '18 - 30': young_adult_survivors,
                              '31 - 60': mid_adult_survivors,
                              '>60': old_adult_survivors},
                             index=['Survivors per age'])
survivors_age

Unnamed: 0,18<,18 - 30,31 - 60,60<
Survivors per age,60,139,116,5


In [258]:
# Pairwise Pearson correlation between age, fare, survival and passenger class.
correlation_matrix = (
    df.loc[:, ['age', 'fare', 'survived', 'pclass']]
    .corr(method='pearson')
)
correlation_matrix

Unnamed: 0,age,fare,survived,pclass
age,1.0,0.092503,-0.078114,-0.342745
fare,0.092503,1.0,0.247159,-0.554649
survived,-0.078114,0.247159,1.0,-0.331388
pclass,-0.342745,-0.554649,-0.331388,1.0


In [254]:
# Share of the rows in df that are survivors, formatted as a percentage
# string rounded to two decimals and wrapped in a labelled Series.
survivors = len(df.loc[df['survived'] == 1])
total = len(df)
percentage_survivors = str(round((survivors / total) * 100, 2)) + "%"
percentage_s = pd.Series([percentage_survivors], index=['% Survivors'])
percentage_s

% Survivors    41.29%
dtype: object

### Survivors Alone

In [261]:
alone_survivors = df[(df['survived'] == 1) & (df['alone'])].shape[0]
not_alone_survivors = df[(df['survived'] == 1) & (df['alone']==False)].shape[0]
alone_survivors = pd.DataFrame({'Lonely Survivors': alone_survivors,
                                '