# Analysis of the Titanic Disaster

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Overview

In [None]:
# Read the two CSV files under ../data into DataFrames. The paths are relative,
# so they resolve against the notebook's current working directory.
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')

In [None]:
# Print a summary of train_data: column names, non-null counts and dtypes.
train_data.info()

In [None]:
# Preview the first rows of train_data (head() returns 5 rows by default).
train_data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for train_data;
# with default arguments describe() reports on the numeric columns.
train_data.describe()

## Survival Rate by Passenger Class

In [None]:
# Each bar is the mean of 'Survived' within one 'Pclass' value (barplot's
# default estimator); ci=None switches the error bars off.
# NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of `errorbar=None`;
# switch once the installed seaborn version is confirmed.
ax = sns.barplot(data=train_data, x='Pclass', y='Survived', ci=None)
ax.set_title('Survival Rate by Passenger Class')
ax.set_xlabel('Passenger Class')
ax.set_ylabel('Survival Rate')
plt.show()

## Survival Rate by Sex

In [None]:
# Each bar is the mean of 'Survived' within one 'Sex' value (barplot's default
# estimator); ci=None switches the error bars off.
# NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of `errorbar=None`;
# switch once the installed seaborn version is confirmed.
ax = sns.barplot(data=train_data, x='Sex', y='Survived', ci=None)
ax.set_title('Survival Rate by Sex')
ax.set_xlabel('Sex')
ax.set_ylabel('Survival Rate')
plt.show()

## Survival Rate by Number of Siblings/Spouses (SibSp)

In [None]:
# Each bar is the mean of 'Survived' within one 'SibSp' value (barplot's
# default estimator); ci=None switches the error bars off.
# NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of `errorbar=None`;
# switch once the installed seaborn version is confirmed.
ax = sns.barplot(data=train_data, x='SibSp', y='Survived', ci=None)
ax.set_title('Survival Rate by Number of Siblings/Spouses')
ax.set_xlabel('SibSp')
ax.set_ylabel('Survival Rate')
plt.show()

## Survival Rate by Number of Parents/Children (Parch)

In [None]:
# Each bar is the mean of 'Survived' within one 'Parch' value (barplot's
# default estimator); ci=None switches the error bars off.
# NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of `errorbar=None`;
# switch once the installed seaborn version is confirmed.
ax = sns.barplot(data=train_data, x='Parch', y='Survived', ci=None)
ax.set_title('Survival Rate by Number of Parents/Children')
ax.set_xlabel('Parch')
ax.set_ylabel('Survival Rate')
plt.show()

## Survival Rate by Place of Embarkation

In [None]:
# Each bar is the mean of 'Survived' within one 'Embarked' value (barplot's
# default estimator); ci=None switches the error bars off.
# NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of `errorbar=None`;
# switch once the installed seaborn version is confirmed.
ax = sns.barplot(data=train_data, x='Embarked', y='Survived', ci=None)
ax.set_title('Survival Rate by Place of Embarkation')
ax.set_xlabel('Place of Embarkation')
ax.set_ylabel('Survival Rate')
plt.show()

## Smoothed Survival Rate by Age

In [None]:
# Mean of 'Survived' for every distinct 'Age' value (groupby leaves out rows
# whose age is missing), followed by a centred rolling mean over 5 neighbouring
# entries; min_periods=1 lets the ends of the curve use a shorter window.
# NOTE(review): the window covers 5 adjacent distinct ages rather than 5 years,
# and each age weighs the same however many passengers share it.
age_survival = train_data.groupby('Age')['Survived'].mean()
age_survival_smooth = age_survival.rolling(5, center=True, min_periods=1).mean()

# Draw the smoothed curve on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(age_survival.index, age_survival_smooth, linewidth=2, color='steelblue')
ax.set_title('Smoothed Survival Rate by Age')
ax.set_xlabel('Age')
ax.set_ylabel('Survival Rate')
ax.set_ylim(0, 1)  # the plotted values are means of 'Survived'; pin the axis to [0, 1]
ax.grid(True, linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()

## Smoothed Survival Rate by Fare

In [None]:
# Mean of 'Survived' for every distinct 'Fare' value, followed by a centred
# rolling mean over 15 neighbouring entries; min_periods=1 lets the ends of the
# curve use a shorter window.
# NOTE(review): the window covers 15 adjacent distinct fares, not a fixed fare
# range, and each fare weighs the same however many passengers paid it.
fare_survival = train_data.groupby('Fare')['Survived'].mean()
fare_survival_smooth = fare_survival.rolling(15, center=True, min_periods=1).mean()

# Draw the smoothed curve on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(fare_survival.index, fare_survival_smooth, linewidth=2, color='darkorange')
ax.set_title('Smoothed Survival Rate by Fare')
ax.set_xlabel('Fare')
ax.set_ylabel('Survival Rate')
ax.set_ylim(0, 1)
ax.set_xlim(0, 300)  # Zoom in on x-axis from 0 to 300
ax.grid(True, linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()

## Survival Rate by Cabin

In [None]:
# Work on a copy of the rows that have a 'Cabin' value, and reduce each value
# to its first whitespace-separated token (an entry listing several cabins is
# counted under the first one only).
cabin_data = train_data[train_data['Cabin'].notna()].copy()
cabin_data['Cabin'] = cabin_data['Cabin'].map(lambda entry: entry.split()[0])

# value_counts() sorts by descending frequency, so the first five index labels
# are the five most common cabins.
top5_cabins = cabin_data['Cabin'].value_counts().index[:5]

# Mean of 'Survived' per cabin, restricted to those five, highest first.
top5_cabin_survival = (
    cabin_data.loc[cabin_data['Cabin'].isin(top5_cabins)]
    .groupby('Cabin')['Survived']
    .mean()
    .sort_values(ascending=False)
)

print("Top 5 cabins and their survival rates:")
print(top5_cabin_survival)