In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the training CSV (path is relative to the notebook's working directory)
train_csv_path = '../data/train.csv'
df = pd.read_csv(train_csv_path)

In [5]:
# Report the frame's dimensions and the list of column labels
n_rows_and_cols = df.shape
column_labels = df.columns.tolist()
print(f"Dataset shape: {n_rows_and_cols}")
print(f"Columns: {column_labels}")

Dataset shape: (891, 12)
Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [9]:
# Preview the first five rows to sanity-check how the columns were parsed
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
# Summarize the frame's structure: df.info() prints each column's dtype and
# non-null count, which makes columns with missing values easy to spot.
print("Dataset info:")
df.info()

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [17]:
# Inspect the prediction target: per-value counts first, then the mean of the
# 'Survived' column formatted as a percentage.
survived = df['Survived']
print("Survival distribution:")
print(survived.value_counts())
print(f"\nSurvival rate: {survived.mean():.2%}")



Survival distribution:
Survived
0    549
1    342
Name: count, dtype: int64

Survival rate: 38.38%


In [31]:
# Passenger class distribution: number of passengers per class and the share
# of all (non-null) 'Pclass' values that each class represents.
print("Passenger class distribution:")
pclass_counts = df['Pclass'].value_counts().sort_index()
pclass_props = df['Pclass'].value_counts(normalize=True).sort_index()

# Iterate over the classes actually present in the data (in sorted order)
# rather than a hard-coded [1, 2, 3]: a missing class would raise KeyError,
# and an unexpected class value would be silently left out of the report.
for pclass, count in pclass_counts.items():
    prop = pclass_props[pclass]
    print(f"Class {pclass}: {count} passengers ({prop:.1%})")

Passenger class distribution:
Class 1: 216 passengers (24.2%)
Class 2: 184 passengers (20.7%)
Class 3: 491 passengers (55.1%)


In [33]:
# Gender distribution: number of passengers per 'Sex' label and the share of
# all (non-null) 'Sex' values that each label represents.
print("\nGender distribution:")
gender_counts = df['Sex'].value_counts()
gender_props = df['Sex'].value_counts(normalize=True)

# Iterate over the labels actually present (value_counts orders them by
# descending count) rather than a hard-coded ['male', 'female']: a missing
# label would raise KeyError, and any unexpected label (typo, different
# casing) would be silently left out of the report.
for gender, count in gender_counts.items():
    prop = gender_props[gender]
    # str() keeps .capitalize() safe if a label is not a plain string
    print(f"{str(gender).capitalize()}: {count} passengers ({prop:.1%})")


Gender distribution:
Male: 577 passengers (64.8%)
Female: 314 passengers (35.2%)


In [36]:
# Per-class survival summary: group size, number of survivors, and the mean
# of 'Survived' (rounded to three decimals) for each 'Pclass' value.
print("Survival rate by class:")
survival_by_class = (
    df.groupby('Pclass')['Survived']
    .agg(Total='count', Survived='sum', Survival_Rate='mean')
    .round({'Survival_Rate': 3})
)
print(survival_by_class)

Survival rate by class:
        Total  Survived  Survival_Rate
Pclass                                
1         216       136          0.630
2         184        87          0.473
3         491       119          0.242


In [38]:
# Per-gender survival summary: group size, number of survivors, and the mean
# of 'Survived' (rounded to three decimals) for each 'Sex' value.
print("Survival rate by gender:")
survival_by_gender = (
    df.groupby('Sex')['Survived']
    .agg(Total='count', Survived='sum', Survival_Rate='mean')
    .round({'Survival_Rate': 3})
)
print(survival_by_gender)

Survival rate by gender:
        Total  Survived  Survival_Rate
Sex                                   
female    314       233          0.742
male      577       109          0.189


In [42]:
"""
KEY INSIGHTS FROM EXPLORATION:
- Overall survival rate: 38.4%
- Class 1: 63% survival, Class 2: 47%, Class 3: 24%
- Female: 74% survival, Male: 19% survival
- Missing data: Age (177), Cabin (687), Embarked (2)
"""
print("Exploration complete! Key patterns identified.")

Exploration complete! Key patterns identified.
