In [None]:
# Week 3: Data Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for better looking plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ Week 3 Visualization Environment Ready!")

# Load the CLEANED dataset saved in Week 2. If that file is not present
# (fresh clone, new runtime, ...), rebuild it from the raw CSV with the same
# quick cleaning steps so the notebook still runs top to bottom.
cleaned_csv_path = 'titanic_cleaned.csv'
raw_csv_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

try:
    df = pd.read_csv(cleaned_csv_path)
except FileNotFoundError:
    print(f"'{cleaned_csv_path}' not found - rebuilding it from the raw dataset...")
    df = pd.read_csv(raw_csv_url)
    # Quick cleaning (repeat from Week 2)
    df = df.drop('Cabin', axis=1)
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection, which newer pandas versions warn about.
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print("\nColumns available:", df.columns.tolist())

In [None]:
print("=== DATASET OVERVIEW FOR VISUALIZATION ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
df.info()
print("\nFirst 5 rows:")
display(df.head())

In [None]:
print("=== BAR CHART ===")
plt.figure(figsize=(10, 6))

# Example: survival rate (%) by passenger class.
# The mean of the 0/1 'Survived' column is the fraction that survived.
survival_by_class = df.groupby('Pclass')['Survived'].mean() * 100

plt.bar(survival_by_class.index, survival_by_class.values,
        color=['skyblue', 'lightcoral', 'lightgreen'])
plt.title('Survival Rate by Passenger Class', fontsize=14, fontweight='bold')
plt.xlabel('Passenger Class (1st, 2nd, 3rd)')
plt.ylabel('Survival Rate (%)')
plt.xticks([1, 2, 3], ['1st Class', '2nd Class', '3rd Class'])
plt.grid(axis='y', alpha=0.3)

# Add value labels on bars. The x position is the class value taken from the
# index (the same x the bar was drawn at), not a running counter, so labels
# stay aligned with their bars.
for pclass, rate in survival_by_class.items():
    plt.text(pclass, rate + 1, f'{rate:.1f}%', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("📊 Bar Chart Created: Survival Rate by Passenger Class")

In [None]:
print("=== SCATTER PLOT ===")
plt.figure(figsize=(10, 6))

# Create scatter plot with coloring based on survival
scatter = plt.scatter(df['Age'], df['Fare'],
                      c=df['Survived'],
                      alpha=0.6,
                      cmap='coolwarm',
                      s=50)  # s controls point size

# 'Survived' only takes the values 0 and 1, so restrict the colorbar ticks to
# those two values instead of showing meaningless fractions in between.
plt.colorbar(scatter, label='Survived (0=No, 1=Yes)', ticks=[0, 1])
plt.title('Age vs Fare colored by Survival', fontsize=14, fontweight='bold')
plt.xlabel('Age')
plt.ylabel('Fare')
plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("📊 Scatter Plot Created: Age vs Fare (colored by survival)")

In [None]:
print("=== HISTOGRAM ===")

# One figure with two side-by-side panels; each panel is described by
# (column, bar colour, title) so both histograms share the same drawing code.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
histogram_specs = [
    ('Age', 'lightblue', 'Distribution of Passenger Ages'),
    ('Fare', 'lightcoral', 'Distribution of Fares'),
]

for ax, (column, bar_color, panel_title) in zip(axes, histogram_specs):
    ax.hist(df[column], bins=30, color=bar_color, edgecolor='black', alpha=0.7)
    ax.set_title(panel_title, fontweight='bold')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    ax.grid(alpha=0.3)

fig.tight_layout()
plt.show()

print("📊 Histograms Created: Age and Fare distributions")

In [None]:
print("=== PLOT 1: SURVIVAL BY GENDER ===")
plt.figure(figsize=(8, 6))

# Survival rate (%) per value of 'Sex': mean of the 0/1 'Survived' column
gender_survival = df.groupby('Sex')['Survived'].mean() * 100

bars = plt.bar(gender_survival.index, gender_survival.values,
               color=['pink', 'lightblue'])

plt.title('Survival Rate by Gender', fontsize=14, fontweight='bold')
plt.xlabel('Gender')
plt.ylabel('Survival Rate (%)')
plt.ylim(0, 100)  # fixed 0-100 axis so the percentages are read on a full scale

# Add value labels, centred horizontally on each bar and slightly above it
for bar, value in zip(bars, gender_survival.values):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 2,
             f'{value:.1f}%', ha='center', va='bottom', fontweight='bold')

plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

print("""
📝 INSIGHT 1: Survival by Gender
- Females had a significantly higher survival rate than males
- This reflects the "women and children first" protocol during the evacuation
- Gender was a major factor in survival chances
""")

In [None]:
print("=== PLOT 2: AGE DISTRIBUTION BY SURVIVAL ===")
plt.figure(figsize=(10, 6))

# Select the ages of each group with a single .loc lookup (row mask + column)
# rather than chained indexing.
ages_survived = df.loc[df['Survived'] == 1, 'Age']
ages_not_survived = df.loc[df['Survived'] == 0, 'Age']

# Passing both series in one call draws the two groups over the same bins
plt.hist([ages_survived, ages_not_survived],
         bins=20, label=['Survived', 'Did Not Survive'],
         color=['green', 'red'], alpha=0.7, edgecolor='black')

plt.title('Age Distribution by Survival Status', fontsize=14, fontweight='bold')
plt.xlabel('Age')
plt.ylabel('Number of Passengers')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print("""
📝 INSIGHT 2: Age Distribution by Survival
- Younger passengers (children) had higher survival rates
- The distribution shows concentration of survivors in lower age ranges
- Elderly passengers were less likely to survive
""")

In [None]:
print("=== PLOT 3: FARE VS AGE BY PASSENGER CLASS ===")
plt.figure(figsize=(10, 6))

# Create scatter plot colored by passenger class
scatter = plt.scatter(df['Age'], df['Fare'],
                      c=df['Pclass'],
                      cmap='viridis',
                      alpha=0.6,
                      s=40)

# Passenger class is a discrete 1/2/3 code, so only tick those values
plt.colorbar(scatter, label='Passenger Class (1=Best, 3=Worst)', ticks=[1, 2, 3])
plt.title('Fare vs Age colored by Passenger Class', fontsize=14, fontweight='bold')
plt.xlabel('Age')
plt.ylabel('Fare')
# A plain log axis cannot display a fare of 0, so any such passengers would
# vanish from the plot. 'symlog' is linear up to linthresh and logarithmic
# above it, which keeps zero fares visible while still compressing the
# large fares.
plt.yscale('symlog', linthresh=1)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print("""
📝 INSIGHT 3: Fare vs Age by Class
- 1st class passengers paid significantly higher fares across all ages
- There's a clear stratification by class in terms of fare paid
- Younger 1st class passengers paid similar fares to older 1st class passengers
- Class privilege is clearly visible in the pricing structure
""")

In [None]:
print("=== PLOT 4: SURVIVAL BY EMBARKATION PORT ===")
plt.figure(figsize=(8, 6))

# Survival rate (%) per embarkation port: mean of the 0/1 'Survived' column
embark_survival = df.groupby('Embarked')['Survived'].mean() * 100

bars = plt.bar(embark_survival.index, embark_survival.values,
               color=['lightblue', 'lightgreen', 'lightcoral'])

plt.title('Survival Rate by Embarkation Port', fontsize=14, fontweight='bold')
plt.xlabel('Embarkation Port (C=Cherbourg, Q=Queenstown, S=Southampton)')
plt.ylabel('Survival Rate (%)')

# Add value labels, centred horizontally on each bar and slightly above it
for bar, value in zip(bars, embark_survival.values):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
             f'{value:.1f}%', ha='center', va='bottom', fontweight='bold')

plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

print("""
📝 INSIGHT 4: Survival by Embarkation Port
- Passengers from Cherbourg (C) had the highest survival rate
- Southampton (S) passengers had the lowest survival rate
- This could be related to the passenger class distribution at each port
- Location of embarkation may have influenced access to lifeboats
""")

In [None]:
print("=== PLOT 5: FAMILY SIZE VS SURVIVAL ===")
plt.figure(figsize=(10, 6))

# Create family size feature (SibSp + Parch)
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1  # +1 for the passenger themselves

# Create box plot
sns.boxplot(x='Survived', y='FamilySize', data=df, palette=['red', 'green'])
plt.title('Family Size Distribution by Survival Status', fontsize=14, fontweight='bold')
plt.xlabel('Survived (0=No, 1=Yes)')
plt.ylabel('Family Size')
plt.xticks([0, 1], ['Did Not Survive', 'Survived'])
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

# Compute the numbers the insight refers to, so the printed text always
# matches the data instead of asserting a direction that was never checked.
# reindex([0, 1]) keeps both rows present (as NaN) even if one group is empty.
family_size_stats = (
    df.groupby('Survived')['FamilySize'].agg(['mean', 'median']).reindex([0, 1])
)
travelling_alone = df['FamilySize'] == 1
alone_survival_pct = df.loc[travelling_alone, 'Survived'].mean() * 100
with_family_survival_pct = df.loc[~travelling_alone, 'Survived'].mean() * 100

print(f"""
📝 INSIGHT 5: Family Size vs Survival
- Family size of non-survivors: mean {family_size_stats.loc[0, 'mean']:.2f}, median {family_size_stats.loc[0, 'median']:.1f}
- Family size of survivors: mean {family_size_stats.loc[1, 'mean']:.2f}, median {family_size_stats.loc[1, 'median']:.1f}
- Survival rate travelling alone: {alone_survival_pct:.1f}% vs travelling with family: {with_family_survival_pct:.1f}%
- Larger families might have faced challenges staying together during evacuation
""")

In [None]:
print("=== BONUS: CORRELATION HEATMAP ===")
plt.figure(figsize=(10, 8))

# Calculate correlation matrix over the numeric columns only
correlation_matrix = df.select_dtypes(include=[np.number]).corr()

# Create heatmap; center=0 puts the neutral colour at zero correlation and
# fmt='.2f' prints every coefficient with the same two decimals.
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=0.5)
plt.title('Correlation Heatmap of Numerical Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("""
🔥 BONUS INSIGHT: Correlation Heatmap
- Fare and Pclass have strong negative correlation (higher class = higher fare)
- FamilySize is positively correlated with SibSp and Parch by construction (it is SibSp + Parch + 1)
- Survival shows moderate correlation with Fare and Pclass
""")

In [None]:
# Recap of the assignment deliverables. Nothing is written to disk here;
# the notebook itself still has to be saved and uploaded manually.
print("💾 ASSIGNMENT 3 COMPLETE!")
print("5 different plots created with insights:")
print("1. Survival by Gender - Bar Plot")
print("2. Age Distribution by Survival - Histogram")
print("3. Fare vs Age by Class - Scatter Plot")
print("4. Survival by Embarkation Port - Bar Plot")
print("5. Family Size vs Survival - Box Plot")
print("\n📁 Save this notebook as 'week3_data_visualization.ipynb'")
print("🚀 Upload to GitHub to complete Assignment 3!")