In [1]:
# ----------------------------------
# DAY 5: PANDAS ADVANCED
# ----------------------------------

import pandas as pd
import numpy as np

print("ðŸš€ Day 5: Pandas Advanced ðŸš€\n")

# Load cleaned Titanic dataset (or load fresh if needed)
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Basic cleaning again (safe practice)
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Create FamilySize again
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1


ðŸš€ Day 5: Pandas Advanced ðŸš€



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


groupby() with Multiple Aggregations

In [2]:
# Survival rate & average fare by class
grouped = df.groupby('Pclass').agg({
    'Survived': 'mean',
    'Fare': 'mean',
    'Age': 'mean'
})

print(grouped)


        Survived       Fare        Age
Pclass                                
1       0.629630  84.154687  37.048118
2       0.472826  20.662183  29.866958
3       0.242363  13.675550  26.403259


pivot_table() (VERY IMPORTANT)

In [3]:
# Survival rate by Sex and Pclass
pivot = pd.pivot_table(
    df,
    values='Survived',
    index='Sex',
    columns='Pclass',
    aggfunc='mean'
)

print(pivot)


Pclass         1         2         3
Sex                                 
female  0.968085  0.921053  0.500000
male    0.368852  0.157407  0.135447


apply() â€“ Custom Feature Engineering

In [4]:
# Create category from FamilySize
def family_type(size):
    if size == 1:
        return "Solo"
    elif size <= 4:
        return "Small Family"
    else:
        return "Large Family"

df['FamilyType'] = df['FamilySize'].apply(family_type)

df[['FamilySize', 'FamilyType']].head()


Unnamed: 0,FamilySize,FamilyType
0,2,Small Family
1,2,Small Family
2,1,Solo
3,2,Small Family
4,1,Solo


map() & replace()

In [5]:
# Convert Sex into numeric
df['Sex_num'] = df['Sex'].map({'male': 0, 'female': 1})

# Replace Embarked letters with full names
df['Embarked_Full'] = df['Embarked'].replace({
    'C': 'Cherbourg',
    'Q': 'Queenstown',
    'S': 'Southampton'
})

df[['Sex', 'Sex_num', 'Embarked', 'Embarked_Full']].head()


Unnamed: 0,Sex,Sex_num,Embarked,Embarked_Full
0,male,0,S,Southampton
1,female,1,C,Cherbourg
2,female,1,S,Southampton
3,female,1,S,Southampton
4,male,0,S,Southampton


cut() & qcut() (Binning Data)

In [7]:
# Age bins
df['AgeGroup'] = pd.cut(
    df['Age'],
    bins=[0, 12, 20, 40, 60, 100],
    labels=['Child', 'Teen', 'Adult', 'Middle-Aged', 'Senior']
)

# Fare bins (quantile-based)
df['FareGroup'] = pd.qcut(df['Fare'], 4, labels=['Low', 'Medium', 'High', 'Very High'])

df[['Age', 'AgeGroup', 'Fare', 'FareGroup']].head()


Unnamed: 0,Age,AgeGroup,Fare,FareGroup
0,22.0,Adult,7.25,Low
1,38.0,Adult,71.2833,Very High
2,26.0,Adult,7.925,Medium
3,35.0,Adult,53.1,Very High
4,35.0,Adult,8.05,Medium


sort_values() & rank()

In [8]:
# Sort by Fare
df.sort_values(by='Fare', ascending=False).head()

# Rank passengers by fare
df['FareRank'] = df['Fare'].rank(ascending=False)

df[['Fare', 'FareRank']].head()


Unnamed: 0,Fare,FareRank
0,7.25,815.0
1,71.2833,103.0
2,7.925,659.5
3,53.1,144.0
4,8.05,628.0


Filtering with Multiple Conditions

In [9]:
# Female passengers in 1st class who survived
df[(df['Sex'] == 'female') & (df['Pclass'] == 1) & (df['Survived'] == 1)].head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,FamilyType,Sex_num,Embarked_Full,AgeGroup,FareGroup,FareRank
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,Small Family,1,Cherbourg,Adult,Very High,103.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,Small Family,1,Southampton,Adult,Very High,144.0
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S,1,Solo,1,Southampton,Middle-Aged,High,277.0
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,29.699118,1,0,PC 17569,146.5208,B78,C,2,Small Family,1,Cherbourg,Adult,Very High,30.5
52,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C,2,Small Family,1,Cherbourg,Middle-Aged,Very High,94.0


Mini Project: Advanced Insights

In [10]:
print("== Advanced Insights ==")

# Survival rate by FamilyType
print("\nSurvival Rate by Family Type:")
print(df.groupby('FamilyType')['Survived'].mean())

# Survival rate by FareGroup
print("\nSurvival Rate by Fare Group:")
print(df.groupby('FareGroup')['Survived'].mean())

# Survival rate by AgeGroup & Sex
print("\nSurvival Rate by Age Group and Sex:")
print(pd.pivot_table(df, values='Survived', index='AgeGroup', columns='Sex', aggfunc='mean'))


== Advanced Insights ==

Survival Rate by Family Type:
FamilyType
Large Family    0.161290
Small Family    0.578767
Solo            0.303538
Name: Survived, dtype: float64

Survival Rate by Fare Group:
FareGroup
Low          0.197309
Medium       0.303571
High         0.454955
Very High    0.581081
Name: Survived, dtype: float64

Survival Rate by Age Group and Sex:
Sex            female      male
AgeGroup                       
Child        0.593750  0.567568
Teen         0.755556  0.123077
Adult        0.756614  0.166220
Middle-Aged  0.755556  0.192771
Senior       1.000000  0.105263


  print(df.groupby('FareGroup')['Survived'].mean())
  print(pd.pivot_table(df, values='Survived', index='AgeGroup', columns='Sex', aggfunc='mean'))


Save Advanced Dataset

In [11]:
df.to_csv("titanic_advanced_features.csv", index=False)
print("\nAdvanced dataset saved as titanic_advanced_features.csv")



Advanced dataset saved as titanic_advanced_features.csv
