In [1]:
# Cell 1: Load Titanic dataset
# Read the passenger table from a relative CSV path (resolved against the
# kernel's working directory) and preview the first rows as a parse check.
import pandas as pd

csv_path = 'titanic.csv'
titanic = pd.read_csv(csv_path)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
# Cell 2: Apply concat
# Stack two row slices of `titanic` vertically into a single frame.
# ignore_index=True discards the original row labels and renumbers from 0.
# NOTE(review): the slices cover rows [0, 30) and [300, end), so rows 30-299
# are absent from `combined` even though the names say "half" - confirm the
# gap is intended.
first_half = titanic[:30]
second_half = titanic[300:]
row_slices = [first_half, second_half]
combined = pd.concat(row_slices, ignore_index=True)
combined.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Cell 3: Apply dropna (remove rows with missing Age)
# Only the Age column is checked; missing values in other columns (e.g. Cabin)
# do not cause a row to be removed.
# NOTE(review): the next cell starts again from `combined`, so `dropped` is
# not consumed by any later cell visible here - confirm that is intended.
required_columns = ['Age']
dropped = combined.dropna(subset=required_columns)
dropped.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
# Cell 4: Apply fillna (fill missing Fare with 0)
# The dict form restricts the fill to the Fare column; other columns keep
# their missing values.
# NOTE(review): the input is `combined`, not the dropna result, so rows with a
# missing Age are still present in `filled` - confirm that is intended.
fill_values = {'Fare': 0}
filled = combined.fillna(fill_values)
filled.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
# Cell 5: Apply melt (unpivot Pclass and Fare into long format)
# Each passenger yields one row per melted column: `variable` holds the source
# column name ('Pclass' or 'Fare') and `value` holds that column's entry.
# PassengerId is carried along as the identifier; all other columns are left out.
key_columns = ['PassengerId']
measure_columns = ['Pclass', 'Fare']
melted = pd.melt(filled, id_vars=key_columns, value_vars=measure_columns)
melted.head()

Unnamed: 0,PassengerId,variable,value
0,1,Pclass,3.0
1,2,Pclass,1.0
2,3,Pclass,3.0
3,4,Pclass,1.0
4,5,Pclass,3.0


In [13]:
# Cell 6: Apply merge (attach Survived from the original titanic table)
# Narrow `titanic` to the join key plus Survived, then left-join it onto the
# long-form table so every row of `melted` is kept.
lookup_columns = ['PassengerId', 'Survived']
titanic_short = titanic[lookup_columns]
merged = pd.merge(melted, titanic_short, on='PassengerId', how='left')
merged.head()

Unnamed: 0,PassengerId,variable,value,Survived
0,1,Pclass,3.0,0
1,2,Pclass,1.0,1
2,3,Pclass,3.0,1
3,4,Pclass,1.0,1
4,5,Pclass,3.0,0


In [14]:
# Cell 7: Apply pivot (reshape the merged table)
# Turn the long-form melted+merged frame back into a wide table:
# - rows:    one per PassengerId (becomes the index)
# - columns: one per distinct entry of 'variable' (here 'Fare' and 'Pclass')
# - cells:   the mean of 'value' for each (PassengerId, variable) pair
# 'Survived' is not named in the call, so it does not appear in the result.
pivoted = merged.pivot_table(
    values='value',
    index='PassengerId',
    columns='variable',
    aggfunc='mean',
)

pivoted.head()

variable,Fare,Pclass
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7.25,3.0
2,71.2833,1.0
3,7.925,3.0
4,53.1,1.0
5,8.05,3.0


In [15]:
# Cell 8: Final groupby (group by index from pivoted result and average over pivoted values)
# The grouping key is the frame's own index (PassengerId) and no value columns
# are selected, so .mean() is applied to every column of `pivoted`.
# In the preview below, the rows shown are the same as the pivot output above.
grouped = pivoted.groupby(pivoted.index).mean()
grouped.head()

variable,Fare,Pclass
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7.25,3.0
2,71.2833,1.0
3,7.925,3.0
4,53.1,1.0
5,8.05,3.0


In this notebook, the final groupby operation (`grouped = pivoted.groupby(pivoted.index).mean()`) groups by the index (PassengerId) without explicitly selecting any value columns. As a result, during replay, the extracted param.json will show `"value": []` to reflect the absence of an explicit column selection. This does not affect the operator graph: the dataflow still correctly captures the lineage from the input DataFrame (`pivoted`), and any downstream operations will use all available columns of that groupby output as needed.