In [3]:
import numpy as np
import pandas as pd

In [None]:
# SECTION A — BASIC OPERATIONS
# 1. Load the Titanic dataset into a pandas DataFrame.
df = pd.read_csv('titanic_data.csv.zip')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [14]:
# 2. Display the first 5 rows of the DataFrame.
df.head(5)
# 3. Display the last 5 rows of the DataFrame.
df.tail(5)
# 4. Print the number of rows and columns in the dataset.
df.shape
# 5. Display all column names and their data types.
df.info()
# 6. Show a statistical summary of all numerical columns.
df.describe()
# 7. Show the count of non-null values per column.
df.notnull().sum()
df.count()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [None]:
# SECTION B — MISSING VALUES & CLEANING
# 1. Identify which columns contain missing values.
df.isnull().sum()
# 2. Count missing values in each column.
df.isna().sum()
# 3. Drop the Cabin column from the DataFrame.
df.drop(columns=['Cabin'])
# 4. Fill missing values in Age with the mean age.
df['Age'] = df['Age'].fillna(df['Age'].mean())
# 5. Fill missing values in Embarked with the most frequent port.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
# 6. Check that there are no missing values left in Age and Embarked.
df[['Age', 'Embarked']].isnull().sum()
# 7. Remove duplicate rows if any.
df = df.drop_duplicates()

In [None]:
# SECTION C — DATA MANIPULATION
# 1. Convert the Sex column into a numeric format (e.g., male → 0, female → 1).
df['Sex'] = df['Sex'].replace({'male': 0, 'female': 1}, inplace = True)
# 2. Convert the Embarked column to a categorical datatype.
df['Embarked'] = df['Embarked'].astype('category')
# 3. Create a new column FamilySize = SibSp + Parch + 1.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df
# 4. Create a new column IsAlone where 1 indicates the passenger is alone (FamilySize == 1), else 0.
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
# 5. Rename the column Pclass to PassengerClass.
df.rename(columns={'Pclass': 'PassengerClass'})
# 6. Convert Fare to integer values.
df['Fare'].fillna(0, inplace=True)
df['Fare'] = df['Fare'].astype(int)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Sex'] = df['Sex'].replace({'male': 0, 'female': 1}, inplace = True)


In [37]:
# SECTION D — FILTERING & SELECTION
# 1. Filter and display all passengers who survived (Survived == 1).
survived_passengers = df[df['Survived'] == 1]
# 2. Get passengers who are in 1st class (PassengerClass == 1).
first_class_passengers = df[df['PassengerClass'] == 1]
# 3. Filter passengers whose age is greater than 50.
older_passengers = df[df['Age'] > 50]
# 4. Filter female passengers who survived.
female_survivors = df[(df['Sex'] == 1) & (df['Survived'] == 1)]
# 5. Select passengers who paid a fare above the average fare.
avg_fare = df[df['Fare'] > df['Fare'].mean()]
# 6. Select only columns: Name, Sex, Age, Survived.
cols = df[['Name', 'Sex', 'Age', 'Survived']]
# 7. Using loc, select rows from index 100 to index 150 (inclusive).
rows = df.loc[100:151]
# 8. Using iloc, select the first 10 rows and first 6 columns.
rows_cols = df.iloc[:10, :6]


In [57]:
# SECTION E — SORTING & AGGREGATION
# 1. Sort passengers by Fare in descending order.
df.sort_values(by='Fare', ascending=False)
# 2. Sort passengers first by PassengerClass, then by Fare in descending order.
df.sort_values(by=['PassengerClass', 'Fare'], ascending=[True, False])
# 3. Find the average age of passengers.
df['Age'].mean()
# 4. Compute the average fare paid by each passenger class.
df.groupby('PassengerClass')['Fare'].mean()
# 5. Count the number of passengers in each embarkation port.
df['Embarked'].value_counts()
# 6. Count how many male and female passengers there are.

KeyError: 'PassengerClass'

In [55]:
df = pd.read_csv('titanic_data.csv.zip')
df
df['Sex'].head(10)
df['Sex'].unique()
df['Sex'].replace({'male': 0, 'female': 1}, inplace=True)
df['Sex'].value_counts()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Sex'].replace({'male': 0, 'female': 1}, inplace=True)
  df['Sex'].replace({'male': 0, 'female': 1}, inplace=True)


Sex
0    577
1    314
Name: count, dtype: int64

In [None]:
# SECTION F — GROUPBY & APPLY
# 1. Group the dataset by PassengerClass and compute:
# ○ mean age
# ○ mean fare
# ○ survival rate (mean of Survived)

# 2. Find the survival rate by gender.
df.groupby('Sex')['Survived'].mean()

# 3. Use apply() to create a new column AgeGroup:
# ○ "Child" if Age < 12
# ○ "Adult" if 12 ≤ Age < 60
# ○ "Senior" if Age ≥ 60
df['AgeGroup'] = df['Age'].apply(
    lambda x: 'Child' if x < 12 else ('Adult' if x < 60 else 'Senior'))
# 4. Reset the index of the grouped result from question 35.


KeyError: 'PassengerClass'

In [68]:
# SECTION G — ANALYSIS & EXPORT

# 1. Identify the passenger(s) who paid the highest fare.
max_fare = df['Fare'].max()
highest_fare_passengers = df[df['Fare'] == max_fare]
# 2. Find which embarkation port has the highest number of passengers.
df['Embarked'].value_counts()
# 3. Compare survival rate between passengers who were alone vs those with family.

# 4. Export the cleaned and processed DataFrame to a new CSV file named Titanic_Cleaned.csv.
df.to_csv('Titanic_Cleaned.csv', index=False)
