Read the original datasets

In [2]:
import pandas as pd

# Read both raw splits and tag every row with the split it came from.
# The test rows also receive a placeholder Survived value (-1) so that
# both frames expose the same set of columns.
train_df = pd.read_csv("train.csv").assign(TrainSplit='train')
test_df = pd.read_csv("test.csv").assign(TrainSplit='test', Survived=-1)

# Keep the test-set PassengerId column aside; it is dropped during
# processing and needed again when the cleaned test set is written out.
passenger_ids = test_df['PassengerId']

# Stack train on top of test so every transformation is applied to both.
# Row labels are intentionally left as-is (no ignore_index): each split
# keeps the index it had when it was read.
full_df = pd.concat(objs=[train_df, test_df], sort=False)

Feature Engineering and Preprocessing

In [3]:
# NOTE: this cell rebinds `full_df` and drops the `Name` column it reads from,
# so it is meant to run exactly once after the loading cell; re-run the loading
# cell first if this one needs to be executed again.

# Feature: FamilySize = siblings/spouses + parents/children + the passenger.
full_df['FamilySize'] = full_df['SibSp'] + full_df['Parch'] + 1

# Feature: IsAlone = 1 when FamilySize is exactly 1, otherwise 0.
# Vectorized comparison instead of a per-row Python lambda.
full_df['IsAlone'] = full_df['FamilySize'].eq(1).astype('int64')

# Feature: Title = first run of letters in Name that follows a space and is
# terminated by a period. The pattern is a raw string so that `\.` reaches the
# regex engine as an escaped dot; in a plain string literal `\.` is an invalid
# escape sequence, which Python warns about.
full_df['Title'] = full_df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Collapse infrequent titles into one bucket and fold spelling variants
# into their common equivalents.
full_df['Title'] = full_df['Title'].replace(['Lady', 'Countess','Capt', 'Col',
     'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
full_df['Title'] = full_df['Title'].replace(['Mlle', 'Ms'], 'Miss')
full_df['Title'] = full_df['Title'].replace('Mme', 'Mrs')

# Drop unused columns
full_df = full_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# Fill missing values
# NOTE(review): the medians and the mode below are computed over the train and
# test rows together. If test-set statistics must not influence preprocessing,
# compute them on the rows where TrainSplit == 'train' instead.
full_df['Age'] = full_df['Age'].fillna(full_df['Age'].median())
full_df['Fare'] = full_df['Fare'].fillna(full_df['Fare'].median())
full_df['Embarked'] = full_df['Embarked'].fillna(full_df['Embarked'].mode()[0])

# One-hot encoding; drop_first=True omits the first level of each encoded
# column so the remaining indicator columns are not redundant.
full_df = pd.get_dummies(full_df, columns=['Sex', 'Embarked', 'Title'], drop_first=True)

# Split back to train/test. The train frame keeps Survived as its target;
# the test frame drops the placeholder Survived column.
train_cleaned = full_df[full_df['TrainSplit'] == 'train'].drop(['TrainSplit'], axis=1)
test_cleaned = full_df[full_df['TrainSplit'] == 'test'].drop(['TrainSplit', 'Survived'], axis=1)

Save the data to CSV

In [4]:
# Save to CSV
train_cleaned.to_csv("train_cleaned.csv", index=False)

# Reinsert ID for submission.
# The guard makes this cell safe to re-run: DataFrame.insert raises ValueError
# when the column already exists. The IDs are inserted by position (as a plain
# array) because test_cleaned holds the test rows in the same order as the
# file passenger_ids was read from, so no index alignment is required.
if "PassengerId" not in test_cleaned.columns:
    test_cleaned.insert(0, "PassengerId", passenger_ids.to_numpy())
test_cleaned.to_csv("test_cleaned.csv", index=False)

print("Saved: train_cleaned.csv and test_cleaned.csv")

Saved: train_cleaned.csv and test_cleaned.csv
