# Titanic Survival Prediction - Data Cleaning Pipeline

In [19]:
import pandas as pd
from src.data_loading import load_data
from src.preprocessing import (
    extract_title,
    add_family_features,
    fill_missing_embarked,
    fill_missing_age,
    simplify_cabin_column,
    add_age_band,
    add_fare_band,
    encode_categoricals
)

In [17]:
from src.preprocessing import drop_unused_columns


In [5]:
# STEP 1: Load Raw Data
train_df, test_df = load_data()

In [6]:
# STEP 2: Feature Engineering Pipeline
train_df = extract_title(train_df)
train_df = add_family_features(train_df)
train_df = fill_missing_embarked(train_df)
train_df = fill_missing_age(train_df)
train_df = simplify_cabin_column(train_df)
train_df = add_age_band(train_df)
train_df = add_fare_band(train_df)
train_df = encode_categoricals(train_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(mode, inplace=True)


In [24]:
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked', 'Title', 'FamilySize', 'IsAlone']
X_test = test_df[features]


KeyError: "['Title', 'FamilySize', 'IsAlone'] not in index"

In [20]:
# STEP 3: Inspect Output
print("Preview of cleaned dataset:")
display(train_df.head())

print("Missing values check:")
display(train_df.isnull().sum())

print("Column types:")
display(train_df.dtypes)

Preview of cleaned dataset:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilySize,IsAlone,AgeBand,FareBand
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,0,0,0,2,0,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,1,1,1,2,0,2,3
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,0,0,2,1,1,2,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,1,0,1,2,0,2,3
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,0,0,0,1,1,2,1


Missing values check:


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Title          0
FamilySize     0
IsAlone        0
AgeBand        0
FareBand       0
dtype: int64

Column types:


PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin            int64
Embarked         int64
Title            int64
FamilySize       int64
IsAlone          int64
AgeBand          int64
FareBand         int64
dtype: object

In [7]:
# STEP 4: Preview Useful Features
print("Selected features for modeling:")
display(train_df[["Survived", "Pclass", "Sex", "Age", "Fare", "Title", "FamilySize", "IsAlone", "Embarked"]].head())

Selected features for modeling:


Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Title,FamilySize,IsAlone,Embarked
0,0,3,0,22.0,7.25,0,2,0,0
1,1,1,1,38.0,71.2833,1,2,0,1
2,1,3,1,26.0,7.925,2,1,1,0
3,1,1,1,35.0,53.1,1,2,0,0
4,0,3,0,35.0,8.05,0,1,1,0


In [8]:
from src.preprocessing import drop_unused_columns

# Choose whether to keep Age/Fare
train_df = drop_unused_columns(train_df, keep_continuous=True)

train_df.head()


Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,FamilySize,IsAlone,AgeBand,FareBand
0,0,3,0,22.0,7.25,0,0,0,2,0,1,0
1,1,1,1,38.0,71.2833,1,1,1,2,0,2,3
2,1,3,1,26.0,7.925,0,0,2,1,1,2,1
3,1,1,1,35.0,53.1,1,0,1,2,0,2,3
4,0,3,0,35.0,8.05,0,0,0,1,1,2,1


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# STEP 1: Choose features and target
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked', 'Title', 'FamilySize', 'IsAlone']
X = train_df[features]
y = train_df['Survived']

# STEP 2: Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# STEP 3: Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# STEP 4: Evaluate
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print("Validation Accuracy:", accuracy)
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))


Validation Accuracy: 0.8100558659217877

Confusion Matrix:
 [[90 15]
 [19 55]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [8]:
_, test_df = load_data()

In [11]:
test_df = drop_unused_columns(test_df, keep_continuous=True)

In [25]:
X_test = test_df[features]

KeyError: "['Title', 'FamilySize', 'IsAlone'] not in index"

In [26]:
test_preds = model.predict(X_test)


NameError: name 'X_test' is not defined

In [15]:
test_preds = model.predict(X_test)


NameError: name 'model' is not defined

In [21]:
test_preds = model.predict(X_test)


NameError: name 'model' is not defined

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Redefine features and target
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked', 'Title', 'FamilySize', 'IsAlone']
X = train_df[features]
y = train_df['Survived']

# Train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [23]:
test_preds = model.predict(X_test)


NameError: name 'X_test' is not defined