Dataset: Titanic  
Link: https://www.kaggle.com/c/titanic/data  
Target: Survived


**PART 0 – Import Libraries**

In [18]:
# Allowed libraries only
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer



In [19]:
# Load same Kaggle dataset used in Assignment 13
df = pd.read_csv("train.csv")

df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


PART 1 – FEATURE ENGINEERING

In [20]:
#Task 1 – Creating New Features
# Create new meaningful features

# 1. Family Size = SibSp + Parch + 1
df['family_size'] = df['SibSp'] + df['Parch'] + 1

# 2. Fare per person
df['fare_per_person'] = df['Fare'] / df['family_size']

# Add age group feature
df['age_group'] = pd.cut(df['Age'],
                         bins=[0,12,18,35,60,100],
                         labels=['child','teen','adult','middle','senior'])

df[['Age','family_size','fare_per_person','age_group']].head()


Unnamed: 0,Age,family_size,fare_per_person,age_group
0,22.0,2,3.625,adult
1,38.0,2,35.64165,middle
2,26.0,1,7.925,adult
3,35.0,2,26.55,adult
4,35.0,1,8.05,adult


In [21]:
#Task 2 – Handling Date & Text Features
# Titanic dataset has no date column
# Text column available: Name

# Extract length of name as text feature
df['name_length'] = df['Name'].apply(len)

# Extract title from name
df['title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

df[['Name','name_length','title']].head()


  df['title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


Unnamed: 0,Name,name_length,title
0,"Braund, Mr. Owen Harris",23,Mr
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51,Mrs
2,"Heikkinen, Miss. Laina",22,Miss
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44,Mrs
4,"Allen, Mr. William Henry",24,Mr


PART 2 – FEATURE ENCODING

In [22]:
#Task 3 – One Hot Encoding using get_dummies
# Simple encoding for understanding
encoded_df = pd.get_dummies(df[['Sex','Embarked','Pclass']])

encoded_df.head()


Unnamed: 0,Pclass,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,False,True,False,False,True
1,1,True,False,True,False,False
2,3,True,False,False,False,True
3,1,True,False,False,False,True
4,3,False,True,False,False,True


In [23]:
#Task 4 – ColumnTransformer (Recommended)
# Separate features

numerical_features = ['Age','Fare','family_size','fare_per_person','name_length']

categorical_features = ['Sex','Embarked','Pclass','age_group','title']

# Define transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num','passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Fit transform
X_transformed = preprocessor.fit_transform(df)

print("Transformed Shape:", X_transformed.shape)


Transformed Shape: (891, 37)


PART 3 – FEATURE SCALING

In [24]:
#Task 5 – StandardScaler
scaler = StandardScaler()

scaled_standard = scaler.fit_transform(df[numerical_features])

print("Mean after scaling (approx 0):")
print(scaled_standard.mean())

print("Std after scaling (approx 1):")
print(scaled_standard.std())


Mean after scaling (approx 0):
nan
Std after scaling (approx 1):
nan


In [25]:
#Task 6 – MinMaxScaler
minmax = MinMaxScaler()

scaled_minmax = minmax.fit_transform(df[numerical_features])

print("Min value:", scaled_minmax.min())
print("Max value:", scaled_minmax.max())


Min value: nan
Max value: nan


PART 4 – BUILDING PIPELINE

In [28]:
#Task 7 – Preprocessing Pipeline
# Numerical pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # <-- FIX
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])


# Combine using ColumnTransformer
preprocess_pipeline = ColumnTransformer([
    ('num', num_pipeline, numerical_features),
    ('cat', cat_pipeline, categorical_features)
])


In [31]:
#Task 8 – Full Scikit-learn Pipeline
# Define final pipeline
model_pipeline = Pipeline([
    ('preprocess', preprocess_pipeline),
    ('model', LogisticRegression(max_iter=1000))
])

# Prepare data
X = df[numerical_features + categorical_features]
y = df['Survived']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Fit pipeline
model_pipeline.fit(X_train, y_train)

# Predictions
pred = model_pipeline.predict(X_test)

print("Pipeline executed successfully")


Pipeline executed successfully


**TASK 9 – CONCEPTUAL ANSWERS**


### 1. Why pipelines are important in ML?

* Automates the full preprocessing and modeling workflow
* Ensures same steps for train and test data
* Keeps code clean and organized
* Makes the model reusable and easy to deploy
* Reduces chances of manual mistakes


### 2. What problems do pipelines solve?

* Prevents data leakage between train and test
* Avoids repeating preprocessing code
* Maintains consistent transformations
* Simplifies model deployment
* Reduces human errors



### 3. Manual preprocessing vs pipeline-based preprocessing

**Manual preprocessing**

* Steps written separately
* Risk of train–test mismatch
* Hard to reuse code
* More error-prone

**Pipeline preprocessing**

* All steps combined in one flow
* Same process for train and test
* Easy to reuse and modify
* More reliable and structured


