## Preprocessing and Pipelines

We will learn how to preprocess data and create machine learning pipelines using scikit-learn.

In [None]:
# Import core data manipulation libraries
import numpy as np
import pandas as pd

# Import scikit-learn utilities for model training and evaluation
from sklearn.model_selection import train_test_split  # Splits data into train/test sets
from sklearn.linear_model import LogisticRegression  # Binary classification model
from sklearn.metrics import accuracy_score  # Measures prediction accuracy

## Importing the Titanic Dataset

In this section we will import the Titanic dataset to demonstrate preprocessing and pipeline creation.

In [None]:
# Load the Titanic passenger dataset through seaborn's dataset loader
# Each row is one passenger; "survived" (0/1) is the column we will predict
# NOTE(review): load_dataset presumably downloads/caches the file on first use - confirm network access
from seaborn import load_dataset

titanic = load_dataset("titanic")
titanic.head()  # Preview the first 5 rows

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
# Get summary statistics for numeric columns (count, mean, std, min, quartiles, max)
# A "count" lower than the number of rows signals missing values in that column
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
# Display column names, data types, and non-null counts
# Comparing each non-null count with the number of entries reveals missing values,
# and the dtypes show which columns are still non-numeric and need encoding
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [None]:
# Remove the "deck" feature - it has too many missing values (~77% null: only 203 of 891 rows are filled)
# Imputing a column that is mostly empty would mean inventing most of its values, so dropping it is safer
# Note: inplace=True mutates `titanic`; re-running this cell once "deck" is gone raises a KeyError
titanic.drop(columns="deck", inplace=True)
titanic.head()

In [None]:
# Handle missing age values by filling them with the column mean (mean imputation)
# This keeps every row and leaves the mean of "age" unchanged, but it shrinks the
# variance because all imputed passengers receive the same value
# NOTE(review): the mean is computed on the full dataset, i.e. before any train/test split
mean_age = titanic["age"].mean()
titanic["age"] = titanic["age"].fillna(value=mean_age)
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  embark_town  889 non-null    object  
 12  alive        891 non-null    object  
 13  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 79.4+ KB


In [None]:
# Drop rows that still contain missing values
# Only 2 rows are affected (889 of 891 have "embarked"/"embark_town"), so dropping
# them is simpler than imputing and loses almost no data
titanic.dropna(inplace=True)
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     889 non-null    int64   
 1   pclass       889 non-null    int64   
 2   sex          889 non-null    object  
 3   age          889 non-null    float64 
 4   sibsp        889 non-null    int64   
 5   parch        889 non-null    int64   
 6   fare         889 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        889 non-null    category
 9   who          889 non-null    object  
 10  adult_male   889 non-null    bool    
 11  embark_town  889 non-null    object  
 12  alive        889 non-null    object  
 13  alone        889 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 86.1+ KB


In [None]:
# Remove redundant columns that duplicate information already captured elsewhere:
#   "embarked" is the abbreviated form of "embark_town"; "class" is the text label of "pclass";
#   "who" and "adult_male" appear to be derived from sex and age (TODO confirm);
#   "alive" is the yes/no form of the target "survived", so keeping it would leak the answer
# Reducing redundancy simplifies the model and reduces multicollinearity
titanic.drop(columns=["embarked", "class", "who", "adult_male", "alive"], inplace=True)
titanic.info()

In [None]:
# Convert categorical "sex" to a numeric 0/1 indicator column
# ML algorithms require numeric input; 0 = female, 1 = male
# Note: map() produces NaN for any value that is not a key of the dictionary
titanic["is_male"] = titanic["sex"].map({"female": 0, "male": 1})
titanic.head()

  titanic["is_male"] = titanic["sex"].replace({"female": 0, "male": 1})


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,is_male
0,0,3,male,22.0,1,0,7.25,Southampton,False,1
1,1,1,female,38.0,1,0,71.2833,Cherbourg,False,0
2,1,3,female,26.0,0,0,7.925,Southampton,True,0
3,1,1,female,35.0,1,0,53.1,Southampton,False,0
4,0,3,male,35.0,0,0,8.05,Southampton,True,1


In [None]:
# Remove the original "sex" column - "is_male" now carries the same information in numeric form
# Note: this in-place drop raises a KeyError if the cell is run a second time
titanic.drop(columns=["sex"], inplace=True)
titanic.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,embark_town,alone,is_male
0,0,3,22.0,1,0,7.25,Southampton,False,1
1,1,1,38.0,1,0,71.2833,Cherbourg,False,0
2,1,3,26.0,0,0,7.925,Southampton,True,0
3,1,1,35.0,1,0,53.1,Southampton,False,0
4,0,3,35.0,0,0,8.05,Southampton,True,1


In [None]:
# Calculate the pairwise correlation matrix to identify linear relationships between features
# Values close to +1 or -1 indicate strong relationships; close to 0 means weak
# numeric_only=True leaves out non-numeric columns such as "embark_town"
titanic.corr(numeric_only=True)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,is_male
survived,1.0,-0.335549,-0.074673,-0.03404,0.083151,0.25529,-0.206207,-0.541585
pclass,-0.335549,1.0,-0.327954,0.081656,0.016824,-0.548193,0.138553,0.127741
age,-0.074673,-0.327954,1.0,-0.231875,-0.178232,0.088604,0.177712,0.089434
sibsp,-0.03404,0.081656,-0.231875,1.0,0.414542,0.160887,-0.584186,-0.116348
parch,0.083151,0.016824,-0.178232,0.414542,1.0,0.217532,-0.583112,-0.247508
fare,0.25529,-0.548193,0.088604,0.160887,0.217532,1.0,-0.274079,-0.179958
alone,-0.206207,0.138553,0.177712,-0.584186,-0.583112,-0.274079,1.0,0.306985
is_male,-0.541585,0.127741,0.089434,-0.116348,-0.247508,-0.179958,0.306985,1.0


#### What "low correlation" means:
- Values close to 0 (like -0.07, -0.04, 0.08) indicate almost no linear relationship
- These features don't help predict survival much on their own
- age, sibsp, and parch all have correlations with `survived` whose absolute value is below 0.1

#### What the sign tells you:
- Positive (+0.26): As fare goes up, survival goes up
- Negative (-0.54): As is_male goes up (male), survival goes down

#### Why drop them?
- They add complexity without adding predictive power
- Simpler models often generalize better
- Fewer features = faster training, less overfitting

Caveat: Low correlation doesn't always mean useless. Age might matter in combination with other features (e.g., young children with parents). But for a simple logistic regression, dropping low-correlation features is a reasonable choice.

#### What matters for dropping features:
- The absolute value (distance from 0)
- |-0.54| = 0.54 → strong relationship
- |-0.07| = 0.07 → weak relationship

In [None]:
# Drop features with low correlation to "survived" (absolute value < 0.1): age, sibsp, parch
# Also drop "embark_town" since it's categorical and not yet encoded
# Keeping only the more relevant features reduces noise and can help a simple model
# Remaining predictors: pclass, fare, alone, is_male
titanic.drop(columns=["age", "sibsp", "parch", "embark_town"], inplace=True)

In [None]:
# Verify the cleaned dataset - should have: survived, pclass, fare, alone, is_male
titanic.head()

Unnamed: 0,survived,pclass,fare,alone,is_male
0,0,3,7.25,False,1
1,1,1,71.2833,False,0
2,1,3,7.925,True,0
3,1,1,53.1,False,0
4,0,3,8.05,True,1


### Standardizing the Values
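It is useful to standardize features so they have a mean of 0 and a standard deviation of 1. This puts all features on a comparable scale, so the regularization penalty in logistic regression treats them evenly and the solver converges more reliably. Scaling matters even more for distance-based algorithms such as k-nearest neighbors.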

In [None]:
# Separate the predictors from the target
# X = every column except "survived", y = the label we want to predict
X = titanic.drop(columns=["survived"])
y = titanic["survived"]

# Hold out 33% of the rows for testing; the other 67% are used for training
# A fixed random_state makes the split reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Import StandardScaler to normalize features to mean=0, std=1
# This prevents features with large values from dominating the model
from sklearn.preprocessing import StandardScaler

In [None]:
# Build the scaler and learn the per-feature mean and variance from the training data only
# fit() returns the fitted scaler itself, so creation and fitting can be chained
scaler = StandardScaler().fit(X_train)
scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [None]:
# Show what the scaler learned from the training data
# The same statistics are reused to transform both the train and the test set
print(f"Mean per feature: {scaler.mean_}")
print(f"Variance per feature: {scaler.var_}")

array([6.87550314e-01, 2.64921948e+03, 2.41299343e-01, 2.24238401e-01])

In [None]:
# Scale both splits with the statistics learned from the training data
# Important: only transform() is called here - the scaler is never re-fitted on test data,
# so the test set is scaled exactly the way unseen data would be
X_train_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_test)
)

In [None]:
# Verify the scaled output - values should now be centered around 0
# Notice the output is a numpy array, not a DataFrame, so the column names are gone
# Columns keep the order of X_train: pclass, fare, alone, is_male
X_train_scaled

array([[-1.60124536,  0.37097199,  0.82798092,  0.71693438],
       [-0.39524411, -0.38407115,  0.82798092, -1.39482779],
       [-1.60124536,  0.95374775, -1.2077573 ,  0.71693438],
       ...,
       [-1.60124536, -0.13287517,  0.82798092, -1.39482779],
       [ 0.81075714,  0.03121472, -1.2077573 , -1.39482779],
       [ 0.81075714, -0.46850387,  0.82798092,  0.71693438]],
      shape=(595, 4))

The flow:
- `fit(X_train)` - Learn the statistics (mean, variance) from the training data
- `transform(X_train)` - Apply those statistics to scale the data

Think of it like this:
- fit()       →  "Measure the ruler" (learn mean=30, std=10)
- transform() →  "Use the ruler to measure" (apply the scaling formula)


In [None]:
# Build the logistic regression model and train it in one chained call
# fit() learns the relationship between the scaled features and survival;
# X is scaled, y stays as 0/1
log_reg = LogisticRegression().fit(X_train_scaled, y_train)
log_reg

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [None]:
# Score the model on the held-out test data it has never seen
# predict() yields a 0 or 1 for each test sample
y_pred = log_reg.predict(X_test_scaled)

# Accuracy = share of test samples whose prediction matches the true label
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7993197278911565

In [None]:
# Compute the overall survival rate - the starting point for a baseline accuracy
# A model that predicts "died" (0) for everyone is right for every non-survivor,
# so its accuracy is 1 - survival rate, i.e. about 62%
# Our model should beat this baseline to be considered useful
# NOTE(review): this uses the full dataset; 1 - y_test.mean() would be the baseline for the exact test split
titanic["survived"].mean()  # ~0.38 survived, so predicting 0 = 62% accuracy

np.float64(0.38245219347581555)

## Create a Pipeline for Preprocessing and Modeling
We will create a pipeline that combines preprocessing steps (like standardization) with a machine learning model (like logistic regression). This allows us to streamline the workflow and ensure that all steps are applied consistently during training and evaluation.

In [None]:
# Import the Pipeline object
from sklearn.pipeline import Pipeline

In [None]:
# Chain preprocessing and model into a single estimator
# Steps run in order: standardize the features, then fit/predict with logistic regression
# verbose=True prints the time spent in each step while fitting
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("log_reg", LogisticRegression()),
    ],
    verbose=True,
)

In [43]:
# Fit the pipeline on the raw (unscaled) training data
# The scaler step is fitted and applied first, then the logistic regression is trained on its output
pipeline.fit(X_train, y_train)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing log_reg, total=   0.0s


0,1,2
,steps,"[('scaler', ...), ('log_reg', ...)]"
,transform_input,
,memory,
,verbose,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [None]:
# View all the steps as a mapping from step name to estimator
pipeline.named_steps

{'scaler': StandardScaler(), 'log_reg': LogisticRegression()}

In [47]:
# View the per-feature means learned by the pipeline's "scaler" step
# Feature order follows X_train: pclass, fare, alone, is_male
pipeline.named_steps["scaler"].mean_

array([ 2.32773109, 32.76836067,  0.59327731,  0.6605042 ])

In [48]:
# Predict survival (0/1) for the raw X_test dataset
# The pipeline scales X_test with the statistics learned during fit before calling the model
pipeline.predict(X_test)

array([0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0])

In [49]:
# Accuracy of the pipeline on the test set - identical to the manual scale-then-predict result above
pipeline.score(X_test, y_test)

0.7993197278911565