# Chapter 7 - Preprocess Data

In [2]:
# Basic Libraries

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()

import pandas as pd
import numpy as np

In [3]:
# Specific Libraries

from sklearn.experimental import (
    enable_iterative_imputer,
)
from sklearn import (
    ensemble,
    impute,
    model_selection,    
    preprocessing,
    tree,
)

### New Dataset: X2

In [5]:
# Toy frame whose two columns live on very different scales: "a" runs 0..4,
# while "b" spans -100..1000.
X2 = pd.DataFrame(
    dict(
        a=range(5),
        b=[-100, -50, 0, 200, 1000],
    )
)

X2

Unnamed: 0,a,b
0,0,-100
1,1,-50
2,2,0
3,3,200
4,4,1000


### Standardize

Some algorithms, such as SVM, perform better when the data is standardized. Each column should have a mean value of 0 and standard deviation of 1.

In [7]:
from sklearn import preprocessing   

# StandardScaler centers each column on its mean and divides by its standard
# deviation, so every feature ends up with mean 0 and unit variance.
std = preprocessing.StandardScaler()
# Learn the per-column mean/scale from X2 and return the standardized array.
std.fit_transform(X2)

array([[-1.41421356, -0.75995002],
       [-0.70710678, -0.63737744],
       [ 0.        , -0.51480485],
       [ 0.70710678, -0.02451452],
       [ 1.41421356,  1.93664683]])

In [8]:
# Per-column standard deviation of X2 learned during fit (the divisor the scaler applies)
std.scale_

array([  1.41421356, 407.92156109])

In [9]:
# Per-column mean of X2 learned during fit (the value the scaler subtracts)
std.mean_

array([  2., 210.])

In [10]:
# Per-column variance of X2 learned during fit
std.var_

array([2.000e+00, 1.664e+05])

In [11]:
# Sanity check: squaring 'std.scale_' reproduces 'std.var_' (scale_ is its square root).
std.scale_**2

array([2.000e+00, 1.664e+05])

In [12]:
X2

Unnamed: 0,a,b
0,0,-100
1,1,-50
2,2,0
3,3,200
4,4,1000


In [13]:
# Pandas version.
# Note: DataFrame.std() defaults to the sample standard deviation (ddof=1),
# whereas StandardScaler divides by the population value (ddof=0), so these
# numbers are slightly smaller in magnitude than the StandardScaler output.
# Use X2.std(ddof=0) to reproduce StandardScaler exactly.
X_std = (X2 - X2.mean()) / X2.std()
X_std

Unnamed: 0,a,b
0,-1.264911,-0.67972
1,-0.632456,-0.570088
2,0.0,-0.460455
3,0.632456,-0.021926
4,1.264911,1.73219


In [14]:
X_std.mean()

a    4.440892e-17
b    0.000000e+00
dtype: float64

In [15]:
X_std.std()

a    1.0
b    1.0
dtype: float64

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X2)  # NOTE(review): X_scaled is not used in the visible cells

# Column means of the original (unscaled) DataFrame
print("Original Means:")
print(X2.mean())
print()

# Column standard deviations of the original DataFrame.
# pandas uses ddof=1 (sample std) by default, which is why these values are
# larger than the ones reported by StandardScaler's scale_ attribute.
print("Original Standard Deviations:")
print(X2.std())

Original Means:
a      2.0
b    210.0
dtype: float64

Original Standard Deviations:
a      1.581139
b    456.070170
dtype: float64


In [17]:
X2.describe()

Unnamed: 0,a,b
count,5.0,5.0
mean,2.0,210.0
std,1.581139,456.07017
min,0.0,-100.0
25%,1.0,-50.0
50%,2.0,0.0
75%,3.0,200.0
max,4.0,1000.0


In [18]:
X2.count()

a    5
b    5
dtype: int64

### Scale to Range

Scaling to range is translating data so it is between 0 and 1, inclusive. Having the data bounded may be useful. However, if
you have outliers, you probably want to be careful using this:

In [21]:
from sklearn import preprocessing

# MinMaxScaler rescales every column linearly so that its minimum maps to 0
# and its maximum maps to 1 (the default feature range).
mms = preprocessing.MinMaxScaler()
mms.fit_transform(X2)  # Learn the per-column min/max from X2, then rescale X2 with them

array([[0.        , 0.        ],
       [0.25      , 0.04545455],
       [0.5       , 0.09090909],
       [0.75      , 0.27272727],
       [1.        , 1.        ]])

In [22]:
# Pandas version: subtract each column's minimum, then divide by its range.

X2.sub(X2.min()).div(X2.max().sub(X2.min()))

Unnamed: 0,a,b
0,0.0,0.0
1,0.25,0.045455
2,0.5,0.090909
3,0.75,0.272727
4,1.0,1.0


### Dummy Variables

We can use pandas to create dummy variables from categorical data. This is also referred to as one-hot encoding, or indicator
encoding. Dummy variables are especially useful if the data is nominal (unordered). The `get_dummies` function in pandas creates
multiple columns for a categorical column, each with a 1 or 0 if the original column had that value:

In [25]:
# Two-row categorical frame: a musician's name and the instrument they play.
X_cat = pd.DataFrame(
    dict(
        name=["George", "Paul"],
        inst=["Bass", "Guitar"],
    )
)

X_cat

Unnamed: 0,name,inst
0,George,Bass
1,Paul,Guitar


Here is the pandas version. Note the `drop_first` option can be
used to eliminate a column (one of the dummy columns is a
linear combination of the other columns):

In [27]:
# drop_first=True drops the first dummy level of each column (here
# name_George and inst_Bass); astype(int) shows the indicators as 0/1.
pd.get_dummies(X_cat, drop_first=True).astype(int)

Unnamed: 0,name_Paul,inst_Guitar
0,0,0
1,1,1


In [28]:
import janitor as jn

# "A" contains a missing value; "names" packs several names into one
# comma-separated string per row.
X_cat2 = pd.DataFrame(
    dict(
        A=[1, None, 3],
        names=["Fred,George", "George", "John,Paul"],
    )
)

# Split each 'names' string on commas and add one 0/1 indicator column per
# distinct name; the original 'names' column is kept alongside them.
jn.expand_column(X_cat2, "names", sep=",")

Unnamed: 0,A,names,Fred,George,John,Paul
0,1.0,"Fred,George",1,1,0,0
1,,George,0,1,0,0
2,3.0,"John,Paul",0,0,1,1


### Label Encoder

An alternative to dummy variable encoding is label encoding.
This will take categorical data and assign each value a number.
It is useful for high cardinality data. This encoder imposes ordinality, which may or may not be desired. It can take up less
space than one-hot encoding, and some (tree) algorithms can
deal with this encoding.
    
The label encoder can only deal with one column at a time:

In [31]:
from sklearn import preprocessing  

lab = preprocessing.LabelEncoder() # Maps each distinct label to an integer code
lab.fit_transform(X_cat.name)      # Learn the labels in the 'name' column and return their integer codes

array([0, 1])

In [32]:
# See original labels
lab.classes_

array(['George', 'Paul'], dtype=object)

In [33]:
X_cat

Unnamed: 0,name,inst
0,George,Bass
1,Paul,Guitar


In [34]:
# Map integer codes back to the original labels
lab.inverse_transform([1, 1, 0])

array(['Paul', 'Paul', 'George'], dtype=object)

In [35]:
# Pandas equivalent: turn 'name' into an ordered categorical and take its
# integer codes; adding 1 makes the labels start at 1 instead of 0.

(
    X_cat.name
    .astype("category")
    .cat.as_ordered()
    .cat.codes
    + 1
)

0    1
1    2
dtype: int8

### Frequency Encoding

Another option for handling high cardinality categorical data is
to *frequency encode* it. This means replacing the name of the
category with the count it had in the training data. We will use
pandas to do this. First, we will use the pandas `.value_counts`
method to make a mapping (a pandas series that maps strings
to counts). With the mapping we can use the `.map` method to
do the encoding:

In [38]:
# Series indexed by name whose values are the number of times each name occurs
mapping = X_cat.name.value_counts()
# Replace every name with its count looked up in the mapping
X_cat.name.map(mapping)

0    1
1    1
Name: name, dtype: int64

Each name shows up one time. (Make sure you store the training mapping so you can encode future data with the same mapping.)

### Pulling Categories from Strings

One way to increase the accuracy of the Titanic model is to pull
out titles from the names. A quick hack to find the most common
triples (three-character sequences) is to use the `Counter` class:

In [42]:
# Titanic DataFrame, read straight from a CSV file hosted on GitHub
# (running this cell requires network access).

url = ("https://raw.githubusercontent.com/joanby/python-ml-course/refs/heads/master/datasets/titanic/titanic3.csv")
df = pd.read_csv(url)

In [43]:
from collections import Counter

c = Counter()                       # Shared tally of every 3-character sequence seen by `triples`

def triples(val):
    """Count every full 3-character window of ``val`` in the global Counter ``c``.

    Only complete triples are tallied: the loop stops two characters before
    the end of the string, so the trailing 2- and 1-character fragments are
    not counted. Strings shorter than three characters contribute nothing.
    Returns None; the result is the side effect on ``c``.
    """
    for i in range(len(val) - 2):   # Last valid start index is len(val) - 3
        c[val[i : i + 3]] += 1      # Increment the count for this 3-character slice

# Feed every passenger name through `triples` (which updates the Counter `c`),
# then list the ten most frequent sequences together with their counts.
for passenger_name in df.name:
    triples(passenger_name)
c.most_common(10)

[(', M', 1282),
 (' Mr', 954),
 ('r. ', 830),
 ('Mr.', 757),
 ('s. ', 460),
 ('n, ', 320),
 (' Mi', 283),
 ('iss', 261),
 ('ss.', 261),
 ('Mis', 260)]

Another option is to use a regular expression to pull out the capital letter followed by lowercase letters and a period:

In [45]:
# Pull the title out of each name: the first run of letters that is
# immediately followed by a period (e.g. "Miss", "Mr").
df.name.str.extract(
    r"([A-Za-z]+)\.", expand=False  # expand=False returns a Series rather than a DataFrame
).head()                            # Show the first five extracted titles

0      Miss
1    Master
2      Miss
3        Mr
4       Mrs
Name: name, dtype: object

In [46]:
# Same title extraction as above, this time tallied per title.
df.name.str.extract(
    r"([A-Za-z]+)\.", expand=False  # One or more letters immediately followed by a period
).value_counts()                    # Frequency of each extracted title, most common first

name
Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Mlle          2
Ms            2
Major         2
Capt          1
Sir           1
Dona          1
Jonkheer      1
Countess      1
Don           1
Mme           1
Lady          1
Name: count, dtype: int64

### Other Categorical Encoding

Hash encoder is useful if you don’t know how many categories you have ahead of time or if you are using a bag of words to represent
text. This will hash the categorical columns into n_components. If you are using online learning (models that can be updated), this can be very useful:

In [49]:
X_cat

Unnamed: 0,name,inst
0,George,Bass
1,Paul,Guitar


In [50]:
import category_encoders as ce

# HashingEncoder hashes the categorical columns into a fixed number of output
# columns (col_0 .. col_7 in the result), so the set of categories does not
# need to be known ahead of time.
he = ce.HashingEncoder(verbose=1)  # verbose=1 turns on the encoder's verbose output
he.fit_transform(X_cat)            # Fit on 'X_cat' and return its hashed encoding

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,0,0,0,1,0,1,0,0
1,0,2,0,0,0,0,0,0


In [51]:
# Sample frame with an ordinal 'size' column; note that "xxl" is a size the
# ordinal mapping used later does not know about.
size_df = pd.DataFrame(
    dict(
        name=["Fred", "John", "Matt"],
        size=["small", "med", "xxl"],
    )
)

size_df

Unnamed: 0,name,size
0,Fred,small
1,John,med
2,Matt,xxl


In [52]:
# Encode 'size' with an explicit, ordered mapping rather than letting the
# encoder assign integers on its own.
ore = ce.OrdinalEncoder(
    mapping=[
        {
            "col": "size",                  # Column to encode
            "mapping": {                    # Category -> integer code
                "small": 1,
                "med": 2,
                "lg": 3,                    # Categories missing from the mapping (e.g. "xxl") are encoded as -1
            },
        }
    ]
)

ore.fit_transform(size_df)                  # Only 'size' is replaced by its codes; 'name' is left untouched

Unnamed: 0,name,size
0,Fred,1.0
1,John,2.0
2,Matt,-1.0


To target-encode the title extracted from the Titanic names (replacing each title with a blend of the posterior probability of survival given that title and the prior, overall probability of survival), use the following code:

In [54]:
def get_title(df):
    """Return the honorific (e.g. "Mr", "Miss") found in each ``df.name`` entry.

    The title is the first run of ASCII letters directly followed by a
    period; names without such a run yield NaN. The result is a Series
    aligned with ``df``.
    """
    title_pattern = r"([A-Za-z]+)\."
    names = df.name
    return names.str.extract(title_pattern, expand=False)

te = ce.TargetEncoder(cols="Title")       # Target-encode only the 'Title' column

te.fit_transform(
    df.assign(Title=get_title),           # Copy of df with a 'Title' column computed by get_title; df itself is not modified
    df.survived                           # Target used to compute the per-title encoding
)["Title"].head()                         # First five encoded 'Title' values

0    0.676923
1    0.506139
2    0.676923
3    0.162483
4    0.786802
Name: Title, dtype: float64

* Function Definition (`get_title`): Extracts titles (e.g., "Mr", "Mrs") from names in the 'name' column using a regular expression.

* Target Encoding: The `TargetEncoder` calculates the average survival rate per title category and encodes the 'Title' column based on these averages.

* `fit_transform` Execution: Temporarily adds the 'Title' column to df and applies the target encoding to produce mean survival probabilities for each title.

In [56]:
from pandas.api.types import is_numeric_dtype  # Import a function to check if a column is numeric

def fix_missing(df, col, name, na_dict):
    """Impute missing values of one numeric column in place and flag them.

    Parameters
    ----------
    df : DataFrame that is modified in place.
    col : Series holding the column's current values.
    name : Column label under which the filled values are stored in ``df``.
    na_dict : dict mapping column names to the fill value to use. A column
        already present here is always processed, using the stored value
        instead of the median.

    Returns
    -------
    dict
        The same ``na_dict`` object, updated with the fill value used.

    Non-numeric columns, and numeric columns that have no missing values and
    no entry in ``na_dict``, are left untouched.
    """
    if not is_numeric_dtype(col):
        return na_dict

    missing_mask = pd.isnull(col)
    if not (missing_mask.sum() or name in na_dict):
        return na_dict

    # Record where values were missing before they are overwritten.
    df[name + "_na"] = missing_mask
    # Prefer the stored fill value; fall back to the column median.
    fill_value = na_dict[name] if name in na_dict else col.median()
    df[name] = col.fillna(fill_value)
    na_dict[name] = fill_value
    return na_dict

data = pd.DataFrame({"A": [0, None, 5, 100]})         # One missing value in column 'A'

fix_missing(data, data.A, "A", {})                    # Modifies 'data' in place: fills the NaN with the median and adds 'A_na'

data                                                  # Display the updated DataFrame

Unnamed: 0,A,A_na
0,0.0,False
1,5.0,True
2,5.0,False
3,100.0,False


In [57]:
# Pandas version: flag the missing entries of 'A' and fill them with the
# column median in a single assign call.

data = pd.DataFrame({"A": [0, None, 5, 100]})
data = data.assign(
    A_na=data["A"].isna(),                      # True where 'A' was missing
    A=data["A"].fillna(data["A"].median()),     # Median is computed before any value is filled
)

data

Unnamed: 0,A,A_na
0,0.0,False
1,5.0,True
2,5.0,False
3,100.0,False


### Manual Feature Engineering

In [59]:
# Build per-cabin aggregate features from 'df'.
# NOTE(review): the numeric columns appear to include 'survived', so the
# aggregates would contain the target itself. Confirm this is intended before
# using these features to train a model.
agg = (
    df.groupby("cabin")     # Group the original DataFrame 'df' by the 'cabin' column
    # Compute min, max, mean and sum of every numeric column within each cabin
    .agg({col: ["min", "max", "mean", "sum"] for col in df.select_dtypes(include="number").columns})
    .reset_index()          # Turn the 'cabin' group key back into a regular column
)

# Flatten the two-level (column, statistic) labels into single names such as
# 'fare_mean'; strip("_") turns the ('cabin', '') pair into plain 'cabin'.
agg.columns = [
    "_".join(c).strip("_")
    for c in agg.columns.values
]

# Attach the per-cabin aggregates to each passenger row.
# NOTE(review): merge defaults to an inner join, so rows whose cabin has no
# match in 'agg' (presumably passengers with a missing cabin) are dropped.
# Confirm this is intended, or pass how="left" to keep every passenger.
agg_df = df.merge(agg, on="cabin")

agg_df.head() # Display this new Dataframe

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,...,parch_mean,parch_sum,fare_min,fare_max,fare_mean,fare_sum,body_min,body_max,body_mean,body_sum
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,...,0.5,1,211.3375,211.3375,211.3375,422.675,,,,0.0
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,...,2.0,8,151.55,151.55,151.55,606.2,135.0,135.0,135.0,135.0
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,...,2.0,8,151.55,151.55,151.55,606.2,135.0,135.0,135.0,135.0
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,...,2.0,8,151.55,151.55,151.55,606.2,135.0,135.0,135.0,135.0
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,...,2.0,8,151.55,151.55,151.55,606.2,135.0,135.0,135.0,135.0
