# Asynchronous Lecture 02

## Data Handling Challenges in Machine Learning

### CIS432 Machine Learning for Business Analytics


In [None]:
# Load dataset: a small CSV hosted on Google Drive, read directly from its download URL
import pandas as pd
pipeline_data_url = 'https://drive.google.com/uc?id=1GBKVDRkYP8CwqSHSXGmaPRJ0fD1b_WpO&export=download'
df_pipeline = pd.read_csv(pipeline_data_url)
df_pipeline  # show the whole frame (10 rows), including the missing values handled below

Unnamed: 0,age,income,gender,region,target
0,25.0,50000.0,male,north,0
1,30.0,60000.0,female,south,1
2,35.0,,female,east,1
3,,45000.0,male,west,0
4,45.0,70000.0,,north,1
5,50.0,80000.0,female,,0
6,29.0,,male,east,1
7,,40000.0,male,south,0
8,33.0,75000.0,female,west,1
9,40.0,55000.0,,north,0


In [None]:
# Positional hold-out split: rows 0-4 are used for training, every later row for testing
df_train = df_pipeline.iloc[:5]
df_test = df_pipeline.iloc[5:]

# Transforming values manually

In [None]:
# Compute the mean and standard deviation of each numeric column.
# Note: `target` is numeric as well, so it shows up in this summary alongside the features.
df_train.select_dtypes(include='number').agg(['mean','std'])

Unnamed: 0,age,income,target
mean,33.75,56250.0,0.6
std,8.539126,11086.778913,0.547723


In [None]:
# Compute the mode (most frequent value) for each non-numeric column.
# `.mode()` returns one row per tied value; `.iloc[0]` keeps only the first row, i.e. a single
# value per column even when several categories are equally frequent.
df_train.select_dtypes(exclude='number').mode().iloc[0]

Unnamed: 0,0
gender,female
region,north


In [None]:
# Manual preprocessing of the training rows: impute -> standardize -> one-hot encode.
# Work on a copy so that df_train itself is left untouched.
df = df_train.copy()

numeric_features = ['age', 'income']
categorical_features = ['gender', 'region']

# Step 1: Impute missing values
# Numeric columns: fill the gaps with the column mean
for col in numeric_features:
    df[col] = df[col].fillna(df[col].mean())

# Categorical columns: fill the gaps with the column mode (the first one if several values tie)
for col in categorical_features:
    df[col] = df[col].fillna(df[col].mode()[0])

# Step 2: Scale numeric features (Z-score normalization).
# The mean and standard deviation are taken from the already-imputed columns.
numeric_block = df[numeric_features]
df[numeric_features] = (numeric_block - numeric_block.mean()) / numeric_block.std()

# Step 3: One-hot encode categorical features
df_encoded_manual = pd.get_dummies(df, columns=categorical_features, drop_first=False, dtype=float)

# Resulting dataframe
print(df_encoded_manual)


        age    income  target  gender_female  gender_male  region_east  \
0 -1.183216 -0.650945       0            0.0          1.0          0.0   
1 -0.507093  0.390567       1            1.0          0.0          0.0   
2  0.169031  0.000000       1            1.0          0.0          1.0   
3  0.000000 -1.171700       0            0.0          1.0          0.0   
4  1.521278  1.432078       1            1.0          0.0          0.0   

   region_north  region_south  region_west  
0           1.0           0.0          0.0  
1           0.0           1.0          0.0  
2           0.0           0.0          0.0  
3           0.0           0.0          1.0  
4           1.0           0.0          0.0  


* How to transform `df_test` or a new observation?
* What if we have 10 steps in the process?
* What if we want to try slightly different preprocessing?

<br><br>

# Transforming values using pipelines

In [None]:
# Separate features from the label by position: the last column is the target,
# everything before it is a feature.
X_train, Y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]
X_test, Y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]

In [None]:
# Define pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# Example pipeline
# Numeric columns: fill missing values with the mean learned at fit time, then standardize.
numeric_features = ['age', 'income']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Categorical columns: fill missing values with the most frequent category, then one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as all zeros
# instead of raising an error at transform time.
categorical_features = ['gender', 'region']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each group of columns through its own sub-pipeline.
# sparse_threshold=0 forces a dense NumPy array as output. With the default (0.3) the result
# silently becomes a SciPy sparse matrix once the one-hot columns make the output sparse enough,
# which would break the pd.DataFrame(data=...) calls that consume the transformed matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)],
    sparse_threshold=0)

In [None]:
# Apply transformation: fit the preprocessor on the training features and transform them in one call
X__train_transformed = preprocessor.fit_transform(X_train)

In [None]:
X__train_transformed  # raw output is a plain array, so the columns carry no names

array([[-1.32287566, -0.72777814,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.56694671,  0.43666688,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [ 0.18898224,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.        , -1.31000065,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [ 1.70084013,  1.6011119 ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ]])

In [None]:
# Present output as a pandas DataFrame (columns are labelled 0..7 because the array has no feature names)
pd.DataFrame(X__train_transformed)

Unnamed: 0,0,1,2,3,4,5,6,7
0,-1.322876,-0.727778,0.0,1.0,0.0,1.0,0.0,0.0
1,-0.566947,0.436667,1.0,0.0,0.0,0.0,1.0,0.0
2,0.188982,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.0,-1.310001,0.0,1.0,0.0,0.0,0.0,1.0
4,1.70084,1.601112,1.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# Extract feature names
def get_feature_names(preprocessor, categorical_features, numeric_features=None):
    """Return the output column names of a fitted preprocessor, numeric columns first.

    Parameters
    ----------
    preprocessor : fitted ColumnTransformer
        Must expose a transformer named 'cat' in ``named_transformers_``. That transformer is
        either an encoder or a pipeline whose last step is the encoder.
    categorical_features : list of str
        Input categorical column names, passed to the encoder's ``get_feature_names_out``.
    numeric_features : sequence of str, optional
        Names of the numeric columns. They pass through unchanged because imputing and
        scaling do not add or rename columns. When omitted, the columns registered under
        the 'num' transformer in ``preprocessor.transformers_`` are used, so the function
        does not depend on a notebook-global variable.

    Returns
    -------
    list of str
        Numeric names followed by the expanded one-hot names.
    """
    if numeric_features is None:
        # Recover the numeric columns from the fitted preprocessor itself;
        # fall back to "no numeric columns" when there is no 'num' transformer.
        numeric_features = next(
            (cols for name, _, cols in preprocessor.transformers_ if name == 'num'),
            [])
    # A single column may be given as a bare string; list('age') would split it into characters.
    if isinstance(numeric_features, str):
        numeric_names = [numeric_features]
    else:
        numeric_names = list(numeric_features)

    cat_transformer = preprocessor.named_transformers_['cat']
    # A pipeline exposes `.steps` as (name, estimator) pairs; the encoder is its last step.
    steps = getattr(cat_transformer, 'steps', None)
    if steps:
        cat_transformer = steps[-1][1]  # Get OneHotEncoder

    cat_names = list(cat_transformer.get_feature_names_out(categorical_features))
    return numeric_names + cat_names

# The preprocessor was already fitted above; here we only look up its output column names
updated_feature_names = get_feature_names(preprocessor, categorical_features)

# Label the transformed training matrix with those names
df_encoded_pipeline = pd.DataFrame(data=X__train_transformed, columns=updated_feature_names)
df_encoded_pipeline

Unnamed: 0,age,income,gender_female,gender_male,region_east,region_north,region_south,region_west
0,-1.322876,-0.727778,0.0,1.0,0.0,1.0,0.0,0.0
1,-0.566947,0.436667,1.0,0.0,0.0,0.0,1.0,0.0
2,0.188982,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.0,-1.310001,0.0,1.0,0.0,0.0,0.0,1.0
4,1.70084,1.601112,1.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# How to transform the test set?
# IMPORTANT: call transform(), NOT fit_transform(), so the test rows are processed with the
# statistics already learned from the training set instead of being re-estimated on the test data.
X_test_transformed = preprocessor.transform(X_test)
df_encoded_test = pd.DataFrame(columns=updated_feature_names, data=X_test_transformed)
df_encoded_test

Unnamed: 0,age,income,gender_female,gender_male,region_east,region_north,region_south,region_west
0,2.456769,2.765557,1.0,0.0,0.0,1.0,0.0,0.0
1,-0.718132,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0.0,-1.892223,0.0,1.0,0.0,0.0,1.0,0.0
3,-0.113389,2.183334,1.0,0.0,0.0,0.0,0.0,1.0
4,0.944911,-0.145556,1.0,0.0,0.0,1.0,0.0,0.0


#### Comments
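* Internally, the pipeline estimated the mean and standard deviation of each numeric feature (and the most frequent value of each categorical feature) from the training data and used them to apply the transformation. The scaled values differ slightly from the manual computation because `StandardScaler` uses the population standard deviation (`ddof=0`), whereas pandas' `.std()` uses the sample standard deviation (`ddof=1`); on realistically sized datasets the difference is negligible.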
* We can access these values

In [None]:
# Look up the two fitted sub-pipelines once, then read the learned statistics off their steps
fitted_num_pipeline = preprocessor.named_transformers_['num']
fitted_cat_pipeline = preprocessor.named_transformers_['cat']

# Fill values learned by the imputers: per-column means (numeric) and most frequent values (categorical)
numeric_imputer_values = fitted_num_pipeline.named_steps['imputer'].statistics_
categorical_imputer_values = fitted_cat_pipeline.named_steps['imputer'].statistics_

print("Numeric Imputer Estimated Values:", numeric_imputer_values)
print("Categorical Imputer Estimated Values:", categorical_imputer_values)

# The fitted scaler runs after the imputer, so its statistics describe the already-imputed columns
scaler = fitted_num_pipeline.named_steps['scaler']
scaler_means = scaler.mean_     # per-column means used for centering
scaler_scales = scaler.scale_   # per-column divisors (standard deviations) used for scaling

print("Scaler Estimated Means:", scaler_means)
print("Scaler Estimated Scales (Standard Deviations):", scaler_scales)


Numeric Imputer Estimated Values: [3.375e+01 5.625e+04]
Categorical Imputer Estimated Values: ['female' 'north']
Scaler Estimated Means: [3.375e+01 5.625e+04]
Scaler Estimated Scales (Standard Deviations): [6.61437828e+00 8.58778202e+03]
