# Lecture 4: Data Preprocessing

MTU Spring 2024

Instructor: Amna Mazen

### Announcements
- Github repo for lecture code and dataset: https://github.com/MazenMTULab/ML_COURSE_RESOURCES
- Quiz 1 on Wednesday 22nd January.

### Imports

In [31]:
import sys
import time

import matplotlib.pyplot as plt

%matplotlib inline
import numpy as np
import pandas as pd
from IPython.display import HTML

sys.path.append("code/.")


from IPython.display import display
#from plotting_functions import *


# Preprocessing and pipeline
from sklearn.impute import SimpleImputer

#from utils import *
from io import StringIO
import sys


pd.set_option("display.max_colwidth", 200)

## Motivation
- Are we ready to do machine learning on real-world datasets?
    - Very often real-world datasets need preprocessing before we use them to build ML models.

### Common preprocessing techniques

Some commonly performed feature transformations include:  
- Imputation: Tackling missing values
- Scaling: Scaling of numeric features
- One-hot encoding: Tackling categorical variables      
    

We could spend a whole lecture on each of them! In this lecture our goal is to get familiar with them so that we can use them to build ML pipelines.

## Handling missing values

In [32]:
import pandas as pd
from io import StringIO
import sys

# Small in-memory CSV with two missing entries:
#   * row 1, column C -> empty field between two commas
#   * row 2, column D -> empty field at the end of the line
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# No manual pre-processing of the text is needed: pd.read_csv already parses
# empty fields as NaN. (Replacing ',,' with ',NaN,' would not even touch the
# trailing empty field in the last row, yet that one is read as NaN as well.)
df = pd.read_csv(StringIO(csv_data))
print(df)


      A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   NaN  8.0
2  10.0  11.0  12.0  NaN


In [34]:
# Count the missing entries in every column: isnull() flags each NaN cell
# and sum() adds the flags up column by column.
nan_summary = df.isnull().sum()

# Print the heading and the per-column counts on separate lines.
print("Summary of NaN values per column:", nan_summary, sep="\n")


Summary of NaN values per column:
A    0
B    0
C    1
D    1
dtype: int64


In [9]:
# Drop every row that has at least one missing value (df itself is unchanged;
# dropna returns a new frame).
df.dropna(axis='index')


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [10]:
# Drop every column that has at least one missing value (df itself is
# unchanged; dropna returns a new frame).
df.dropna(axis='columns')


Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [11]:
# Drop a row only if *all* of its values are NaN. No row in this frame is
# completely empty, so every row is kept.
df.dropna(axis='index', how='all')


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


### Imputing missing values

In [25]:
# Mean imputation: every NaN is replaced by the mean of its column.
from sklearn.impute import SimpleImputer
import numpy as np

imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# Learn the column means and fill the gaps in a single step.
imputed_data = imr.fit_transform(df.values)
imputed_data


array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [15]:
# Constant imputation: every NaN is replaced by the fixed value 100.
imr = SimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value=100,
)
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[  1.,   2.,   3.,   4.],
       [  5.,   6., 100.,   8.],
       [ 10.,  11.,  12., 100.]])

## Handling categorical data

In [26]:
# Toy dataset with one nominal feature (color), one ordinal feature (size),
# one numeric feature (price) and a class label.
df = pd.DataFrame(
    {
        'color': ['green', 'red', 'blue'],
        'size': ['M', 'L', 'XL'],
        'price': [10.1, 13.5, 15.3],
        'classlabel': ['class2', 'class1', 'class2'],
    }
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [17]:
# Mapping ordinal features
# `size` has a natural order (M < L < XL), so each label is mapped to an
# integer rank that preserves that order.
size_mapping = {'XL': 3,
                'L': 2,
                'M': 1}

# Look every value up in size_mapping, but leave values that are not keys
# untouched. This keeps the cell idempotent: running it a second time, when
# the column already holds 1/2/3, is a no-op instead of silently turning the
# whole column into NaN (which a plain .map(size_mapping) would do).
# Note: a label missing from size_mapping therefore stays as it is rather
# than becoming NaN.
df['size'] = df['size'].map(lambda size: size_mapping.get(size, size))
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [18]:
# Encoding class labels – label encoding (I)

## Build a lookup dict that converts each class label (string) to an integer.
## np.unique returns the distinct labels in sorted order; each label is
## paired with its position in that sorted sequence.
unique_labels = np.unique(df['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
class_mapping

{'class1': 0, 'class2': 1}

In [None]:
## mapping
# Show the integer-encoded class labels on a copy of the frame instead of
# overwriting df['classlabel']. Keeping the original string labels in df
# means this cell can be re-run safely, and the following cells still see
# 'class1' / 'class2' as their recorded outputs show.
df.assign(classlabel=df['classlabel'].map(class_mapping))

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [19]:
# Encoding class labels – label encoding (II)
# Same idea as the manual mapping dict, but done by scikit-learn.
from sklearn.preprocessing import LabelEncoder

## Label encoding with sklearn's LabelEncoder
class_label = LabelEncoder()
# Fit on the label column and get the integer code of every row.
y = class_label.fit(df['classlabel'].values).transform(df['classlabel'].values)

print("Encoded class label: ", y)
print("----")
# The encoder returns a new array; df itself is not modified.
print(df)

Encoded class label:  [1 0 1]
----
   color  size  price classlabel
0  green     1   10.1     class2
1    red     2   13.5     class1
2   blue     3   15.3     class2


In [30]:
# Label-encode the `size` column of the feature array.
# NOTE: `size` is an ordinal feature (see size_mapping), not a nominal one.
# LabelEncoder numbers the values by their sorted order, which need not match
# the M < L < XL ranking, so prefer the explicit size_mapping in real use.
X = df[['color', 'size', 'price']].values

# Dedicated encoder for the size column (column 1 of X).
size_le = LabelEncoder()
X[:, 1] = size_le.fit_transform(X[:, 1])

# Separate, still unfitted encoder for the color column; the next cell fits
# it on column 0 of X.
color_le = LabelEncoder()
X

array([['green', 1, 10.1],
       ['red', 0, 13.5],
       ['blue', 2, 15.3]], dtype=object)

In [29]:
# Label-encode the nominal `color` feature (column 0 of X).
# Caveat: the resulting integers are arbitrary codes; a model may wrongly
# read them as an ordering of the colors.
X[:, 0] = color_le.fit(X[:, 0]).transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 0, 13.5],
       [0, 2, 15.3]], dtype=object)

In [None]:
## Performing one-hot encoding on nominal features
# Each distinct color gets its own 0/1 indicator column, so no artificial
# order is imposed on the colors.
from sklearn.preprocessing import OneHotEncoder

X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()
# X[:, [0]] keeps the color column two-dimensional (n_samples x 1), which is
# the input shape the encoder expects; toarray() turns the sparse result into
# a dense array for display.
color_ohe.fit_transform(X[:, [0]]).toarray()



array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [None]:
## We can select which columns to encode in a multi-feature array
# ColumnTransformer applies a different treatment to different columns and
# concatenates the results.
from sklearn.compose import ColumnTransformer

X = df[['color', 'size', 'price']].values
c_transf = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), [0]),       # color -> indicator columns
        ('nothing', 'passthrough', [1, 2]),     # size, price -> unchanged
    ]
)
c_transf.fit_transform(X).astype(float)

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])