In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.datasets import load_iris

In [None]:
# Load the Iris dataset. `data.data` holds the feature matrix only; the class
# labels are stored separately in `data.target`.
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
print(df.head())

# `df` contains no target column, so slicing off its last column would drop a
# real feature from X and use that feature as the label. Keep every feature in
# X and take the labels from `data.target` instead.
X = df.copy()
y = pd.Series(data.target, name="target")

# Data Scaling

`StandardScaler` is used to scale numerical data so that each feature has a mean of 0 and a standard deviation of 1.

 This is important for algorithms that use distance-based metrics, such as k-Nearest Neighbors (KNN) and Support Vector Machines (SVM).

In [None]:
# Standardize the feature columns of X and wrap the resulting array back
# into a DataFrame that keeps the original column names.
scaler = StandardScaler()
df_transformed = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
print(df_transformed.head())

In [None]:
# MinMaxScaler - rescales the data into a fixed range (0 to 1 here).
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
df_transformed = pd.DataFrame(min_max_scaler.fit_transform(X), columns=X.columns)
print(df_transformed.head())

# Data Encoding

In [None]:
# Toy sales data: five base rows repeated five times.
df = pd.DataFrame({
    'sales': [1000, 2000, 22000, 50000, 40000] * 5,
    'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'] * 5,
    'size': ['small', 'medium', 'large', 'large', 'medium'] * 5,
})
print(df.head())

# Option 1: one-hot encode the 'city' column directly with pandas.
pd_one_hot_encoded = pd.get_dummies(df, columns=["city"])

# Option 2: scikit-learn's OneHotEncoder, configured to return a DataFrame.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
ohe = ohe.set_output(transform='pandas')
one_hot_encoded = ohe.fit_transform(df[['city']])
print(one_hot_encoded.head())

# Replace the raw 'city' column with its one-hot encoded columns.
df = pd.concat([df.drop(columns=['city']), one_hot_encoded], axis=1)
print(df.head())

# Column Encoders

In [None]:

# `OrdinalEncoder` vs `LabelEncoder` (both from `sklearn.preprocessing`):
# they differ in intended use and in the shape of input they accept.
#
# 1. `OrdinalEncoder`
#    - Purpose:  encode categorical *features* as integer codes, several
#                columns at a time.
#    - Input:    a 2D array (e.g. a DataFrame with multiple columns).
#    - Output:   one array of integer codes per categorical feature.
#    - Use case: encoding multiple categorical columns of a DataFrame.

df = pd.DataFrame({
    'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
    'size': ['small', 'medium', 'large', 'large', 'medium'],
    'sales': [1000, 2000, 22000, 50000, 40000],
})

encoder = OrdinalEncoder()

# Only the object-dtype (string) columns are encoded; 'sales' is untouched.
cat_columns = df.select_dtypes(include=['object']).columns

# Work on a copy so `df` keeps the raw string values.
original_df = df.copy()
original_df[cat_columns] = encoder.fit_transform(df[cat_columns])

print(original_df)

# 2. `LabelEncoder`
#    - Purpose:  encode target labels as integers between 0 and n_classes-1,
#                one column of labels at a time.
#    - Input:    a 1D array (e.g. a single DataFrame column).
#    - Output:   an array of integer labels.
#    - Use case: encoding the target variable or a single categorical column.

df = pd.DataFrame({
    'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
    'sales': [1000, 2000, 22000, 50000, 40000],
})

encoder = LabelEncoder()

# `assign` returns a new frame with 'city' replaced, leaving `df` unchanged.
lab_df = df.assign(city=encoder.fit_transform(df['city']))
print(lab_df)

### Summary:
# - `OrdinalEncoder`: encodes multiple categorical features given as a 2D array.
# - `LabelEncoder`: encodes a single categorical feature or the target labels
#   given as a 1D array.

# Handling Missing Data (Imputation)

In [3]:
from sklearn.impute import SimpleImputer
import numpy as np

# A single numeric column with one missing entry.
miles = pd.DataFrame({"farthest_run_mi": [50, 62, np.nan, 100, 26, 13, 31, 50]})

# One imputer per fill strategy. They are kept as separate names because
# later cells reuse some of them.
imp_mean = SimpleImputer(strategy='mean')
imp_median = SimpleImputer(strategy='median')
imp_mode = SimpleImputer(strategy='most_frequent')
imp_constant = SimpleImputer(strategy='constant', fill_value=0)

# Fit each imputer on `miles` and show the filled column, in the order
# mean, median, most frequent, constant.
for imputer in (imp_mean, imp_median, imp_mode, imp_constant):
    print(imputer.fit_transform(miles))

[[ 50.        ]
 [ 62.        ]
 [ 47.42857143]
 [100.        ]
 [ 26.        ]
 [ 13.        ]
 [ 31.        ]
 [ 50.        ]]
[[ 50.]
 [ 62.]
 [ 50.]
 [100.]
 [ 26.]
 [ 13.]
 [ 31.]
 [ 50.]]
[[ 50.]
 [ 62.]
 [ 50.]
 [100.]
 [ 26.]
 [ 13.]
 [ 31.]
 [ 50.]]
[[ 50.]
 [ 62.]
 [  0.]
 [100.]
 [ 26.]
 [ 13.]
 [ 31.]
 [ 50.]]


In [4]:
# A text column with two missing entries.
names = pd.DataFrame(
    {"name": ["John", "Paul", np.nan, "George", "Ringo", "Pete", "Stuart", np.nan]}
)

# Fill the missing names with a fixed placeholder string.
imp_constant = SimpleImputer(strategy='constant', fill_value="Unknown")
print(imp_constant.fit_transform(names))

[['John']
 ['Paul']
 ['Unknown']
 ['George']
 ['Ringo']]


In [5]:
# Mean imputation with add_indicator=True: the transformed array gains an
# extra column, named here as the "missing" flag for the imputed rows.
imp_mean_marked = SimpleImputer(strategy='mean', add_indicator=True)
marked_df = pd.DataFrame(
    imp_mean_marked.fit_transform(miles),
    columns=["farthest_run_mi", "farthest_run_mi_missing"],
)
print(marked_df)

   farthest_run_mi  farthest_run_mi_missing
0        50.000000                      0.0
1        62.000000                      0.0
2        47.428571                      1.0
3       100.000000                      0.0
4        26.000000                      0.0
5        13.000000                      0.0
6        31.000000                      0.0
7        50.000000                      0.0


In [13]:
from sklearn.compose import make_column_transformer

# Put the text and numeric columns side by side in one frame.
names_and_miles = pd.concat([names, miles], axis=1)
print(names_and_miles)

# Route each column to its own imputer and return a pandas DataFrame.
ct = make_column_transformer(
    (imp_constant, ["name"]),            # constant fill for the 'name' column
    (imp_mean, ["farthest_run_mi"]),     # mean fill for the 'farthest_run_mi' column
    remainder='passthrough',             # pass any remaining columns through
    verbose_feature_names_out=False,     # keep the original column names
).set_output(transform='pandas')

df_pandas = ct.fit_transform(names_and_miles)
print(df_pandas)

     name  farthest_run_mi
0    John             50.0
1    Paul             62.0
2     NaN              NaN
3  George            100.0
4   Ringo             26.0
5     NaN             13.0
6     NaN             31.0
7     NaN             50.0
      name  farthest_run_mi
0     John        50.000000
1     Paul        62.000000
2  Unknown        47.428571
3   George       100.000000
4    Ringo        26.000000
5  Unknown        13.000000
6  Unknown        31.000000
7  Unknown        50.000000


In [14]:
# Toy sales data: five base rows repeated five times.
df = pd.DataFrame(
    {'sales': [1000, 2000, 22000, 50000, 40000] * 5,
     'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'] * 5,
     'size': ['small', 'medium', 'large', 'large', 'medium'] * 5,
    }
)

ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# 'size' has a natural order, so spell it out explicitly. Without `categories`
# the encoder derives the codes from the sorted labels (large, medium, small),
# which does not reflect small < medium < large.
ode = OrdinalEncoder(categories=[['small', 'medium', 'large']])

# One-hot encode 'city', ordinal-encode 'size', and keep every other column.
ct1 = make_column_transformer(
    (ohe, ['city']),
    (ode, ['size']),
    remainder='passthrough', # leave the remaining columns as they are
    verbose_feature_names_out=False
)
ct1.set_output(transform='pandas')

df_pandas = ct1.fit_transform(df)
print(df_pandas)

    city_Chicago  city_Houston  city_Los Angeles  city_New York  city_Phoenix  \
0            0.0           0.0               0.0            1.0           0.0   
1            0.0           0.0               1.0            0.0           0.0   
2            1.0           0.0               0.0            0.0           0.0   
3            0.0           1.0               0.0            0.0           0.0   
4            0.0           0.0               0.0            0.0           1.0   
5            0.0           0.0               0.0            1.0           0.0   
6            0.0           0.0               1.0            0.0           0.0   
7            1.0           0.0               0.0            0.0           0.0   
8            0.0           1.0               0.0            0.0           0.0   
9            0.0           0.0               0.0            0.0           1.0   
10           0.0           0.0               0.0            1.0           0.0   
11           0.0           0

In [15]:
# Same encoders as the previous cell, but columns that are not explicitly
# listed are dropped from the result instead of being passed through.
ct2 = make_column_transformer(
    (ohe, ['city']),
    (ode, ['size']),
    remainder='drop',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

df_pandas = ct2.fit_transform(df)
print(df_pandas)

    city_Chicago  city_Houston  city_Los Angeles  city_New York  city_Phoenix  \
0            0.0           0.0               0.0            1.0           0.0   
1            0.0           0.0               1.0            0.0           0.0   
2            1.0           0.0               0.0            0.0           0.0   
3            0.0           1.0               0.0            0.0           0.0   
4            0.0           0.0               0.0            0.0           1.0   
5            0.0           0.0               0.0            1.0           0.0   
6            0.0           0.0               1.0            0.0           0.0   
7            1.0           0.0               0.0            0.0           0.0   
8            0.0           1.0               0.0            0.0           0.0   
9            0.0           0.0               0.0            0.0           1.0   
10           0.0           0.0               0.0            1.0           0.0   
11           0.0           0

In [None]:
# Data imputation using a KNN imputer

from sklearn.impute import KNNImputer # import the KNNImputer class

# Rebuild the single-column frame with one missing entry.
miles = pd.DataFrame({"farthest_run_mi": [50, 62, np.nan, 100, 26, 13, 31, 50]})

# Impute using the 2 nearest neighbors.
# NOTE(review): the frame has only one feature, so the row with the missing
# value has no other feature to measure neighbor distance on - confirm this
# input demonstrates KNN imputation as intended.
knn_imputer = KNNImputer(n_neighbors=2)
result = knn_imputer.fit_transform(miles)

# `result` is a 2D array with a single column; write that column back into
# the original DataFrame.
miles['farthest_run_mi'] = result[:, 0]