# Dealing with missing data

In [None]:
# Print the notebook's metadata banner (description, authors, creation time).
print("""
@Description: Dealing with missing data
@Author(s): Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime: 2023-06-06 17:25:58
""")

In [None]:
import pandas as pd
from io import StringIO

In [None]:
# Tiny in-memory CSV sample: row 2 has no value for column C and
# row 3 has no value for column D.
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# Parse the text through a file-like buffer; empty fields become NaN.
df = pd.read_csv(StringIO(csv_data), sep=',')

# Number of missing values per column.
df.isna().sum()

In [None]:
# View the DataFrame's data as a NumPy array via the `values` attribute.
df.values

## Eliminating training examples or features with missing values

In [None]:
# Remove every row that contains at least one missing value.
df.dropna(axis='index')

In [None]:
# Remove every column that contains at least one missing value.
df.dropna(axis='columns')

In [None]:
# Only drop rows in which every column is NaN.
df.dropna(axis='index', how='all')

In [None]:
# Keep only rows with at least 4 non-NaN values; rows with fewer are dropped.
df.dropna(axis='index', thresh=4)

In [None]:
# Drop only the rows whose value in column 'C' is NaN.
df.dropna(axis='index', subset=['C'])

## Imputing missing values

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer

# Mean imputation: each NaN is replaced by the mean of its feature column.
imr = SimpleImputer(strategy='mean', missing_values=np.nan)

# `fit` learns the column means and returns the imputer itself, so the
# `transform` step can be chained directly onto it.
imputed_data = imr.fit(df.values).transform(df.values)
imputed_data

In [None]:
# pandas-only alternative: fill each column's NaNs with that column's mean.
df.fillna(value=df.mean())

## Understanding the scikit-learn estimator API

# Handling categorical data

## Categorical data encoding with pandas

In [None]:
# Example data with a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a class label.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

## Mapping ordinal features

In [None]:
# Ordinal encoding of the shirt sizes: a larger size gets a larger integer.
size_mapping = dict(XL=3, L=2, M=1)

In [None]:
# Replace each size label with its integer rank from size_mapping.
# NOTE: this overwrites the column in place. Series.map turns values that are
# not keys of the mapping into NaN, so re-running this cell on the
# already-encoded column would yield NaN.
df['size'] = df['size'].map(size_mapping)
df

In [None]:
# Reverse lookup (integer rank -> size label), used to recover the original
# string labels from the encoded column without modifying df.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df['size'].map(inv_size_mapping)

## Encoding class labels

In [None]:
import numpy as np

# Number the sorted unique class labels consecutively, starting at 0.
class_mapping = {
    name: code
    for code, name in enumerate(np.unique(df['classlabel']))
}
class_mapping

In [None]:
# Replace each class-label string with its integer code from class_mapping.
# NOTE: this overwrites the column in place. Series.map turns values that are
# not keys of the mapping into NaN, so re-running this cell on the
# already-encoded column would yield NaN.
df['classlabel'] = df['classlabel'].map(class_mapping)
df

In [None]:
# Reverse lookup (integer code -> class label), then write the original
# string labels back into the DataFrame.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

In [None]:
from sklearn.preprocessing import LabelEncoder

# scikit-learn equivalent of the manual class mapping: learn the label set
# first, then convert the labels to their integer codes.
class_le = LabelEncoder()
class_le.fit(df['classlabel'].values)
y = class_le.transform(df['classlabel'].values)
y

In [None]:
# Map the integer codes in y back to the original class-label strings.
class_le.inverse_transform(y)

## Performing one-hot encoding on nominal features

In [None]:
# Encode the color column (column 0 of the feature array) as integer codes.
color_le = LabelEncoder()
X = df[['color', 'size', 'price']].values
X[:, 0] = color_le.fit_transform(X[:, 0])
X

In [None]:
from sklearn.preprocessing import OneHotEncoder

color_ohe = OneHotEncoder()
X = df[['color', 'size', 'price']].values

# Index with a list so the color column stays 2-D (n_samples, 1), then
# densify the sparse result for display.
color_ohe.fit_transform(X[:, [0]]).toarray()

In [None]:
from sklearn.compose import ColumnTransformer

X = df[['color', 'size', 'price']].values

# One-hot encode column 0 (color) and pass columns 1-2 (size, price)
# through unchanged.
c_transf = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(), [0]),
    ('nothing', 'passthrough', [1, 2]),
])
c_transf.fit_transform(X)

In [None]:
# pandas shortcut for one-hot encoding: only string columns are expanded
# into dummy features; numeric columns are left as they are.
pd.get_dummies(df.loc[:, ['color', 'size', 'price']])

In [None]:
# Drop the first dummy column of each encoded feature to avoid redundant
# (perfectly collinear) columns, and cast everything to float.
pd.get_dummies(
    df.loc[:, ['color', 'size', 'price']],
    drop_first=True,
).astype(float)

In [None]:
# Same redundancy removal with scikit-learn: drop='first' omits the first
# category's column from the one-hot output.
color_ohe = OneHotEncoder(drop='first', categories='auto')
c_transf = ColumnTransformer(transformers=[
    ('onehot', color_ohe, [0]),
    ('nothing', 'passthrough', [1, 2]),
])
c_transf.fit_transform(X).astype(float)

## Optional: encoding ordinal features

In [None]:
# Rebuild the example DataFrame with the original string-valued size column.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

In [None]:
# Threshold encoding of the ordinal size feature: one 0/1 indicator column
# per "larger than" comparison. Vectorized comparisons replace the
# per-element lambdas; the explicit int64 cast keeps the same integer
# columns that the element-wise version produced.
df['x > M'] = df['size'].isin(['L', 'XL']).astype('int64')
df['x > L'] = (df['size'] == 'XL').astype('int64')

# The original string column is no longer needed once it is encoded.
del df['size']
df

# Partitioning a dataset into separate training and test datasets

In [1]:
import sys

# Append the current and the parent directory to the module search path
# (presumably so the project-local `data` package can be imported).
sys.path.extend(['./', '../'])

In [2]:
# Load the dataset through the project-local DataLoader helper.
# NOTE(review): the result is named `df_wine` but is produced by
# `get_iris()` — confirm whether a wine-loading method was intended here.
from data.get_data import DataLoader
dl = DataLoader()
df_wine = dl.get_iris()