In [None]:
import pandas as pd
import numpy as np
import warnings

# Silence every warning category so library notices do not clutter cell output.
warnings.simplefilter("ignore")

# Seed NumPy's global random state so a fresh top-to-bottom run yields the same data.
np.random.seed(seed=42)


def createdata(n_rows=20):
    """Build a small synthetic customer table for the preprocessing demos.

    Values are drawn from NumPy's global random state, so the result depends
    on the seed in effect and on how many draws were made before this call.

    Parameters
    ----------
    n_rows : int, default 20
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        ``n_rows`` rows with the columns ``Age`` (integers in [18, 70)),
        ``Salary`` (integers in [30000, 120000)), ``Purchased`` (0 or 1),
        ``Gender`` ("Male" or "Female") and ``City`` (one of three city names).
    """
    # The draw order below is part of the contract: reordering these calls
    # would change the data produced for a given seed.
    data = {
        "Age": np.random.randint(18, 70, size=n_rows),
        "Salary": np.random.randint(30000, 120000, size=n_rows),
        "Purchased": np.random.choice([0, 1], size=n_rows),
        "Gender": np.random.choice(["Male", "Female"], size=n_rows),
        "City": np.random.choice(
            ["New York", "San Francisco", "Los Angeles"], size=n_rows
        ),
    }

    return pd.DataFrame(data)


# Build the demo frame and preview its first ten rows.
df = createdata()
df.head(n=10)

In [None]:
# Dimensions of the generated frame as (rows, columns).
df.shape

# When you have missing values in data


In [None]:
# Blank out one Salary cell and one Age cell so the cells below have gaps to treat.
df.loc[10, "Salary"] = np.nan
df.loc[5, "Age"] = np.nan
df.head(n=10)

In [None]:
# Simple imputation (mean, median, mode)
# Pros: Fast and straightforward.
# Cons: May introduce bias if the data is not missing at random.
# (Source: Unfold Data Science, missing-data treatment.)
#
# Assign the filled column back rather than calling fillna(inplace=True) on
# df["Age"]: the in-place form operates on an intermediate object, is
# deprecated, and leaves df untouched when pandas Copy-on-Write is active.
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Salary"] = df["Salary"].fillna(df["Salary"].median())
df.head(10)

In [None]:
# Listwise deletion
# Pros: Simple.
# Cons: Loses data - every row that contains a missing value is discarded.
#
# Earlier cells may already have filled the gaps in df, which would leave
# dropna() with nothing to remove. Recreate the gaps on a copy so the effect
# of the deletion is visible without disturbing df.
df_with_gaps = df.copy()
df_with_gaps.loc[5, "Age"] = np.nan
df_with_gaps.loc[10, "Salary"] = np.nan
df_dropped = df_with_gaps.dropna()
df_dropped.head(10)

In [None]:
# Dimensions of the frame returned by dropna(), as (rows, columns).
df_dropped.shape

In [None]:
# Predictive imputation (see also MICE / iterative imputation)
# Pros: More accurate for complex patterns.
# Cons: Computationally intensive, may add noise with high variance.
from sklearn.impute import KNNImputer

# Earlier cells may already have filled df; recreate the gaps so the imputer
# actually has values to estimate instead of returning its input unchanged.
df.loc[5, "Age"] = np.nan
df.loc[10, "Salary"] = np.nan

knn_imputer = KNNImputer(n_neighbors=3)
df[["Age", "Salary"]] = knn_imputer.fit_transform(df[["Age", "Salary"]])

In [None]:
# Preview the frame after the KNN imputation of Age and Salary.
df.head(10)

In [None]:
# Add indicator for missingness
# Pros: Allows the model to learn patterns of missingness.
# Cons: Can increase dimensionality and complexity.
#
# The flag must be taken while the value is still missing. Earlier cells may
# have imputed Age already (the flag would then be 0 for every row), so the
# gap is recreated here, flagged, and only then filled.
df.loc[5, "Age"] = np.nan
df["Age_missing"] = df["Age"].isna().astype(int)
df["Age"] = df["Age"].fillna(df["Age"].mean())
df.head(10)

# When you have Categorical Variables in data


In [None]:
# Label encoding: swap each Gender label for an integer code.
# Pros: Simple and space-efficient.
# Cons: Implies an ordinal relationship, which may mislead models.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
# Learn the label set first, then translate the column with the fitted encoder.
df["Gender"] = label_encoder.fit(df["Gender"]).transform(df["Gender"])
df.head(5)

In [None]:
# One-hot encoding: expand City into indicator columns, dropping the first level.
# Pros: Great for non-ordinal categories; maintains all category information.
# Cons: Increases dimensionality, particularly with many unique values.
df = pd.get_dummies(data=df, drop_first=True, columns=["City"])
df.head(5)

In [None]:
# Rebuild the frame so City holds its original string labels for the next encoders.
# createdata() continues the global random stream, so the values differ from the
# frame used above.
df = createdata()

In [None]:
# Ordinal encoding: replace each city with a hand-assigned integer rank.
# Pros: Effective for ordinal data; captures order.
# Cons: Not suitable for non-ordinal data, as it implies ranking.
city_rank = {"New York": 1, "San Francisco": 2, "Los Angeles": 3}
df["City"] = df["City"].map(city_rank)
df.head(5)

In [None]:
# Target encoding: represent each city by the mean of Purchased within that city.
# Pros: Useful for high-cardinality features.
# Cons: Can cause data leakage if target encoding is not done properly.
# Note: the means here are taken over every row, including the row being encoded.
mean_target = df["Purchased"].groupby(df["City"]).mean()
df["City_encoded"] = df["City"].map(mean_target)
df.head(5)

# When you need to scale features


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Standardization (mean=0, variance=1)
# Pros: Works well for normally distributed data; suitable for many models.
# Cons: Sensitive to outliers.
scaler = StandardScaler()
# Fit on Age and Salary, then overwrite both columns with their scaled values.
df[["Age", "Salary"]] = scaler.fit(df[["Age", "Salary"]]).transform(
    df[["Age", "Salary"]]
)
df.head(5)

In [None]:
# Regenerate the data so the next scaler starts from unscaled Age and Salary.
df = createdata()
df.head()

In [None]:
# Normalization (range 0-1)
# Pros: Keeps all data between 0 and 1; ideal for distance-based models.
# Cons: Can distort data distribution, especially with extreme outliers.
normalizer = MinMaxScaler()
# Fit on Age and Salary, then overwrite both columns with their rescaled values.
df[["Age", "Salary"]] = normalizer.fit(df[["Age", "Salary"]]).transform(
    df[["Age", "Salary"]]
)
df.head(5)

In [None]:
# Regenerate the data so the robust scaler starts from unscaled Age and Salary.
df = createdata()
df.head()

In [None]:
# Robust scaling (centres on the median and scales by the IQR)
# Pros: More robust to outliers by using median and IQR.
# Cons: Doesn’t work as well for normal distributions.
robust_scaler = RobustScaler()
# Fit on Age and Salary, then overwrite both columns with their scaled values.
df[["Age", "Salary"]] = robust_scaler.fit(df[["Age", "Salary"]]).transform(
    df[["Age", "Salary"]]
)
df.head()

# When you have outliers in data


In [None]:
# Regenerate the data so the outlier demo works on raw, unscaled Salary values.
df = createdata()
df.head()

In [None]:
# Outlier detection and treatment using the IQR
# Pros: Simple and effective for mild outliers.
# Cons: May overly reduce variation if there are many extreme outliers.
Q1, Q3 = df["Salary"].quantile([0.25, 0.75])
IQR = Q3 - Q1
# Fences sit 1.5 * IQR beyond the quartiles.
upper_bound = Q3 + 1.5 * IQR
lower_bound = Q1 - 1.5 * IQR
# Cap values outside the fences at the nearest fence; values inside are kept.
df["Salary"] = np.clip(df["Salary"].to_numpy(), lower_bound, upper_bound)

In [None]:
# Regenerate the data so the z-score demo starts from untreated Salary values.
df = createdata()
df.head()

In [None]:
# Z-score method
# Pros: Good for normally distributed data.
# Cons: Not suitable for non-normal data; may miss outliers in skewed distributions.
from scipy import stats

df["Salary_zscore"] = stats.zscore(df["Salary"], ddof=0)
# Rows whose |z| exceeds 3 are treated as outliers and blanked out with NaN.
df["Salary"] = np.where(np.abs(df["Salary_zscore"]) > 3, np.nan, df["Salary"])

In [None]:
# Regenerate the data so the median-replacement demo starts from untreated Salary values.
df = createdata()
df.head()

In [None]:
# Median replacement for outliers
# Pros: Keeps distribution shape intact, useful when capping isn’t feasible.
# Cons: May distort data if outliers represent real phenomena.
# Relies on `stats` (scipy) imported in the z-score cell above.
median_salary = df["Salary"].median()
df["Salary_zscore"] = stats.zscore(df["Salary"], ddof=0)
# Rows whose |z| exceeds 3 receive the column median; all others are kept.
df["Salary"] = np.where(
    np.abs(df["Salary_zscore"]) > 3, median_salary, df["Salary"]
)
df.head()

# Feature Engineering


In [None]:
# Regenerate the data so feature engineering starts from the raw columns.
df = createdata()
df.head()

In [None]:
# Binning: derive a categorical Income_Level from Salary
# Pros: Simplified continuous features; useful in non-linear models.
# Cons: Can lose data granularity, potentially reducing model accuracy.
df["Income_Level"] = pd.cut(
    x=df["Salary"],
    bins=[0, 50_000, 100_000, 150_000],
    labels=["Low", "Medium", "High"],
)
df.head(n=10)

In [None]:
# Polynomial Features
# Pros: Captures complex relationships between variables.
# Cons: Increases dimensionality, risking overfitting.
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df[["Age", "Salary"]])
df_poly = pd.DataFrame(
    poly_features,
    columns=poly.get_feature_names_out(["Age", "Salary"]),
    index=df.index,  # share df's row labels so concat pairs rows correctly
)
# The degree-2 expansion also returns the degree-1 terms Age and Salary, which
# df already holds. Leave them out of the concat so the combined frame does not
# end up with duplicate column names.
df = pd.concat([df, df_poly.drop(columns=["Age", "Salary"])], axis=1)
df.head()