# TABULAR DATA PREPROCESSING

## 1. Imports and settings

Import required libraries and configure display options.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 200)
%matplotlib inline

## 2. Load dataset


In [None]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

## 3. Initial inspection

In [None]:
df_train.head()

In [None]:
df_train.info()

In [None]:
df_train.describe()

In [None]:
df_train.describe(include='object')

In [None]:
df_train.isna().sum()

In [None]:
df_train.duplicated().sum()

In [None]:
df_train.drop_duplicates(inplace=True)

In [None]:
print(len(df_train['target'].unique()))
print(df_train['target'].value_counts())

## 4. Split dataset

In [None]:
# Separate the features from the label column of the training data.
X = df_train.drop("target", axis=1)
y = df_train["target"]
# Copy instead of aliasing: later cells assign to / drop columns of X_test in
# place, which would otherwise silently rewrite df_test as well.
X_test = df_test.copy()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print("X Train set size:", X_train.shape)
print("X Validation set size:", X_val.shape)
print("X Test set size:", X_test.shape)

print("y Train set size:", y_train.shape)
print("y Validation set size:", y_val.shape)

## 5. Missing Value Handling


### 5.1 Detecting Hidden Missing Data

In [None]:
def detect_hidden_missing_data(df, suspicious_values=None, target_column=None):
    """
    Summarise missing entries per column, counting both real NaN/None values
    and caller-supplied placeholder values.

    Parameters:
    - df: DataFrame to inspect; it is never modified.
    - suspicious_values: Optional list of placeholder values to count as missing
                         (e.g., [-999, 'unknown', '']). None means only NaN/None count.
    - target_column: Optional column name. When given, only that column is
                     inspected; otherwise every column is.

    Returns:
    - A DataFrame indexed by column name with 'Missing Count' and
      'Missing Percentage (%)', limited to columns with at least one missing
      entry and sorted by count (descending). An empty DataFrame is returned
      when target_column does not exist in df.
    """
    placeholders = [] if suspicious_values is None else suspicious_values

    # Guard clause: an unknown column is reported and produces an empty result.
    if target_column is not None and target_column not in df.columns:
        print(f"Error: Column '{target_column}' not found in DataFrame.")
        return pd.DataFrame()

    # Work on a private copy so the caller's frame stays untouched.
    if target_column is None:
        scoped = df.copy()
        print("Checking missing data for all columns")
    else:
        scoped = df.copy()[[target_column]]
        print(f"Checking missing data for column: {target_column}")

    # Convert each placeholder to NaN so that isnull() picks it up.
    for placeholder in placeholders:
        scoped = scoped.replace(placeholder, np.nan)

    n_missing = scoped.isnull().sum()
    pct_missing = ((n_missing / len(scoped)) * 100).round(2)
    summary = pd.DataFrame({
        'Missing Count': n_missing,
        'Missing Percentage (%)': pct_missing,
    })
    summary = summary.sort_values(by='Missing Count', ascending=False)

    # Only columns that actually have something missing are reported.
    summary = summary[summary['Missing Count'] > 0]

    if summary.empty:
        print("No missing data detected in the selected column(s).")
    else:
        print("Missing Data Summary:")
        print(summary)

    return summary

In [None]:
# suspicious = [-999, 'unknown', '', 'N/A']  # Customize based on your data

# # Check for a specific column
# missing_summary_train = detect_hidden_missing_data(df_train, suspicious_values=suspicious, target_column='column_name')
# missing_summary_val = detect_hidden_missing_data(X_val, suspicious_values=suspicious, target_column='column_name')
# missing_summary_test = detect_hidden_missing_data(df_test, suspicious_values=suspicious, target_column='column_name')

# # Check for all columns
# missing_summary_all_train = detect_hidden_missing_data(df_train, suspicious_values=suspicious)
# missing_summary_all_val = detect_hidden_missing_data(X_val, suspicious_values=suspicious)
# missing_summary_all_test = detect_hidden_missing_data(df_test, suspicious_values=suspicious)

### 5.2 Show Unique Values and Percentages of Missing Values

In [None]:
def missing_and_unique_info(df):
    """
    Report missing-value statistics for every column that contains NaN.

    Parameters:
    - df: DataFrame to inspect; it is not modified.

    Returns:
    - A DataFrame indexed by column name (index name 'Feature') with:
        'Missing %'                        percentage of missing rows, 2 decimals
        'Non-null Count'                   number of non-missing rows
        'Unique (for object/categorical)'  distinct non-missing values for
                                           object, string and categorical
                                           columns; NaN for every other dtype
      An empty DataFrame is returned (and a message printed) when no column
      has missing values.
    """
    missing_count = df.isnull().sum()
    cols_with_missing = missing_count[missing_count > 0].index

    if cols_with_missing.empty:
        print("No columns with missing values")
        return pd.DataFrame()

    missing_percent = (missing_count[cols_with_missing] / len(df) * 100).round(2)
    non_null_count = df[cols_with_missing].notnull().sum()

    def _is_label_like(dtype):
        # isinstance checks replace the deprecated is_categorical_dtype helper
        # and also cover the dedicated pandas string dtype.
        return pd.api.types.is_object_dtype(dtype) or isinstance(
            dtype, (pd.CategoricalDtype, pd.StringDtype)
        )

    unique_counts = pd.Series(index=cols_with_missing, dtype='object')
    for col in cols_with_missing:
        if _is_label_like(df[col].dtype):
            unique_counts[col] = df[col].nunique(dropna=True)
        else:
            # Cardinality is not reported for numeric/datetime columns.
            unique_counts[col] = np.nan

    report = pd.DataFrame({
        'Missing %': missing_percent,
        'Non-null Count': non_null_count,
        'Unique (for object/categorical)': unique_counts,
    })

    report.index.name = 'Feature'
    return report

In [None]:
result_train = missing_and_unique_info(X_train)
result_val = missing_and_unique_info(X_val)
result_test = missing_and_unique_info(X_test)

print(result_train)
print(result_val)
print(result_test)

### 5.3 Drop Columns
Use drop_columns when you want to drop entire useless columns from the DataFrame.

In [None]:
def drop_columns(df, columns):
    """
    Drop the specified columns from the DataFrame in place.

    Parameters:
    - df: DataFrame to modify (it is mutated, not copied).
    - columns: column label or list of labels to remove; a missing label
               raises KeyError (pandas default).

    Returns:
    - The same DataFrame object, so the call can be used either as a statement
      or in the `X = helper(X, ...)` form used by the other helpers.
    """
    df.drop(columns=columns, inplace=True)
    return df

In [None]:
# drop_columns(X_train, ['col1', 'col2'])
# drop_columns(X_val, ['col1', 'col2'])
# drop_columns(X_test, ['col1', 'col2'])

### 5.4 Drop Rows with Missing Values

In [None]:
def drop_missing(df, target_column=None):
    """
    Drop rows with missing values, either anywhere or only in selected columns.

    Parameters:
    - df: pandas.DataFrame containing the data; it is not modified.
    - target_column: None, a column name, or a list of column names.
        If None, every row containing at least one missing value is dropped.
        Otherwise only rows that are missing a value in one of the given
        column(s) are dropped; missing values in other columns are kept.

    Returns:
    - pandas.DataFrame: a new DataFrame without the dropped rows. The original
      index labels are preserved, so a separately stored target can be
      realigned with `y.loc[result.index]`.
    """
    if target_column is None:
        return df.dropna()

    # Accept both a single label and a list of labels.
    if pd.api.types.is_list_like(target_column):
        subset = list(target_column)
    else:
        subset = [target_column]
    return df.dropna(subset=subset)

In [None]:
## Drop rows with missing values from the special columns of training DataFrame
# X_train = drop_missing(X_train, target_column=['col1', 'col2'])
# X_val = drop_missing(X_val, target_column=['col1', 'col2'])

# Drop rows with missing values from the entire training DataFrame
# X_train = drop_missing(X_train)
# X_val = drop_missing(X_val)

### 5.5 Numerical Imputation
Use fill_numerical_cols for numerical features when distribution is stable.

In [None]:
def fill_numerical_cols(df, num_cols, strategy="mean", reference_df=None):
    """
    Fill missing values of numerical columns with the mean, median or mode.

    Parameters:
    - df: DataFrame to fill. Its columns are overwritten in place.
    - num_cols: column name or iterable of column names to fill.
    - strategy: "mean", "median" or "mode" (smallest mode on ties).
    - reference_df: optional DataFrame the statistic is computed from. Pass the
      training frame when filling validation/test data so those splits are not
      imputed with their own statistics. Defaults to df itself.

    Returns:
    - The same DataFrame object, with the requested columns filled.

    Raises:
    - ValueError: if strategy is not one of the supported names.

    Columns whose statistic is undefined (no non-missing value in the
    reference) are left unchanged.
    """
    if strategy not in ("mean", "median", "mode"):
        raise ValueError(
            f"Unknown strategy '{strategy}'; expected 'mean', 'median' or 'mode'."
        )
    if isinstance(num_cols, str):
        num_cols = [num_cols]

    source = df if reference_df is None else reference_df
    for col in num_cols:
        if strategy == "mean":
            fill_value = source[col].mean()
        elif strategy == "median":
            fill_value = source[col].median()
        else:
            # mode() is empty when the column has no non-missing value.
            modes = source[col].mode()
            fill_value = modes.iloc[0] if not modes.empty else np.nan

        if pd.notna(fill_value):
            df[col] = df[col].fillna(fill_value)
    return df

In [None]:
# num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
# X_train = fill_numerical_cols(X_train, num_cols=num_cols, strategy="mean")
# X_val = fill_numerical_cols(X_val, num_cols=num_cols, strategy="mean")
# X_test = fill_numerical_cols(X_test, num_cols=num_cols, strategy="mean")

### 5.6 Categorical Imputation
Use fill_categorical for categorical features.

In [None]:
def fill_categorical(df, cat_cols, reference_df=None):
    """
    Fill missing categorical values with the most frequent value (mode).

    Parameters:
    - df: DataFrame to fill. Its columns are overwritten in place.
    - cat_cols: column name or iterable of column names to fill.
    - reference_df: optional DataFrame the mode is computed from. Pass the
      training frame when filling validation/test data. Defaults to df itself.

    Returns:
    - The same DataFrame object, with the requested columns filled.

    A column with no non-missing value in the reference has no mode and is
    left unchanged instead of raising.
    """
    if isinstance(cat_cols, str):
        cat_cols = [cat_cols]

    source = df if reference_df is None else reference_df
    for col in cat_cols:
        modes = source[col].mode()
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])
    return df

In [None]:
# cat_cols = X_train.select_dtypes(include=['object']).columns
# X_train = fill_categorical(X_train, cat_cols=cat_cols)
# X_val = fill_categorical(X_val, cat_cols=cat_cols)
# X_test = fill_categorical(X_test, cat_cols=cat_cols)

### 5.7 Forward and Backward Fill
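Use forward fill (`ffill`) or backward fill (`bfill`) for ordered data such as time series, where the previous or next observation is a reasonable stand-in for a missing value.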

In [None]:
def fill_with_ffill_bfill(df, columns, method="ffill"):
    """
    Fill missing values by propagating the previous or the next observation.

    Parameters:
    - df: DataFrame to fill. The selected columns are overwritten in place.
    - columns: list of column names to fill.
    - method: "ffill" (alias "pad") carries the last valid value forward;
              "bfill" (alias "backfill") carries the next valid value backward.

    Returns:
    - The same DataFrame object.

    Raises:
    - ValueError: if method is not one of the names above.

    Values with no valid neighbour in the fill direction (e.g. a leading NaN
    with "ffill") stay missing.
    """
    # DataFrame.ffill()/bfill() replace fillna(method=...), which pandas has
    # deprecated and later removed.
    if method in ("ffill", "pad"):
        df[columns] = df[columns].ffill()
    elif method in ("bfill", "backfill"):
        df[columns] = df[columns].bfill()
    else:
        raise ValueError(f"Unknown method '{method}'; expected 'ffill' or 'bfill'.")
    return df

In [None]:
# X_train = fill_with_ffill_bfill(X_train, columns=['num_col1', 'num_col2'], method="ffill")
# X_val = fill_with_ffill_bfill(X_val, columns=['num_col1', 'num_col2'], method="ffill")
# X_test = fill_with_ffill_bfill(X_test, columns=['num_col1', 'num_col2'], method="ffill")

### 5.8 Imputation Techniques

#### 5.8.1 KNN Imputation

In [None]:
from sklearn.impute import KNNImputer

# Placeholder column names: replace with the numeric columns to impute.
numeric_columns = ['num_col1', 'num_col2']
# Each missing value is estimated from the 3 nearest rows.
imputer = KNNImputer(n_neighbors=3)

# Fit on the training split only, then reuse the fitted imputer for
# validation and test so no information from those splits is learned.
X_train[numeric_columns] = imputer.fit_transform(X_train[numeric_columns])
X_val[numeric_columns] = imputer.transform(X_val[numeric_columns])
X_test[numeric_columns] = imputer.transform(X_test[numeric_columns])

#### 5.8.2 Iterative Imputation

In [None]:
from sklearn.linear_model import LinearRegression
# IterativeImputer is still experimental in scikit-learn: this import must run
# first, otherwise the `sklearn.impute` import below raises ImportError.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

numeric_columns = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Each column with missing values is regressed on the other columns, for up to
# max_iter rounds; skip_complete=True skips columns that had no missing values
# during fit.
imputer = IterativeImputer(
    estimator=LinearRegression(),
    max_iter=10,
    random_state=42,
    skip_complete=True
)

# Fit on the training split only; validation/test reuse the fitted imputer.
X_train[numeric_columns] = imputer.fit_transform(X_train[numeric_columns])
X_val[numeric_columns] = imputer.transform(X_val[numeric_columns])
X_test[numeric_columns] = imputer.transform(X_test[numeric_columns])

In [None]:
# numeric_columns = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()

# X_train = fill_with_iterative_imputer(X_train, numeric_columns, is_train=True, verbose=True)
# X_val = fill_with_iterative_imputer(X_val, numeric_columns, is_train=False, verbose=True)
# X_test = fill_with_iterative_imputer(X_test, numeric_columns, is_train=False, verbose=True)

## 6. Exploratory Data Analysis (EDA)

Visualize numerical and categorical distributions, boxplots for outliers, and correlation heatmap.


In [None]:
numeric_columns = X_train.select_dtypes(include=np.number).columns
categorical_columns = X_train.select_dtypes(include=['object','category','bool']).columns

### 6.1 Numerical features distribution

In [2]:
def plot_numeric_distributions(df, num_cols, bins=50):
    """
    Show one histogram with a KDE overlay for each column in num_cols.

    Missing values are dropped before plotting; `bins` is forwarded to
    seaborn's histplot. Every column gets its own 8x4 figure.
    """
    for column_name in num_cols:
        plt.figure(figsize=(8, 4))
        observed = df[column_name].dropna()
        sns.histplot(observed, kde=True, bins=bins)
        plt.title(f'Distribution: {column_name}')
        plt.show()

In [None]:
# plot_numeric_distributions(X_train, numeric_columns)

### 6.2 Categorical features count

In [None]:
def plot_categorical_counts(df, cat_cols, top_n=20):
    """
    Show a horizontal count plot for each column in cat_cols.

    Only the top_n most frequent categories are drawn, ordered from most to
    least frequent. Every column gets its own 6x4 figure.
    """
    for column_name in cat_cols:
        plt.figure(figsize=(6,4))
        series = df[column_name]
        # value_counts() is already sorted by frequency, descending.
        most_frequent = series.value_counts().index[:top_n]
        sns.countplot(y=series, order=most_frequent)
        plt.title(f'Counts: {column_name}')
        plt.show()

In [None]:
# plot_categorical_counts(X_train, categorical_columns)

### 6.3 Correlation heatmap

In [None]:
def correlation_heatmap(df, num_cols):
    """
    Draw an annotated heatmap of the pairwise correlations between num_cols.

    Correlations come from DataFrame.corr() with its default settings and are
    printed in each cell with two decimals.
    """
    correlations = df[num_cols].corr()
    plt.figure(figsize=(10,8))
    sns.heatmap(
        correlations,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
    )
    plt.title('Correlation Heatmap')
    plt.show()

In [None]:
# correlation_heatmap(X_train, numeric_columns)

### 6.4 Pre Processing Date Columns

#### 6.4.1 Date Column Parsing

In [None]:
def parse_datetime_columns(df, date_cols, format=None):
    """
    Return a copy of df in which the given columns are parsed as datetimes.

    `format` is forwarded to pandas.to_datetime. Values that cannot be parsed
    become NaT (errors='coerce') rather than raising. The input frame is left
    unchanged.
    """
    converted = df.copy()
    for name in date_cols:
        raw_values = converted[name]
        converted[name] = pd.to_datetime(raw_values, format=format, errors='coerce')
    return converted

In [None]:
# date_cols = ['timestamp', 'date', 'created_at']
# X_train = parse_datetime_columns(X_train, date_cols, format='%Y-%m-%d %H:%M:%S')
# X_val = parse_datetime_columns(X_val, date_cols, format='%Y-%m-%d %H:%M:%S')
# X_test = parse_datetime_columns(X_test, date_cols, format='%Y-%m-%d %H:%M:%S')

#### 6.4.2 Impute Time Series

In [None]:
time_cols = ['timestamp', 'datetime', 'date']

##### 6.4.2.1 ffill & bfill

In [None]:
# Forward-fill each time column: a missing timestamp takes the value of the
# previous row. Series.ffill() replaces fillna(method='ffill'), which pandas
# has deprecated and later removed.
for col in time_cols:
    X_train[col] = X_train[col].ffill()
    X_val[col] = X_val[col].ffill()
    X_test[col] = X_test[col].ffill()

##### 6.4.2.2 KNN Imputation

In [None]:
imputer = KNNImputer(n_neighbors=3)

# KNNImputer only accepts numeric input and returns plain ndarrays, so it is
# applied to the numeric columns and written back into the DataFrames. That
# way datetime/object columns do not break the fit, and later cells can keep
# using DataFrame methods on X_train / X_val / X_test.
knn_columns = X_train.select_dtypes(include=np.number).columns

X_train[knn_columns] = imputer.fit_transform(X_train[knn_columns])
X_val[knn_columns] = imputer.transform(X_val[knn_columns])
X_test[knn_columns] = imputer.transform(X_test[knn_columns])

### 6.5 Handling Skewed Numerical Features

To handle skewed numerical features, check skewness (|skew| > 1 is significant) and apply a transformation if needed:

- **Log**: `np.log1p()` for positive, right-skewed data (handles zeros).  
- **Square root**: `np.sqrt()` for moderate right-skew.  
- **Box-Cox**: from `scipy.stats.boxcox` (requires positive values).  
- **Yeo-Johnson**: from `sklearn.preprocessing.PowerTransformer` (works with zeros/negatives).

#### 6.5.1 Log Transformation

In [None]:
from scipy.stats import skew

def apply_log_transform_to_all(X_train, X_val, X_test, skew_threshold=1.0, plot=True):
    """
    Applies log1p transformation to skewed numerical columns (decided by X_train)
    and returns transformed X_train, X_val, X_test.

    A numeric column of X_train is selected when the absolute skewness of its
    non-missing values exceeds skew_threshold and none of those values is
    negative. Missing values are ignored by both checks and stay NaN after the
    transform, so a column is no longer disqualified merely for containing NaN.
    The same columns are then transformed in X_val and X_test.

    Parameters:
    - X_train, X_val, X_test: DataFrames (not modified)
    - skew_threshold: threshold for |skew| to consider a column skewed (default: 1.0)
    - plot: if True, prints the selected columns and shows before/after
            histograms for them (default: True)

    Returns:
    - Transformed copies (X_train, X_val, X_test); the original objects are
      returned as-is when no column qualifies.

    Note: non-negativity is only checked on X_train. Values at or below -1 in
    X_val / X_test become -inf / NaN under log1p.
    """
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

    skewed_cols = []
    for col in numeric_cols:
        observed = X_train[col].dropna()
        if observed.empty:
            continue
        col_skew = skew(observed)
        # Test the sign on observed values only: NaN >= 0 is False and would
        # otherwise reject every column that has a missing value.
        if abs(col_skew) > skew_threshold and (observed >= 0).all():
            skewed_cols.append(col)
            if plot:
                print(f"Applying log1p to '{col}' (skew = {col_skew:.2f})")

    if not skewed_cols:
        if plot:
            print("No columns meet the skewness and positivity criteria for log transform.")
        return X_train, X_val, X_test

    def _log_transform(df, cols):
        # Work on a copy so the caller's frame is never mutated.
        df = df.copy()
        for col in cols:
            df[col] = np.log1p(df[col])
        return df

    X_train_new = _log_transform(X_train, skewed_cols)
    X_val_new = _log_transform(X_val, skewed_cols)
    X_test_new = _log_transform(X_test, skewed_cols)

    if plot:
        for col in skewed_cols:
            fig, axes = plt.subplots(1, 2, figsize=(12, 4))
            # X_train itself is untouched, so it serves as the "before" data.
            sns.histplot(X_train[col].dropna(), kde=True, ax=axes[0], color='skyblue')
            axes[0].set_title(f'Original: {col}\nSkew = {skew(X_train[col].dropna()):.2f}')

            sns.histplot(X_train_new[col].dropna(), kde=True, ax=axes[1], color='lightgreen')
            axes[1].set_title(f'Log-transformed: {col}\nSkew = {skew(X_train_new[col].dropna()):.2f}')

            plt.tight_layout()
            plt.show()

    return X_train_new, X_val_new, X_test_new

In [None]:
# X_train, X_val, X_test = apply_log_transform_to_all(
#     X_train, X_val, X_test,
#     skew_threshold=1.0,
#     plot=True
# )

#### 6.5.2 Square Root Transformation

In [None]:
def apply_sqrt_transform_to_all(X_train, X_val, X_test, skew_threshold=1.0, plot=True):
    """
    Applies sqrt transformation to skewed numerical columns (decided by X_train)
    and returns transformed X_train, X_val, X_test.

    A numeric column of X_train is selected when the absolute skewness of its
    non-missing values exceeds skew_threshold and none of those values is
    negative. Missing values are ignored by both checks and stay NaN after the
    transform, so a column is no longer disqualified merely for containing NaN.

    Parameters:
    - X_train, X_val, X_test: DataFrames (not modified)
    - skew_threshold: threshold for |skew| to consider a column skewed (default: 1.0)
    - plot: if True, prints the selected columns and shows before/after
            histograms for them (default: True)

    Returns:
    - Transformed copies (X_train, X_val, X_test); the original objects are
      returned as-is when no column qualifies.

    Note: non-negativity is only checked on X_train. Negative values in
    X_val / X_test become NaN under sqrt.
    """
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

    skewed_cols = []
    for col in numeric_cols:
        observed = X_train[col].dropna()
        if observed.empty:
            continue
        col_skew = skew(observed)
        # Test the sign on observed values only: NaN >= 0 is False and would
        # otherwise reject every column that has a missing value.
        if abs(col_skew) > skew_threshold and (observed >= 0).all():
            skewed_cols.append(col)
            if plot:
                print(f"Applying sqrt to '{col}' (skew = {col_skew:.2f})")

    if not skewed_cols:
        if plot:
            print("No columns meet criteria for sqrt transform.")
        return X_train, X_val, X_test

    def _sqrt_transform(df, cols):
        # Work on a copy so the caller's frame is never mutated.
        df = df.copy()
        for col in cols:
            df[col] = np.sqrt(df[col])
        return df

    X_train_new = _sqrt_transform(X_train, skewed_cols)
    X_val_new = _sqrt_transform(X_val, skewed_cols)
    X_test_new = _sqrt_transform(X_test, skewed_cols)

    if plot:
        for col in skewed_cols:
            fig, axes = plt.subplots(1, 2, figsize=(12, 4))
            # X_train itself is untouched, so it serves as the "before" data.
            sns.histplot(X_train[col].dropna(), kde=True, ax=axes[0], color='skyblue')
            axes[0].set_title(f'Original: {col}\nSkew = {skew(X_train[col].dropna()):.2f}')

            sns.histplot(X_train_new[col].dropna(), kde=True, ax=axes[1], color='lightgreen')
            axes[1].set_title(f'Sqrt-transformed: {col}\nSkew = {skew(X_train_new[col].dropna()):.2f}')

            plt.tight_layout()
            plt.show()

    return X_train_new, X_val_new, X_test_new

In [None]:
# X_train, X_val, X_test = apply_sqrt_transform_to_all(
#     X_train, X_val, X_test,
#     skew_threshold=1.0,
#     plot=True
# )

#### 6.5.3 Box-Cox Transformation

In [None]:
from scipy.stats import skew, boxcox

def apply_boxcox_to_all(X_train, X_val, X_test, skew_threshold=1.0, plot=True):
    """
    Applies Box-Cox transformation to skewed numerical columns.
    Decision based on X_train; same λ used for all.

    A numeric column of X_train is selected when all of its non-missing values
    are strictly positive and the absolute skewness of those values exceeds
    skew_threshold. λ is fitted by scipy.stats.boxcox on the non-missing
    training values only, then the closed-form transform
    (x**λ - 1) / λ (or log(x) when λ is ~0) is applied to all three splits.
    Missing values stay NaN.

    Parameters:
    - X_train, X_val, X_test: DataFrames (not modified)
    - skew_threshold: threshold for |skew| to consider a column skewed (default: 1.0)
    - plot: if True, prints progress messages and shows before/after
            histograms for transformed columns (default: True)

    Returns:
    - Transformed copies (X_train, X_val, X_test); the original objects are
      returned as-is when no column is transformed.

    Note: positivity is only checked on X_train. Non-positive values in
    X_val / X_test may turn into NaN or inf.
    """
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

    lambdas = {}
    for col in numeric_cols:
        observed = X_train[col].dropna()
        if observed.empty:
            continue
        if (observed <= 0).any():
            if plot:
                print(f"Skipping '{col}': contains non-positive values.")
            continue

        col_skew = skew(observed)
        if not abs(col_skew) > skew_threshold:
            continue

        try:
            # Fit on observed values only: NaN in the input would corrupt the
            # estimated λ and with it the whole column.
            _, fitted_lambda = boxcox(observed.to_numpy(dtype=float))
        except Exception as e:  # best effort: a failed fit just skips the column
            if plot:
                print(f"Failed to apply Box-Cox to '{col}': {e}")
            continue

        lambdas[col] = fitted_lambda
        if plot:
            print(f"Applying Box-Cox to '{col}' (skew={col_skew:.2f}, λ={fitted_lambda:.3f})")

    if not lambdas:
        if plot:
            print("No columns transformed with Box-Cox.")
        return X_train, X_val, X_test

    def _apply_boxcox(df):
        # One copy per frame, then every selected column is transformed with
        # the λ learned from the training split.
        df = df.copy()
        for col, lam in lambdas.items():
            if abs(lam) < 1e-6:
                df[col] = np.log(df[col])
            else:
                df[col] = (np.power(df[col], lam) - 1) / lam
        return df

    X_train_new = _apply_boxcox(X_train)
    X_val_new = _apply_boxcox(X_val)
    X_test_new = _apply_boxcox(X_test)

    if plot:
        for col in lambdas:
            fig, axes = plt.subplots(1, 2, figsize=(12, 4))
            # X_train itself is untouched, so it serves as the "before" data.
            sns.histplot(X_train[col].dropna(), kde=True, ax=axes[0], color='skyblue')
            axes[0].set_title(f'Original: {col}\nSkew = {skew(X_train[col].dropna()):.2f}')

            sns.histplot(X_train_new[col].dropna(), kde=True, ax=axes[1], color='lightgreen')
            axes[1].set_title(f'Box-Cox: {col}\nSkew = {skew(X_train_new[col].dropna()):.2f}')

            plt.tight_layout()
            plt.show()

    return X_train_new, X_val_new, X_test_new

In [None]:
# X_train, X_val, X_test = apply_boxcox_to_all(
#     X_train, X_val, X_test,
#     skew_threshold=1.0,
#     plot=True
# )

#### 6.5.4 Yeo-Johnson Transformation

In [None]:
from sklearn.preprocessing import PowerTransformer

# Candidate columns: every numeric column of the training split.
numeric_columns = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Keep the columns whose training skewness (NaN ignored) exceeds 1 in
# absolute value; only these are passed to the transformer.
skewed_cols = []
for col in numeric_columns:
    if abs(skew(X_train[col].dropna())) > 1.0:
        skewed_cols.append(col)

print("Skewed columns to transform:", skewed_cols)

# standardize=False: apply the power transform only, without the zero-mean /
# unit-variance scaling PowerTransformer performs by default.
pt = PowerTransformer(method='yeo-johnson', standardize=False)

In [None]:
# X_train[skewed_cols] = pt.fit_transform(X_train[skewed_cols])
# X_val[skewed_cols] = pt.transform(X_val[skewed_cols])
# X_test[skewed_cols] = pt.transform(X_test[skewed_cols])

## 7. Outlier Detection & Treatment

Implement IQR-based and Z-score methods.

### 7.1 Boxplot to check outliers

In [None]:
def plot_boxplots(df, num_cols):
    """
    Show a horizontal boxplot for each column in num_cols.

    Missing values are dropped before plotting. Every column gets its own
    6x2 figure.
    """
    for column_name in num_cols:
        plt.figure(figsize=(6,2))
        observed = df[column_name].dropna()
        sns.boxplot(x=observed)
        plt.title(f'Boxplot: {column_name}')
        plt.show()

In [None]:
# plot_boxplots(X_train, numeric_columns)
# plot_boxplots(X_val, numeric_columns)
# plot_boxplots(X_test, numeric_columns)

### 7.2 IQR Outlier Removal

In [None]:
def remove_outliers_iqr(df, num_cols, k=1.5, verbose=True):
    """
    Remove rows whose value in any of num_cols lies outside the IQR fences.

    Parameters:
    - df: Input DataFrame (not modified)
    - num_cols: str or list of column names
    - k: fence multiplier; values outside [Q1 - k*IQR, Q3 + k*IQR] are outliers
         (default: 1.5)
    - verbose: Print the number of rows removed per column (default: True)

    Returns:
    - A new DataFrame with the outlier rows removed. Index labels are
      preserved, so a separately stored target can be realigned with
      `y.loc[result.index]`.

    Columns are processed in order and each column's quartiles are computed on
    the rows that survived the previous columns. Missing values are not
    treated as outliers: rows with NaN in a checked column are kept.
    """
    df = df.copy()

    # A bare string would otherwise be iterated character by character.
    if isinstance(num_cols, str):
        num_cols = [num_cols]

    for c in num_cols:
        Q1 = df[c].quantile(0.25)
        Q3 = df[c].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - k * IQR
        upper = Q3 + k * IQR
        # NaN fails every comparison, so it is whitelisted explicitly rather
        # than silently dropped as if it were an outlier.
        keep = df[c].between(lower, upper) | df[c].isna()
        before = len(df)
        df = df[keep]
        after = len(df)
        if verbose:
            print(f"Column {c}: removed {before-after} rows using IQR (k={k})")
    return df

In [None]:
# remove_outliers_iqr returns a new DataFrame, so assign the result.
# X_train = remove_outliers_iqr(X_train, numeric_columns)
# X_val = remove_outliers_iqr(X_val, numeric_columns)
# X_test = remove_outliers_iqr(X_test, numeric_columns)

### 7.3 Z-Score Outlier Removal

In [None]:
import scipy.stats as stats
def remove_outliers_zscore(df, num_cols, z_thresh=3.0, verbose=True):
    """
    Remove rows with outliers in specified numerical columns using Z-score.

    Parameters:
    - df: Input DataFrame (not modified)
    - num_cols: str or list of column names (e.g., 'age' or ['age', 'income'])
    - z_thresh: Z-score threshold; a row is removed when |z| >= z_thresh in at
                least one of the columns (default: 3.0)
    - verbose: Print number of removed rows (default: True)

    Returns:
    - DataFrame with outliers removed. Index labels are preserved, so a
      separately stored target can be realigned with `y.loc[result.index]`.

    Z-scores use the population standard deviation (ddof=0) with missing
    values ignored. An undefined z-score (a missing value, or any value of a
    constant column whose standard deviation is 0) is not treated as an
    outlier, so such rows are kept.
    """
    df = df.copy()

    if isinstance(num_cols, str):
        num_cols = [num_cols]

    values = df[list(num_cols)]
    # mean()/std() skip NaN; 0/0 on a constant column yields NaN, not an error.
    z_scores = ((values - values.mean()) / values.std(ddof=0)).abs()

    # NaN >= threshold is False, so undefined scores never flag a row.
    is_outlier = (z_scores >= z_thresh).any(axis=1)
    df_clean = df[~is_outlier]

    if verbose:
        print(f"Removed {len(df) - len(df_clean)} rows using Z-score threshold {z_thresh} on columns: {num_cols}")

    return df_clean

In [None]:
# X_train = remove_outliers_zscore(X_train, numeric_columns)
# X_val = remove_outliers_zscore(X_val, numeric_columns)
# X_test = remove_outliers_zscore(X_test, numeric_columns)

### 7.4 Handling with Robust Scaler

It is useful when the data contains outliers that cannot or should not be removed.

In [None]:
from sklearn.preprocessing import RobustScaler

# RobustScaler with default settings (centres on the median, scales by the IQR).
scaler = RobustScaler()

# Scale every numeric column of the training split.
numeric_columns = X_train.select_dtypes(include=['number']).columns.tolist()

# Fit on the training split only; validation/test reuse the fitted scaler.
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_val[numeric_columns] = scaler.transform(X_val[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

## 8. Feature Engineering

Examples: ratio features, date extraction, interaction terms.


### 8.1 Ratio Feature

In [None]:
def add_ratio_feature(df, numerator, denominator, new_name=None):
    """
    Return a copy of df with an extra column holding numerator / denominator.

    Parameters:
    - df: Input DataFrame (not modified)
    - numerator: name of the column used as numerator
    - denominator: name of the column used as denominator
    - new_name: name of the new column; defaults to
                "<numerator>_over_<denominator>"

    Returns:
    - A new DataFrame with the ratio column added. Rows whose denominator is 0
      (or missing) get NaN instead of inf.
    """
    df = df.copy()
    new_name = new_name or f"{numerator}_over_{denominator}"
    # Zero denominators are masked to NaN; no epsilon is added, because it
    # would bias every ratio and turn a denominator of -1e-9 into a division
    # by zero.
    safe_denominator = df[denominator].mask(df[denominator] == 0)
    df[new_name] = df[numerator] / safe_denominator
    return df

In [None]:
# df = add_ratio_feature(df, 'feature1', 'feature2', new_name='feature_ratio')

### 8.2 Date Extraction

In [None]:
def extract_time_features(df, date_cols):
    """
    Extract calendar features from one or more datetime columns.

    Parameters:
    - df: Input DataFrame (not modified); the columns in date_cols must
          already have a datetime dtype (the `.dt` accessor is used).
    - date_cols: column name or list of column names.

    Returns:
    - A new DataFrame with the added columns year, month, day, hour, weekday
      (Monday=0) and is_weekend (weekday >= 5).

    Column naming:
    - one date column: the features are named 'year', 'month', ... as before;
    - several date columns: each feature is prefixed with its source column
      ('<col>_year', ...), so one column's features no longer overwrite
      another's.
    """
    df = df.copy()

    if isinstance(date_cols, str):
        date_cols = [date_cols]
    date_cols = list(date_cols)
    use_prefix = len(date_cols) > 1

    for col in date_cols:
        prefix = f"{col}_" if use_prefix else ""
        stamps = df[col].dt
        weekday = stamps.weekday
        df[f"{prefix}year"] = stamps.year
        df[f"{prefix}month"] = stamps.month
        df[f"{prefix}day"] = stamps.day
        df[f"{prefix}hour"] = stamps.hour
        df[f"{prefix}weekday"] = weekday
        df[f"{prefix}is_weekend"] = weekday >= 5
    return df

In [None]:
# date_cols = ['timestamp', 'date', 'created_at']
# X_train = extract_time_features(X_train, date_cols)
# X_val = extract_time_features(X_val, date_cols)
# X_test = extract_time_features(X_test, date_cols)

## 9. Feature Selection

Selecting the most important features improves model performance and reduces overfitting.

### 9.1 SelectKBest (Univariate Selection)

Selects top K features based on statistical tests.

- chi2 → for non-negative features (e.g., counts, frequencies).
- f_classif → for continuous numerical features in classification problems.
- Useful for quick filtering before model training.

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Number of features to keep.
k = 10
# Score each feature independently with the ANOVA F-test against the target.
selector = SelectKBest(score_func=f_classif, k=k)

# Fit on the training split only, then apply the same column selection to
# validation and test.
# NOTE(review): presumably X_train is fully numeric with no NaN and has at
# least k columns at this point; confirm before running.
X_train_selected = selector.fit_transform(X_train, y_train)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)

# get_support() is a boolean mask over the original columns.
selected_features = X_train.columns[selector.get_support()]
print("Selected features:", selected_features.tolist())

### 9.2 Recursive Feature Elimination (RFE)

Iteratively trains a model and removes the least important features.

- More computationally expensive.
- Works best when you have a moderate number of features (< 100).
- Can be used with any estimator that exposes a coef_ or feature_importances_ attribute

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Define model
# max_iter is raised to 1000 iterations for the solver.
model = LogisticRegression(max_iter=1000)

# Apply RFE
# Refit the model repeatedly, discarding features until 10 remain.
rfe = RFE(estimator=model, n_features_to_select=10)
rfe.fit(X_train, y_train)

# support_ is a boolean mask over the original columns.
selected_features_rfe = X_train.columns[rfe.support_]
print("Selected Features using RFE:")
print(selected_features_rfe)

### 9.3 Feature Importance (Tree-based Models)

Uses built-in feature importance scores from tree-based models (e.g., Random Forest, XGBoost).

- Works only with tree-based models
- Captures non-linear relationships.
- Provides insights into feature relationships and importance.


In [None]:
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np

# Train a Random Forest model
# Fit on the training split (like the SelectKBest and RFE cells) rather than
# on the full X / y, so validation rows do not influence feature selection
# and the model sees the preprocessed training frame.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Get feature importances
importances = rf.feature_importances_
# Column positions sorted from most to least important.
indices = np.argsort(importances)[::-1]
selected_features_rf = X_train.columns[indices[:10]]

print("Top 10 Important Features (Random Forest):")
print(selected_features_rf)

# Plot feature importances
plt.figure(figsize=(8, 5))
plt.barh(selected_features_rf, importances[indices[:10]])
# barh draws the first entry at the bottom; flip so the top feature is on top.
plt.gca().invert_yaxis()
plt.title("Top 10 Feature Importances (Random Forest)")
plt.xlabel("Importance Score")
plt.show()


## 10. Encoding Categorical Variables

### 10.1 One-Hot Encoding
Best for categorical features without ordinal relationship (e.g. color, city).

In [None]:
from sklearn.preprocessing import OneHotEncoder

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and later
# removed; fall back to the old keyword on older versions.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# Fit on the training split only; validation/test reuse the fitted encoder.
color_encoded_train = encoder.fit_transform(X_train[['color']])
color_encoded_val = encoder.transform(X_val[['color']])
color_encoded_test = encoder.transform(X_test[['color']])

# drop original categorical columns
X_train = X_train.drop(columns=['color'])
X_val = X_val.drop(columns=['color'])
X_test = X_test.drop(columns=['color'])

# concatenate the encoded features with the original dataframe
# The encoded arrays carry no index. join() aligns on index labels, so each
# frame's own index is passed explicitly; a fresh 0..n-1 index would not match
# the shuffled labels produced by train_test_split and would fill the new
# columns with NaN.
color_columns = encoder.get_feature_names_out(['color'])
X_train = X_train.join(pd.DataFrame(color_encoded_train, columns=color_columns, index=X_train.index))
X_val = X_val.join(pd.DataFrame(color_encoded_val, columns=color_columns, index=X_val.index))
X_test = X_test.join(pd.DataFrame(color_encoded_test, columns=color_columns, index=X_test.index))

### 10.2 Ordinal Encoding
Best for ordinal categorical features (e.g. education level).

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# The category list fixes the order, so the codes are
# High School=0, Bachelor=1, Master=2, PhD=3.
encoder = OrdinalEncoder(categories=[['High School','Bachelor','Master','PhD']])
# Fit on the training split, then replace the raw column with its encoded version.
X_train['education_encoded'] = encoder.fit_transform(X_train[['education']])
X_train.drop('education', axis=1, inplace=True)

# Validation and test reuse the fitted encoder.
X_val['education_encoded'] = encoder.transform(X_val[['education']])
X_val.drop('education', axis=1, inplace=True)

X_test['education_encoded'] = encoder.transform(X_test[['education']])
X_test.drop('education', axis=1, inplace=True)

### 10.3 Label Encoding
Best for encoding a categorical target column (y_train, y_val and y_test).

In [None]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# 1. fit on y_train
# After this line y_train holds integer codes; the original labels remain
# available in le.classes_ (position = code).
le = LabelEncoder()
y_train = le.fit_transform(y_train)

# 2. safe function
def safe_transform(le, y, unknown_value=-1):
    """
    Encode labels with a fitted encoder without failing on unseen labels.

    Each label in y is replaced by its position in le.classes_; labels that
    are not in le.classes_ are encoded as unknown_value. Returns a NumPy array.
    """
    code_of = dict(zip(le.classes_, range(len(le.classes_))))
    encoded = []
    for label in y:
        encoded.append(code_of.get(label, unknown_value))
    return np.array(encoded)

# 3. (Optional) Check for unknown labels in validation
# This must run BEFORE y_val is encoded: afterwards y_val only holds integer
# codes, and comparing those with the original class labels would flag the
# wrong values.
unknown_in_val = set(y_val) - set(le.classes_)

# 4. Apply safe function on y_val (unknown labels are encoded as -1)
y_val = safe_transform(le, y_val)

if unknown_in_val:
    print(f"کلاس‌های ناشناخته در validation: {unknown_in_val}")

### 10.4 Frequency Encoding

Used for categorical columns with many unique values (high cardinality).

In [None]:
# Relative frequency of each category, learned from the training split only.
freq = X_train['category'].value_counts(normalize=True)

# Categories absent from `freq` (unseen in training, or missing values) map to
# NaN; they are encoded as frequency 0.0 so no NaN is introduced.
X_train['category_freq_enc'] = X_train['category'].map(freq).fillna(0.0)
X_train = X_train.drop(columns=['category'])

X_val['category_freq_enc'] = X_val['category'].map(freq).fillna(0.0)
X_val = X_val.drop(columns=['category'])

X_test['category_freq_enc'] = X_test['category'].map(freq).fillna(0.0)
X_test = X_test.drop(columns=['category'])

### 10.5 MultiLabelBinarizer

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

def preprocess_col(df, col, sep):
    """
    Split a delimited text column into a list of cleaned labels per row.

    Missing values are treated as empty strings, every value is cast to str
    and split on sep; each piece is stripped of surrounding whitespace and
    empty pieces are discarded. Returns a Series of lists.
    """
    def _clean(pieces):
        trimmed = (piece.strip() for piece in pieces)
        return [piece for piece in trimmed if piece]

    as_text = df[col].fillna('').astype(str)
    pieces_per_row = as_text.str.split(sep)
    return pieces_per_row.apply(_clean)

# Settings: the multi-label column, its separator and the prefix used for the
# generated indicator columns.
col = 'genres'
sep = '|'
prefix = 'genre'

# Turn the delimited strings into lists of labels in all three splits.
X_train[col] = preprocess_col(X_train, col, sep)
X_val[col] = preprocess_col(X_val, col, sep)
X_test[col] = preprocess_col(X_test, col, sep)

# Learn the label vocabulary from the training split only.
mlb = MultiLabelBinarizer()
mlb.fit(X_train[col])

# Replace the list column with one 0/1 column per known label. Each frame is
# modified in place (drop(..., inplace=True) plus column assignment), so the
# X_train / X_val / X_test names keep pointing at the updated frames.
for df in [X_train, X_val, X_test]:
    binarized = pd.DataFrame(
        mlb.transform(df[col]),
        columns=[f'{prefix}_{lbl}' for lbl in mlb.classes_],
        index=df.index  # keep row alignment with the source frame
    )
    df.drop(columns=[col], inplace=True)
    df[binarized.columns] = binarized

## 11. Numerical Feature Scaling

Choose scaler depending on data distribution:


### 11.1 StandardScaler

Useful when features follow a **Gaussian distribution**.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the columns listed in numeric_columns (defined in an earlier
# cell). Fit on the training split only; validation/test reuse the fitted
# scaler.
scaler = StandardScaler()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_val[numeric_columns] = scaler.transform(X_val[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

### 11.2 MinMaxScaler

Useful when features have **different scales** but known **min/max ranges**.

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# scikit-learn scalers require 2-D input: select the column with a list
# (double brackets) so a one-column DataFrame is passed instead of a 1-D
# Series, which raises ValueError.
# Fit on the training split only; validation/test reuse the fitted scaler.
X_train[["name_of_num_col"]] = scaler.fit_transform(X_train[["name_of_num_col"]])
X_val[["name_of_num_col"]] = scaler.transform(X_val[["name_of_num_col"]])
X_test[["name_of_num_col"]] = scaler.transform(X_test[["name_of_num_col"]])

### 11.3 RobustScaler

Useful for data with **outliers**.

In [None]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
# scikit-learn scalers require 2-D input: select the column with a list
# (double brackets) so a one-column DataFrame is passed instead of a 1-D
# Series, which raises ValueError.
# Fit on the training split only; validation/test reuse the fitted scaler.
X_train[['name_of_num_col']] = scaler.fit_transform(X_train[['name_of_num_col']])
X_val[['name_of_num_col']] = scaler.transform(X_val[['name_of_num_col']])
X_test[['name_of_num_col']] = scaler.transform(X_test[['name_of_num_col']])

### 11.4 Log Transformation

For **skewed data** to make it more normal.

In [None]:
import numpy as np
# log1p(x) = log(1 + x), so zeros are mapped to 0 instead of -inf.
# The transform is stateless: the same function is applied to every split.
X_train['name_of_num_col'] = np.log1p(X_train['name_of_num_col'])
X_val['name_of_num_col'] = np.log1p(X_val['name_of_num_col'])
X_test['name_of_num_col'] = np.log1p(X_test['name_of_num_col'])

## 12. Model Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Baseline: fit a random forest on the training split and predict the
# held-out validation split.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, y_pred))
# Confusion matrix first, then the per-class classification report.
for report in (confusion_matrix, classification_report):
    print(report(y_val, y_pred))


### 12.1 Supervised Learning

- **Logistic Regression**  
  Description: Linear model for binary or multiclass classification; fast, interpretable, assumes linear boundaries.  
  Usage:  
  ```python
  from sklearn.linear_model import LogisticRegression
  model = LogisticRegression(random_state=42)
  model.fit(X_train, y_train)
  ```

- **Decision Tree**  
  Description: Tree-based model capturing non-linear relationships; easy to visualize, prone to overfitting.  
  Usage:  
  ```python
  from sklearn.tree import DecisionTreeClassifier
  model = DecisionTreeClassifier(random_state=42, max_depth=5)
  model.fit(X_train, y_train)
  ```

- **Random Forest**  
  Description: Ensemble of decision trees (bagging); robust, reduces overfitting, good default choice.  
  Usage:  
  ```python
  from sklearn.ensemble import RandomForestClassifier
  model = RandomForestClassifier(random_state=42, n_estimators=100)
  model.fit(X_train, y_train)
  ```

- **Gradient Boosting (sklearn)**  
  Description: Sequential tree boosting; high accuracy, needs tuning, captures complex patterns.  
  Usage:  
  ```python
  from sklearn.ensemble import GradientBoostingClassifier
  model = GradientBoostingClassifier(random_state=42, n_estimators=100)
  model.fit(X_train, y_train)
  ```

- **XGBoost**  
  Description: Optimized gradient boosting; fast, scalable, excels on tabular data, requires tuning.  
  Usage:  
  ```python
  from xgboost import XGBClassifier
  model = XGBClassifier(random_state=42, n_estimators=100, eval_metric='logloss')
  model.fit(X_train, y_train)
  ```

- **Support Vector Machine (SVM)**  
  Description: Effective in high-dimensional spaces with kernel trick for non-linear boundaries; needs scaling.  
  Usage:  
  ```python
  from sklearn.svm import SVC
  model = SVC(kernel='rbf', C=1.0, probability=True)
  model.fit(X_train, y_train)
  ```

- **K-Nearest Neighbors (KNN)**  
  Description: Non-parametric, instance-based; simple, slow at prediction, sensitive to scaling.  
  Usage:  
  ```python
  from sklearn.neighbors import KNeighborsClassifier
  model = KNeighborsClassifier(n_neighbors=5)
  model.fit(X_train, y_train)
  ```

- **Naive Bayes (Gaussian)**  
  Description: Probabilistic, assumes feature independence and Gaussian-distributed continuous features; fast, works well for small datasets (for text/count features use `MultinomialNB` instead).  
  Usage:  
  ```python
  from sklearn.naive_bayes import GaussianNB
  model = GaussianNB()
  model.fit(X_train, y_train)
  ```

- **Multilayer Perceptron (MLP)**  
  Description: Feedforward neural network for non-linear mappings; needs tuning and scaling, computationally intensive.  
  Usage:  
  ```python
  from sklearn.neural_network import MLPClassifier
  model = MLPClassifier(hidden_layer_sizes=(100,), random_state=42, max_iter=300)
  model.fit(X_train, y_train)
  ```

- **Recurrent Neural Network (RNN)**  
  Description: Neural network for sequential data; captures temporal dependencies, suitable for time series or text.  
  Usage (Keras):  
  ```python
  from tensorflow.keras.models import Sequential
  from tensorflow.keras.layers import SimpleRNN, Dense
  model = Sequential([
      SimpleRNN(50, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu'),
      Dense(1, activation='sigmoid')
  ])
  model.compile(optimizer='adam', loss='binary_crossentropy')
  model.fit(X_train, y_train, epochs=10, batch_size=32)
  ```

- **Long Short-Term Memory (LSTM)**  
  Description: Advanced RNN variant; handles long-term dependencies, ideal for complex sequential data.  
  Usage (Keras):  
  ```python
  from tensorflow.keras.models import Sequential
  from tensorflow.keras.layers import LSTM, Dense
  model = Sequential([
      LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu'),
      Dense(1, activation='sigmoid')
  ])
  model.compile(optimizer='adam', loss='binary_crossentropy')
  model.fit(X_train, y_train, epochs=10, batch_size=32)
  ```

- **Linear Regression / Ridge / Lasso** (Regression)  
  Description: Linear models for continuous targets; Ridge/Lasso add regularization to prevent overfitting.  
  Usage:  
  ```python
  from sklearn.linear_model import Ridge
  model = Ridge(alpha=1.0)
  model.fit(X_train, y_train)
  ```

- **Support Vector Regressor (SVR)**  
  Description: Non-linear regression with kernels; robust but sensitive to scaling.  
  Usage:  
  ```python
  from sklearn.svm import SVR
  model = SVR(kernel='rbf', C=1.0)
  model.fit(X_train, y_train)
  ```

### 12.2 Unsupervised Learning



- **K-Means (Clustering)**  
  Description: Partitions data into k clusters by centroid assignment; fast, assumes spherical clusters.  
  Usage:  
  ```python
  from sklearn.cluster import KMeans
  km = KMeans(n_clusters=3, random_state=42).fit(X)
  labels = km.labels_
  ```

- **DBSCAN (Clustering)**  
  Description: Density-based clustering; finds arbitrary-shaped clusters, handles noise, no need to specify k.  
  Usage:  
  ```python
  from sklearn.cluster import DBSCAN
  db = DBSCAN(eps=0.5, min_samples=5).fit(X)
  labels = db.labels_
  ```

- **Isolation Forest (Anomaly Detection)**  
  Description: Tree-based anomaly detection; isolates outliers by random splits, effective for high-dimensional data.  
  Usage:  
  ```python
  from sklearn.ensemble import IsolationForest
  iso = IsolationForest(contamination=0.1, random_state=42).fit(X)
  anomalies = iso.predict(X)  # -1 for outliers, 1 for inliers
  ```

- **Autoencoder (Dimensionality Reduction / Anomaly Detection)**  
  Description: Neural network for unsupervised feature learning; compresses data or detects anomalies via reconstruction error.  
  Usage (Keras):  
  ```python
  from tensorflow.keras.models import Sequential
  from tensorflow.keras.layers import Dense
  model = Sequential([
      Dense(32, activation='relu', input_shape=(X.shape[1],)),
      Dense(16, activation='relu'),
      Dense(32, activation='relu'),
      Dense(X.shape[1], activation='sigmoid')
  ])
  model.compile(optimizer='adam', loss='mse')
  model.fit(X, X, epochs=10, batch_size=32)
  ```

- **PCA (Dimensionality Reduction)**  
  Description: Linear projection to principal components; used for compression or visualization.  
  Usage:  
  ```python
  from sklearn.decomposition import PCA
  pca = PCA(n_components=2).fit(X)
  X_reduced = pca.transform(X)
  ```

Notes: choose models based on problem type (classification/regression), data size, feature scaling, interpretability needs, and compute budget. Always validate with cross-validation and tune hyperparameters.

## 13. Model Evaluation Metrics

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

### 13.1 Regression Metrics

- **Mean Squared Error (MSE)**  
  Description: Average of squared differences between predictions and actual values; penalizes larger errors heavily.  
  Usage:  
  ```python
  mse = mean_squared_error(y_true, y_pred)
  print(f"MSE: {mse:.4f}")
  ```

- **Root Mean Squared Error (RMSE)**  
  Description: Square root of MSE; interpretable in the same units as the target.  
  Usage:  
  ```python
  rmse = np.sqrt(mean_squared_error(y_true, y_pred))
  print(f"RMSE: {rmse:.4f}")
  ```

- **Mean Absolute Error (MAE)**  
  Description: Average of absolute differences; less sensitive to outliers than MSE.  
  Usage:  
  ```python
  mae = mean_absolute_error(y_true, y_pred)
  print(f"MAE: {mae:.4f}")
  ```

- **R² Score (Coefficient of Determination)**  
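  Description: Proportion of variance explained by the model; the best possible value is 1 (higher is better), 0 means no better than always predicting the mean, and it can be negative for models that are worse than that.  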
  Usage:  
  ```python
  r2 = r2_score(y_true, y_pred)
  print(f"R² Score: {r2:.4f}")
  ```

- **Residual Plot**  
  Description: Visualizes prediction errors (residuals) to assess model fit; ideally, residuals are randomly scattered around 0.  
  Usage:  
  ```python
  residuals = y_true - y_pred
  plt.figure(figsize=(8, 5))
  plt.scatter(y_pred, residuals, alpha=0.5)
  plt.axhline(y=0, color='r', linestyle='--')
  plt.xlabel('Predicted Values')
  plt.ylabel('Residuals')
  plt.title('Residual Plot')
  plt.show()
  ```

### 13.2 Classification Metrics

- **Accuracy**  
  Description: Proportion of correct predictions; good for balanced datasets.  
  Usage:  
  ```python
  accuracy = accuracy_score(y_true, y_pred)
  print(f"Accuracy: {accuracy:.4f}")
  ```

- **Precision**  
  Description: Proportion of true positives among positive predictions; useful when false positives are costly.  
  Usage:  
  ```python
  precision = precision_score(y_true, y_pred, average='weighted')  # Use 'binary' for binary classification
  print(f"Precision: {precision:.4f}")
  ```

- **Recall (Sensitivity)**  
  Description: Proportion of true positives identified; critical when false negatives are costly.  
  Usage:  
  ```python
  recall = recall_score(y_true, y_pred, average='weighted')  # Use 'binary' for binary classification
  print(f"Recall: {recall:.4f}")
  ```

- **F1 Score**  
  Description: Harmonic mean of precision and recall; balances both for imbalanced datasets.  
  Usage:  
  ```python
  f1 = f1_score(y_true, y_pred, average='weighted')  # Use 'binary' for binary classification
  print(f"F1 Score: {f1:.4f}")
  ```

- **ROC-AUC Score**  
  Description: Area under the ROC curve; measures model's ability to distinguish classes (binary/multiclass). Requires probability scores.  
  Usage:  
  ```python
  roc_auc = roc_auc_score(y_true, y_pred_proba, multi_class='ovr')  # Use y_pred_proba for probabilities
  print(f"ROC-AUC: {roc_auc:.4f}")
  ```

- **Confusion Matrix**  
  Description: Table showing true vs. predicted class counts; helps visualize classification performance.  
  Usage:  
  ```python
  cm = confusion_matrix(y_true, y_pred)
  plt.figure(figsize=(6, 4))
  sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
  plt.xlabel('Predicted')
  plt.ylabel('True')
  plt.title('Confusion Matrix')
  plt.show()
  ```

## 14. Handling Imbalanced Data

In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, TomekLinks, EditedNearestNeighbours
import matplotlib.pyplot as plt
import seaborn as sns

### 14.1 Oversampling Techniques

- **Random Oversampling**  
  Description: Randomly duplicates samples from the minority class to balance the dataset; simple but risks overfitting.  
  Usage:  
  ```python
  ros = RandomOverSampler(random_state=42)
  X_resampled, y_resampled = ros.fit_resample(X, y)
  print("Class distribution after Random Oversampling:", pd.Series(y_resampled).value_counts())
  ```

- **SMOTE (Synthetic Minority Oversampling Technique)**  
  Description: Generates synthetic samples for the minority class by interpolating between existing samples; reduces overfitting compared to random oversampling.  
  Usage:  
  ```python
  smote = SMOTE(random_state=42)
  X_resampled, y_resampled = smote.fit_resample(X, y)
  print("Class distribution after SMOTE:", pd.Series(y_resampled).value_counts())
  ```

- **ADASYN (Adaptive Synthetic Sampling)**  
  Description: Similar to SMOTE but focuses synthetic samples on harder-to-classify regions (near class boundaries); good for complex datasets.  
  Usage:  
  ```python
  adasyn = ADASYN(random_state=42)
  X_resampled, y_resampled = adasyn.fit_resample(X, y)
  print("Class distribution after ADASYN:", pd.Series(y_resampled).value_counts())
  ```

### 14.2 Undersampling Techniques

- **Random Undersampling**  
  Description: Randomly removes samples from the majority class to balance the dataset; simple but may discard valuable data.  
  Usage:  
  ```python
  rus = RandomUnderSampler(random_state=42)
  X_resampled, y_resampled = rus.fit_resample(X, y)
  print("Class distribution after Random Undersampling:", pd.Series(y_resampled).value_counts())
  ```

- **Cluster Centroids**  
  Description: Replaces majority class samples with centroids of clusters; preserves structure but may oversimplify data.  
  Usage:  
  ```python
  cc = ClusterCentroids(random_state=42)
  X_resampled, y_resampled = cc.fit_resample(X, y)
  print("Class distribution after Cluster Centroids:", pd.Series(y_resampled).value_counts())
  ```

- **Tomek Links**  
  Description: Removes majority class samples that are closest to minority class samples (links); improves class separation.  
  Usage:  
  ```python
  tl = TomekLinks()
  X_resampled, y_resampled = tl.fit_resample(X, y)
  print("Class distribution after Tomek Links:", pd.Series(y_resampled).value_counts())
  ```

- **Edited Nearest Neighbors (ENN)**  
  Description: Removes majority class samples misclassified by their k-nearest neighbors; cleans noisy data.  
  Usage:  
  ```python
  enn = EditedNearestNeighbours()
  X_resampled, y_resampled = enn.fit_resample(X, y)
  print("Class distribution after ENN:", pd.Series(y_resampled).value_counts())
  ```