# TABULAR DATA PREPROCESSING

## 1. Imports and settings

Import required libraries and configure display options.


In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Display settings
# Let pandas print up to 200 columns before truncating wide DataFrames.
pd.set_option('display.max_columns', 200)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 2. Load dataset

Load a CSV file into a DataFrame. Change DATA_PATH to your file path.


In [None]:
# Path of the CSV file to load; change DATA_PATH to point at your own dataset.
DATA_PATH = "tabular_data.csv"
df = pd.read_csv(DATA_PATH)

## 3. Initial inspection

In [None]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Summary statistics (count, unique, top, freq) for the object-typed columns.
df.describe(include='object')

In [None]:
#df["col"].value_counts()

In [None]:
#df["col"].unique()

In [None]:
# Number of missing (NaN/None) values per column.
df.isna().sum()

In [None]:
# df['date'] = pd.to_datetime(df['date'], errors='coerce')

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

## PreProcess Date data

In [None]:
def parse_datetime(df, datetime_column, format=None):
    """Return a copy of ``df`` indexed by its parsed datetime column.

    Parameters:
    - df: DataFrame containing the data.
    - datetime_column: Name of the column holding the date/time values.
    - format: Optional strftime-style format passed to ``pd.to_datetime``.

    Returns:
    - A new DataFrame whose index is the parsed column. Values that cannot
      be parsed become ``NaT`` (``errors='coerce'``). The input is not modified.
    """
    parsed = pd.to_datetime(df[datetime_column], format=format, errors='coerce')
    result = df.copy()
    result[datetime_column] = parsed
    return result.set_index(datetime_column)

In [None]:
# # Example
# df = parse_datetime(df, 'timestamp', format='%Y-%m-%d %H:%M:%S')

In [None]:
def extract_time_features(df):
    """Return a copy of ``df`` with calendar features derived from its index.

    The index must expose datetime attributes (e.g. a ``DatetimeIndex``).
    Adds ``year``, ``month``, ``day``, ``hour``, ``weekday`` (Monday=0) and
    ``is_weekend`` (True when ``weekday >= 5``). The input is not modified.
    """
    stamps = df.index
    day_of_week = stamps.weekday
    return df.assign(
        year=stamps.year,
        month=stamps.month,
        day=stamps.day,
        hour=stamps.hour,
        weekday=day_of_week,
        is_weekend=day_of_week >= 5,
    )

In [None]:
# df = extract_time_features(df)

In [None]:
def impute_time_series(df, numeric_columns, method='ffill', n_neighbors=3):
    """Impute missing values in time series data.

    Parameters:
    - df: DataFrame containing the data (not modified).
    - numeric_columns: List of numeric column names to impute.
    - method: 'ffill' (carry last value forward), 'bfill' (carry next value
      backward) or 'knn' (scikit-learn ``KNNImputer``).
    - n_neighbors: Number of neighbours for the 'knn' method.

    Returns:
    - A copy of ``df`` with the selected columns imputed.

    Raises:
    - ValueError: If ``method`` is not one of the supported names.
    """
    df = df.copy()
    if method == 'ffill':
        # DataFrame.ffill()/bfill() replace the deprecated fillna(method=...).
        df[numeric_columns] = df[numeric_columns].ffill()
    elif method == 'bfill':
        df[numeric_columns] = df[numeric_columns].bfill()
    elif method == 'knn':
        # Imported here so the function works regardless of cell execution order.
        from sklearn.impute import KNNImputer

        imputer = KNNImputer(n_neighbors=n_neighbors)
        df[numeric_columns] = imputer.fit_transform(df[numeric_columns])
    else:
        # A typo in `method` used to return the data silently unchanged.
        raise ValueError(
            f"Unknown method '{method}'; expected 'ffill', 'bfill' or 'knn'."
        )
    return df

In [None]:
# # Example
# df = impute_time_series(df, numeric_columns=['value'], method='ffill')

## 4. Exploratory Data Analysis (EDA)

Visualize numerical and categorical distributions, boxplots for outliers, and correlation heatmap.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Column names grouped by dtype: numeric columns vs. object/category/bool columns.
# Both lists are computed from `df` as it is at this point in the notebook.
numeric_columns = df.select_dtypes(include=np.number).columns
categorical_columns = df.select_dtypes(include=['object','category','bool']).columns

### 4.1 Numerical features distribution

In [2]:
def plot_numeric_distributions(df, num_cols, bins=50):
    """Show one histogram (with KDE overlay) per column in ``num_cols``.

    Missing values are dropped before plotting; ``bins`` sets the number of
    histogram bins. Each column gets its own figure.
    """
    for column in num_cols:
        plt.figure(figsize=(8, 4))
        observed = df[column].dropna()
        sns.histplot(observed, kde=True, bins=bins)
        plt.title(f'Distribution: {column}')
        plt.show()

In [3]:
# plot_numeric_distributions(df, numeric_columns)

### 4.2 Handling Skewed Numerical Features

Skewed numerical features can be detected by measuring their skewness and, where needed, corrected with a transformation. Skewness is usually considered significant when it is greater than 1 (right-skewed) or less than -1 (left-skewed). Common transformations include:

- **Log Transformation** (useful for right-skewed data with positive values): `np.log1p(df[col])` to handle zeros.
- **Square Root Transformation** (for moderate right-skew): `np.sqrt(df[col])`.
- **Box-Cox Transformation** (requires positive values; handles both skew directions): From `scipy.stats.boxcox`.
- **Yeo-Johnson Transformation** (handles negative/zero values): From `sklearn.preprocessing.PowerTransformer`.

In [None]:
# import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
# from scipy.stats import boxcox, skew
# from sklearn.preprocessing import PowerTransformer

In [None]:
def apply_log_transformation(df, num_cols, skew_threshold=1.0, plot=True):
    """
    Applies log transformation (``np.log1p``) to skewed numerical columns.

    Parameters:
    - df: DataFrame containing the data (not modified).
    - num_cols: List of numerical column names.
    - skew_threshold: Threshold for considering a distribution skewed (default: 1.0).
    - plot: If True (default), show before/after histograms for each column.

    Returns:
    - Transformed DataFrame. Columns whose absolute skewness is within the
      threshold, or that contain negative values, are left unchanged.
    """
    # Imported locally: the notebook only lists this import in a commented-out cell.
    from scipy.stats import skew

    transformed_df = df.copy()
    for col in num_cols:
        col_skew = skew(transformed_df[col].dropna())
        print(f"Skewness of {col}: {col_skew:.2f}")

        if abs(col_skew) > skew_threshold:
            # log1p is undefined below -1 and distorts values in (-1, 0), so skip.
            if (transformed_df[col] < 0).any():
                print(f"Warning: {col} has negative values; skipping log transform.")
                continue
            print(f"Applying log transformation to {col}...")
            transformed_df[col] = np.log1p(transformed_df[col])

            # Recheck skewness
            new_skew = skew(transformed_df[col].dropna())
            print(f"New skewness of {col}: {new_skew:.2f}")

        if plot:
            # Plot before and after
            fig, axes = plt.subplots(1, 2, figsize=(12, 4))
            sns.histplot(df[col].dropna(), kde=True, ax=axes[0])
            axes[0].set_title(f'Original Distribution: {col}')
            sns.histplot(transformed_df[col].dropna(), kde=True, ax=axes[1])
            axes[1].set_title(f'Log Transformed Distribution: {col}')
            plt.show()

    return transformed_df

In [None]:
def apply_sqrt_transformation(df, num_cols, skew_threshold=1.0, plot=True):
    """
    Applies square root transformation to skewed numerical columns.

    Parameters:
    - df: DataFrame containing the data (not modified).
    - num_cols: List of numerical column names.
    - skew_threshold: Threshold for considering a distribution skewed (default: 1.0).
    - plot: If True (default), show before/after histograms for each column.

    Returns:
    - Transformed DataFrame. Columns whose absolute skewness is within the
      threshold, or that contain negative values, are left unchanged.
    """
    # Imported locally: the notebook only lists this import in a commented-out cell.
    from scipy.stats import skew

    transformed_df = df.copy()
    for col in num_cols:
        col_skew = skew(transformed_df[col].dropna())
        print(f"Skewness of {col}: {col_skew:.2f}")

        if abs(col_skew) > skew_threshold:
            # The square root of a negative number is undefined for real data.
            if (transformed_df[col] < 0).any():
                print(f"Warning: {col} has negative values; skipping sqrt transform.")
                continue
            print(f"Applying square root transformation to {col}...")
            transformed_df[col] = np.sqrt(transformed_df[col])

            # Recheck skewness
            new_skew = skew(transformed_df[col].dropna())
            print(f"New skewness of {col}: {new_skew:.2f}")

        if plot:
            # Plot before and after
            fig, axes = plt.subplots(1, 2, figsize=(12, 4))
            sns.histplot(df[col].dropna(), kde=True, ax=axes[0])
            axes[0].set_title(f'Original Distribution: {col}')
            sns.histplot(transformed_df[col].dropna(), kde=True, ax=axes[1])
            axes[1].set_title(f'Square Root Transformed Distribution: {col}')
            plt.show()

    return transformed_df

In [None]:
def apply_boxcox_transformation(df, num_cols, skew_threshold=1.0, plot=True):
    """
    Applies Box-Cox transformation to skewed numerical columns.

    Parameters:
    - df: DataFrame containing the data (not modified).
    - num_cols: List of numerical column names.
    - skew_threshold: Threshold for considering a distribution skewed (default: 1.0).
    - plot: If True (default), show before/after histograms for each column.

    Returns:
    - Transformed DataFrame. Columns whose absolute skewness is within the
      threshold, or that contain non-positive values, are left unchanged.
      Missing values stay missing; only observed values are transformed.
    """
    # Imported locally: the notebook only lists these imports in a commented-out cell.
    from scipy.stats import boxcox, skew

    transformed_df = df.copy()
    for col in num_cols:
        col_skew = skew(transformed_df[col].dropna())
        print(f"Skewness of {col}: {col_skew:.2f}")

        if abs(col_skew) > skew_threshold:
            if (transformed_df[col] <= 0).any():
                print(f"Warning: {col} has non-positive values; skipping Box-Cox.")
                continue
            print(f"Applying Box-Cox transformation to {col}...")
            # Fit on observed values only: NaNs would poison the lambda estimate.
            # No epsilon shift is needed because positivity was checked above.
            observed = transformed_df[col].notna()
            fitted, _ = boxcox(transformed_df.loc[observed, col].to_numpy(dtype=float))
            transformed_df[col] = transformed_df[col].astype(float)
            transformed_df.loc[observed, col] = fitted

            # Recheck skewness
            new_skew = skew(transformed_df[col].dropna())
            print(f"New skewness of {col}: {new_skew:.2f}")

        if plot:
            # Plot before and after
            fig, axes = plt.subplots(1, 2, figsize=(12, 4))
            sns.histplot(df[col].dropna(), kde=True, ax=axes[0])
            axes[0].set_title(f'Original Distribution: {col}')
            sns.histplot(transformed_df[col].dropna(), kde=True, ax=axes[1])
            axes[1].set_title(f'Box-Cox Transformed Distribution: {col}')
            plt.show()

    return transformed_df

In [None]:
def apply_yeojohnson_transformation(df, num_cols, skew_threshold=1.0, plot=True):
    """
    Applies Yeo-Johnson transformation to skewed numerical columns.

    Parameters:
    - df: DataFrame containing the data (not modified).
    - num_cols: List of numerical column names.
    - skew_threshold: Threshold for considering a distribution skewed (default: 1.0).
    - plot: If True (default), show before/after histograms for each column.

    Returns:
    - Transformed DataFrame. Columns whose absolute skewness is within the
      threshold are left unchanged.
    """
    # Imported locally: the notebook only lists this import in a commented-out cell.
    from scipy.stats import skew

    transformed_df = df.copy()
    for col in num_cols:
        col_skew = skew(transformed_df[col].dropna())
        print(f"Skewness of {col}: {col_skew:.2f}")

        if abs(col_skew) > skew_threshold:
            # Imported lazily so scikit-learn is only needed when a column is transformed.
            from sklearn.preprocessing import PowerTransformer

            print(f"Applying Yeo-Johnson transformation to {col}...")
            # A fresh transformer per column: each column gets its own fitted lambda.
            pt = PowerTransformer(method='yeo-johnson', standardize=False)
            transformed_df[col] = pt.fit_transform(transformed_df[[col]]).flatten()

            # Recheck skewness
            new_skew = skew(transformed_df[col].dropna())
            print(f"New skewness of {col}: {new_skew:.2f}")

        if plot:
            # Plot before and after
            fig, axes = plt.subplots(1, 2, figsize=(12, 4))
            sns.histplot(df[col].dropna(), kde=True, ax=axes[0])
            axes[0].set_title(f'Original Distribution: {col}')
            sns.histplot(transformed_df[col].dropna(), kde=True, ax=axes[1])
            axes[1].set_title(f'Yeo-Johnson Transformed Distribution: {col}')
            plt.show()

    return transformed_df

### 4.3 Categorical features count

In [None]:
def plot_categorical_counts(df, cat_cols, top_n=20):
    """Show a horizontal count plot per categorical column.

    Only the ``top_n`` most frequent categories of each column are drawn,
    ordered from most to least frequent. Each column gets its own figure.
    """
    for column in cat_cols:
        plt.figure(figsize=(6,4))
        series = df[column]
        most_common = series.value_counts().index[:top_n]
        sns.countplot(y=series, order=most_common)
        plt.title(f'Counts: {column}')
        plt.show()

In [None]:
# plot_categorical_counts(df, categorical_columns)

### 4.4 Correlation heatmap

In [None]:
def correlation_heatmap(df, num_cols):
    """Plot an annotated heatmap of the pairwise correlations of ``num_cols``.

    Correlations come from ``DataFrame.corr()`` with its default settings and
    are printed on the cells with two decimals.
    """
    selected = df[num_cols]
    corr_matrix = selected.corr()
    plt.figure(figsize=(10,8))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()

In [None]:
# correlation_heatmap(df, numeric_columns)

### 4.5 Boxplot to check outliers

In [None]:
def plot_boxplots(df, num_cols):
    """Show one horizontal boxplot per numeric column to eyeball outliers.

    Missing values are dropped before plotting; each column gets its own figure.
    """
    for column in num_cols:
        plt.figure(figsize=(6,2))
        observed = df[column].dropna()
        sns.boxplot(x=observed)
        plt.title(f'Boxplot: {column}')
        plt.show()

In [None]:
# plot_boxplots(df, numeric_columns)

## 5. Missing Value Handling


### 5.1 Detecting Missing Value

In [None]:
import missingno as msno
import matplotlib.pyplot as plt

def msno_show(df):
    """Display the missingno matrix plot followed by the missingno bar plot."""
    for draw in (msno.matrix, msno.bar):
        draw(df)
        plt.show()



In [None]:
#msno_show(df)

In [None]:
def detect_hidden_missing_data(df, suspicious_values=None, target_column=None):
    """
    Summarise missing data, counting both real NaN/None values and
    user-supplied placeholder values (e.g. -999, 'unknown', '').

    Parameters:
    - df: DataFrame containing the data (not modified).
    - suspicious_values: List of values to treat as missing. Defaults to None,
                         in which case only NaN/None are counted.
    - target_column: Name of a single column to inspect. If None (default),
                     every column is inspected.

    Returns:
    - A DataFrame with 'Missing Count' and 'Missing Percentage (%)' for each
      column that has missing data, sorted by count (descending). An empty
      DataFrame is returned when ``target_column`` does not exist.
    """
    placeholders = [] if suspicious_values is None else suspicious_values

    # Work on a copy so the caller's frame is never touched.
    working = df.copy()

    if target_column is None:
        print("Checking missing data for all columns")
    else:
        if target_column not in working.columns:
            print(f"Error: Column '{target_column}' not found in DataFrame.")
            return pd.DataFrame()
        # Narrow to a one-column frame so the rest of the code is uniform.
        working = working[[target_column]]
        print(f"Checking missing data for column: {target_column}")

    # Turn every placeholder into a real NaN so isnull() catches it.
    for placeholder in placeholders:
        working = working.replace(placeholder, np.nan)

    null_mask = working.isnull()
    missing_counts = null_mask.sum()
    missing_percentages = (missing_counts / len(working)) * 100

    summary = pd.DataFrame({
        'Missing Count': missing_counts,
        'Missing Percentage (%)': missing_percentages.round(2)
    })
    summary = summary.sort_values(by='Missing Count', ascending=False)
    # Keep only the columns that actually have something missing.
    summary = summary[summary['Missing Count'] > 0]

    if summary.empty:
        print("No missing data detected in the selected column(s).")
    else:
        print("Missing Data Summary:")
        print(summary)

    # The heatmap is only worth drawing when there is at least one gap.
    if missing_counts.sum() > 0:
        scope = "for " + target_column if target_column else "for All Columns"
        plt.figure(figsize=(10, 6))
        sns.heatmap(null_mask, cbar=False, cmap='viridis', yticklabels=False)
        plt.title(f'Missing Data Heatmap {scope}')
        plt.show()

    return summary

In [None]:
# # Assuming 'df' is your DataFrame
# suspicious = [-999, 'unknown', '', 'N/A']  # Customize based on your data

# # Check for a specific column
# missing_summary = detect_hidden_missing_data(df, suspicious_values=suspicious, target_column='column_name')

# # Check for all columns
# missing_summary_all = detect_hidden_missing_data(df, suspicious_values=suspicious)

### 5.2 Drop Rows with Missing Values (in Any Feature)

In [None]:
def drop_missing(df):
    """Return a new DataFrame without the rows that have any missing value."""
    complete_rows = df.dropna(axis=0, how='any')
    return complete_rows

In [1]:
# df = drop_missing(df)

### 5.3 Drop Rows with Missing Values (in Specific Features)

In [None]:
# Keep only the rows where 'column_name' is not missing.
# NOTE(review): 'column_name' looks like a placeholder; replace it with a real column before running.
df = df.dropna(subset=['column_name'])

### 5.4 Drop Columns
Use drop_columns when you want to drop entire useless columns from the DataFrame.

In [None]:
def drop_columns(df, columns):
    """Return a new DataFrame without the given column(s); ``df`` is left intact."""
    trimmed = df.drop(labels=columns, axis=1)
    return trimmed

In [None]:
# drop_columns(df, ['col1', 'col2'])

### 5.5 Drop Rows
Use drop_rows when the percentage of missing data is very low.

In [None]:
def drop_rows(df, rows):
    """Return a new DataFrame without the rows whose index labels are in ``rows``."""
    remaining = df.drop(labels=rows, axis=0)
    return remaining


In [None]:
# df = drop_rows(df , ['row_index1' , 'row_index_2'])

### 5.6 Statistical Imputation
Use fill_with_statistical for numerical features when distribution is stable.
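> Note: Split your dataset **before** applying statistical imputation, and compute the fill statistics on the training set only; otherwise information from the validation/test sets leaks into training (data leakage).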

In [None]:
def fill_with_statistical(df, num_col, strategy="mean"):
    """Fill missing values with the mean, median or mode.

    Parameters:
    - df: DataFrame to fill. It is modified in place and also returned.
    - num_col: Column name (or list of column names) to fill.
    - strategy: "mean", "median" or "mode".

    Returns:
    - The same DataFrame with the selected column(s) filled.

    Raises:
    - ValueError: If ``strategy`` is not one of the supported names.
    """
    if strategy == "mean":
        fill_value = df[num_col].mean()
    elif strategy == "median":
        fill_value = df[num_col].median()
    elif strategy == "mode":
        modes = df[num_col].mode()
        # mode() is empty when every value is missing: there is nothing to fill with.
        if modes.empty:
            return df
        fill_value = modes.iloc[0]
    else:
        # A misspelled strategy used to return the data silently unchanged.
        raise ValueError(
            f"Unknown strategy '{strategy}'; expected 'mean', 'median' or 'mode'."
        )
    df[num_col] = df[num_col].fillna(fill_value)
    return df

In [None]:
# df = fill_with_statistical(df, num_col=['num_col1', 'num_col2'], strategy="mean")

### 5.7 Categorical Imputation
Use fill_categorical for categorical features.
> Note: Split your dataset **before** applying categorical imputation, and compute the most frequent value on the training set only, to avoid data leakage.

In [1]:
def fill_categorical(df, cat_cols):
    """Fill missing categorical values with each column's mode.

    Parameters:
    - df: DataFrame to fill. It is modified in place and also returned.
    - cat_cols: Iterable of categorical column names.

    Returns:
    - The same DataFrame. Columns with no observed value (all missing) are
      left unchanged because they have no mode to fill with.
    """
    for col in cat_cols:
        modes = df[col].mode()
        # An all-missing column has an empty mode; indexing it would raise.
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])
    return df

In [None]:
# df = fill_categorical(df, cat_cols=['cat_col1', 'cat_col2'])

### 5.8 Forward and Backward Fill
Use forward fill (carry the previous value forward) or backward fill (carry the next value backward) for time-series or otherwise ordered data.

In [None]:
def fill_with_ffill_bfill(df, columns, method="ffill"):
    """Fill using forward fill or backward fill.

    Parameters:
    - df: DataFrame to fill. It is modified in place and also returned.
    - columns: Column name(s) to fill.
    - method: "ffill"/"pad" (carry last value forward) or
      "bfill"/"backfill" (carry next value backward).

    Returns:
    - The same DataFrame with the selected column(s) filled.

    Raises:
    - ValueError: If ``method`` is not a supported fill direction.
    """
    # fillna(method=...) is deprecated in pandas; call ffill()/bfill() directly.
    if method in ("ffill", "pad"):
        df[columns] = df[columns].ffill()
    elif method in ("bfill", "backfill"):
        df[columns] = df[columns].bfill()
    else:
        raise ValueError(
            f"Unknown method '{method}'; expected 'ffill' or 'bfill'."
        )
    return df

In [None]:
# df = fill_with_ffill_bfill(df, columns=['num_col1', 'num_col2'], method="ffill")

### 5.9 Imputation Techniques
Use KNN imputation when you expect relationships between features.
> Note: Split your dataset **before** applying KNN imputation, and fit the imputer on the training set only, to avoid data leakage.

In [2]:
from sklearn.impute import KNNImputer
def fill_with_knn(df, numeric_columns, n_neighbors=3):
    """Impute missing values in ``numeric_columns`` with scikit-learn's KNNImputer.

    The imputer is fitted on the same columns it fills. ``df`` is modified
    in place and also returned.
    """
    knn = KNNImputer(n_neighbors=n_neighbors)
    filled = knn.fit_transform(df[numeric_columns])
    df[numeric_columns] = filled
    return df

In [None]:
# df = fill_with_knn(df, numeric_columns=['num_col1', 'num_col2'])

In [None]:
def fill_with_iterative_imputer(df, numeric_columns, estimator=None, max_iter=10, random_state=42, verbose=False):
    """
    Impute missing values in numerical columns using IterativeImputer.

    Parameters:
    - df: DataFrame containing the data (not modified).
    - numeric_columns: List of numerical column names to impute.
    - estimator: Estimator for imputation. If None (default), a fresh
      LinearRegression() is created for each call.
    - max_iter: Maximum number of imputation iterations (default: 10).
    - random_state: Random seed for reproducibility (default: 42).
    - verbose: If True, prints imputation statistics (default: False).

    Returns:
    - Imputed DataFrame.
    """
    # IterativeImputer is experimental in scikit-learn: the enable_* import
    # must run before it can be imported from sklearn.impute.
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    if estimator is None:
        # Built per call, so no estimator instance is shared between calls and
        # defining this function does not require LinearRegression to be imported.
        from sklearn.linear_model import LinearRegression

        estimator = LinearRegression()

    # Create a copy to avoid modifying the original
    imputed_df = df.copy()

    # Count missing values before imputation
    missing_before = imputed_df[numeric_columns].isnull().sum()

    # Initialize and fit the imputer
    imputer = IterativeImputer(
        estimator=estimator,
        max_iter=max_iter,
        random_state=random_state,
        skip_complete=True  # Skip columns with no missing values
    )
    imputed_df[numeric_columns] = imputer.fit_transform(imputed_df[numeric_columns])

    # Count missing values after imputation
    missing_after = imputed_df[numeric_columns].isnull().sum()

    # Print imputation stats if verbose
    if verbose:
        print("Imputation Summary:")
        for col in numeric_columns:
            if missing_before[col] > 0:
                print(f"{col}: Imputed {missing_before[col]} missing values")
        if missing_after.sum() == 0:
            print("All missing values successfully imputed.")
        else:
            print("Warning: Some missing values remain after imputation.")

    return imputed_df

In [None]:
# # Assuming 'df' is your DataFrame and 'numeric_columns' is your list of numerical columns
# from sklearn.linear_model import LinearRegression

# # Impute with default settings
# imputed_df = fill_with_iterative_imputer(df, numeric_columns, verbose=True)

# # Impute with a different estimator (e.g., RandomForestRegressor)
# from sklearn.ensemble import RandomForestRegressor
# imputed_df_rf = fill_with_iterative_imputer(
#     df, 
#     numeric_columns, 
#     estimator=RandomForestRegressor(n_estimators=50, random_state=42), 
#     verbose=True
# )

In [None]:
from sklearn.impute import SimpleImputer

def detect_and_impute_missing(df, numerical_strategy='mean', categorical_strategy='most_frequent'):
    """Detect categorical/numerical columns and impute missing values.

    Parameters:
    - df: DataFrame containing the data (not modified).
    - numerical_strategy: SimpleImputer strategy for int64/float64 columns.
    - categorical_strategy: SimpleImputer strategy for object/category columns.

    Returns:
    - (imputed DataFrame, fitted categorical imputer or None,
       fitted numerical imputer or None). The fitted imputers can be reused
       to transform other splits.
    """
    df = df.copy()

    # Detect column types
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    print(f"Categorical Columns: {categorical_cols}")
    print(f"Numerical Columns: {numerical_cols}")

    # Snapshot the counts now so the "before" report really is pre-imputation.
    missing_before = df.isna().sum()

    # Impute categorical columns
    cat_imputer = None
    if categorical_cols:
        cat_imputer = SimpleImputer(strategy=categorical_strategy)
        df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

    # Impute numerical columns
    num_imputer = None
    if numerical_cols:
        num_imputer = SimpleImputer(strategy=numerical_strategy)
        df[numerical_cols] = num_imputer.fit_transform(df[numerical_cols])

    # Report missing values before and after
    print("Missing Values Before Imputation:")
    print(missing_before)
    print("Missing Values After Imputation:")
    print(df.isna().sum())

    return df, cat_imputer, num_imputer

In [None]:
# df_imputed, cat_imputer, num_imputer = detect_and_impute_missing(df, numerical_strategy='mean')

## 6. Outlier Detection & Treatment

Implement IQR-based and Z-score methods.

### 6.1 IQR Outlier Removal

In [None]:
def remove_outliers_iqr(df, num_cols, k=1.5, verbose=True):
    """Drop rows falling outside ``[Q1 - k*IQR, Q3 + k*IQR]`` for each column.

    Columns are processed one after another, so the quartiles of a later
    column are computed on the rows that survived the earlier ones. Rows
    with a missing value in a processed column are dropped as well, because
    they fail the range comparison. The input is not modified.
    """
    filtered = df.copy()
    for column in num_cols:
        q1, q3 = filtered[column].quantile(0.25), filtered[column].quantile(0.75)
        spread = q3 - q1
        low, high = q1 - k * spread, q3 + k * spread
        rows_before = len(filtered)
        in_range = filtered[column].between(low, high)
        filtered = filtered[in_range]
        rows_after = len(filtered)
        if verbose:
            print(f"Column {column}: removed {rows_before-rows_after} rows using IQR (k={k})")
    return filtered

In [None]:
# remove_outliers_iqr(df, numeric_columns)

### 6.2 Z-Score Outlier Removal

In [None]:
import scipy.stats as stats
def remove_outliers_zscore(df, num_col, z_thresh=3.0, verbose=True):
    """Drop rows whose absolute z-score reaches ``z_thresh`` in any given column.

    Parameters:
    - df: DataFrame containing the data (not modified).
    - num_col: A single column name or a list of column names.
    - z_thresh: Rows are kept only when |z| < z_thresh in every column.
    - verbose: If True, print how many rows were removed.

    Returns:
    - Filtered DataFrame. Rows with a missing value in any selected column
      are removed too, since no z-score can be computed for them.
    """
    df = df.copy()
    # Accept a bare column name as well as a list of names.
    cols = [num_col] if isinstance(num_col, str) else list(num_col)

    complete = df[cols].notna().all(axis=1).to_numpy()
    values = df.loc[complete, cols].to_numpy(dtype=float)
    z_scores = np.abs(stats.zscore(values, axis=0))
    inliers = (z_scores < z_thresh).all(axis=1)

    # Filter by position rather than by label: label-based .loc would
    # duplicate rows whenever the index contains repeated labels.
    keep = np.zeros(len(df), dtype=bool)
    keep[complete] = inliers

    before = len(df)
    df = df[keep]
    after = len(df)
    if verbose:
        print(f"Removed {before-after} rows by z-score threshold {z_thresh}")
    return df

In [None]:
# remove_outliers_zscore(df, numeric_columns)

### 6.3 Handling Outliers with a Log Transformation (see Section 4.2)

In [None]:
# Same approach as in section 4.2, e.g.:
# df = apply_log_transformation(df, numeric_columns)

### 6.4 Handling with Robust Scaler

It is useful when the data contains outliers that cannot or should not be removed.

In [None]:
from sklearn.preprocessing import RobustScaler

def apply_robust_scaler(df, numeric_columns):
    """Scale ``numeric_columns`` with scikit-learn's RobustScaler.

    The scaler is fitted on the same data it transforms. ``df`` is modified
    in place and also returned.
    """
    robust = RobustScaler()
    scaled_values = robust.fit_transform(df[numeric_columns])
    df[numeric_columns] = scaled_values
    return df

In [None]:
# scaled_df = apply_robust_scaler(df, numeric_columns)

## 7. Feature Engineering

Examples: ratio features, date extraction, interaction terms.


### 7.1 Ratio Feature

In [None]:
def add_ratio_feature(df, numerator, denominator, new_name=None):
    """Add a ``numerator / denominator`` column to a copy of ``df``.

    Parameters:
    - df: DataFrame containing the data (not modified).
    - numerator: Name of the numerator column.
    - denominator: Name of the denominator column.
    - new_name: Name of the new column; defaults to
      "<numerator>_over_<denominator>".

    Returns:
    - A copy of ``df`` with the ratio column added. Rows whose denominator
      is zero get NaN instead of +/-inf.
    """
    df = df.copy()
    new_name = new_name or f"{numerator}_over_{denominator}"
    # Zero denominators are already mapped to NaN, so no epsilon is needed;
    # adding one would only bias every ratio slightly.
    safe_denominator = df[denominator].replace(0, np.nan)
    df[new_name] = df[numerator] / safe_denominator
    return df

In [None]:
# df = add_ratio_feature(df, 'feature1', 'feature2', new_name='feature_ratio')

### 7.2 Date Extraction

In [None]:
def extract_date_parts(df, date_col):
    """Return a copy of ``df`` with year/month/day/weekday columns for ``date_col``.

    The column is parsed with ``pd.to_datetime(errors='coerce')``, so values
    that cannot be parsed yield missing date parts. New columns are named
    ``<date_col>_year``, ``_month``, ``_day`` and ``_weekday`` (Monday=0).
    The original column and the input frame are left untouched.
    """
    result = df.copy()
    parsed = pd.to_datetime(result[date_col], errors='coerce')
    for part in ("year", "month", "day", "weekday"):
        result[f"{date_col}_{part}"] = getattr(parsed.dt, part)
    return result

In [None]:
# df = extract_date_parts(df, 'date_column')

## 8. Feature Selection

Selecting the most important features improves model performance and reduces overfitting.

In [None]:
# change target column
# Feature matrix: every column except "target".
X = df.drop("target", axis=1)
# Label vector: the "target" column only.
y = df["target"]

### 8.1 SelectKBest (Univariate Selection)

Selects top K features based on statistical tests.

- chi2 → for non-negative features (e.g., counts, frequencies).
- f_classif → for continuous numerical features in classification problems.
- Useful for quick filtering before model training.

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif
def select_features_statistical(X, y, method, k):
    """Select the top ``k`` features with scikit-learn's SelectKBest.

    Parameters:
    - X: Feature DataFrame.
    - y: Target values.
    - method: Scoring function passed to SelectKBest (e.g. ``f_classif``, ``chi2``).
    - k: Number of features to keep.

    Returns:
    - Index of the selected column names (also printed).
    """
    selector = SelectKBest(score_func=method, k=k)
    # Only the fitted support mask is needed, so fit() suffices.
    selector.fit(X, y)

    selected_features = X.columns[selector.get_support()]
    print(f"Selected Top {k} Features:")
    print(selected_features)
    # Return the names so callers can subset X instead of copying printed output.
    return selected_features

In [None]:
# Print the 10 best-scoring features according to the f_classif score function.
select_features_statistical(X, y, method=f_classif, k=10)

### 8.2 Recursive Feature Elimination (RFE)

Iteratively trains a model and removes the least important features.

- More computationally expensive.
- Works best when you have a moderate number of features (< 100).
- Can be used with any estimator that exposes a coef_ or feature_importances_ attribute

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Define model
# max_iter is raised to 1000 iterations to give the solver more room to converge.
model = LogisticRegression(max_iter=1000)

# Apply RFE
# Eliminate features recursively until 10 remain.
rfe = RFE(estimator=model, n_features_to_select=10)
rfe.fit(X, y)

# rfe.support_ is a boolean mask over X's columns marking the kept features.
selected_features_rfe = X.columns[rfe.support_]
print("Selected Features using RFE:")
print(selected_features_rfe)

### 8.3 Feature Importance (Tree-based Models)

Uses built-in feature importance scores from tree-based models (e.g., Random Forest, XGBoost).

- Works only with tree-based models
- Captures non-linear relationships.
- Provides insights into feature relationships and importance.


In [None]:
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np

# Train a Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

# Get feature importances
importances = rf.feature_importances_
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
# Names of the (up to) 10 highest-ranked features.
selected_features_rf = X.columns[indices[:10]]

print("Top 10 Important Features (Random Forest):")
print(selected_features_rf)

# Plot feature importances
plt.figure(figsize=(8, 5))
plt.barh(X.columns[indices[:10]], importances[indices[:10]])
# Flip the y-axis so the most important feature is drawn at the top.
plt.gca().invert_yaxis()
plt.title("Top 10 Feature Importances (Random Forest)")
plt.xlabel("Importance Score")
plt.show()


## 9. Split dataset

In [None]:
from sklearn.model_selection import train_test_split

# First split: hold out 20% of the data as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Second split: take 20% of the remaining 80% as validation,
# i.e. roughly 64% train / 16% validation / 20% test overall.
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

In [None]:
# Shapes of the feature splits (rows, columns).
print("X Train set size:", X_train.shape)
print("X Validation set size:", X_val.shape)
print("X Test set size:", X_test.shape)

# Shapes of the matching target splits; row counts should equal those above.
print("y Train set size:", y_train.shape)
print("y Validation set size:", y_val.shape)
print("y Test set size:", y_test.shape)


## 10. Encoding Categorical Variables

### 10.1 One-Hot Encoding
Best for categorical features without ordinal relationship (e.g. color, city).

In [None]:
from sklearn.preprocessing import OneHotEncoder

# `sparse_output` replaces the removed `sparse` argument (scikit-learn >= 1.2).
# drop='first' omits one dummy column to avoid perfectly collinear features.
encoder = OneHotEncoder(sparse_output=False, drop='first')
# Fit on the training split only, then reuse the fitted encoder for val/test.
color_encoded_train = encoder.fit_transform(X_train[['color']])
color_encoded_val = encoder.transform(X_val[['color']])
color_encoded_test = encoder.transform(X_test[['color']])

# drop original categorical columns
X_train = X_train.drop(columns=['color'])
X_val = X_val.drop(columns=['color'])
X_test = X_test.drop(columns=['color'])

# concatenate the encoded features with the original dataframe
# The encoded frames must carry the split's own index: after the shuffled
# train/test split a default RangeIndex would misalign rows in join().
X_train = X_train.join(pd.DataFrame(color_encoded_train, columns=encoder.get_feature_names_out(['color']), index=X_train.index))
X_val = X_val.join(pd.DataFrame(color_encoded_val, columns=encoder.get_feature_names_out(['color']), index=X_val.index))
X_test = X_test.join(pd.DataFrame(color_encoded_test, columns=encoder.get_feature_names_out(['color']), index=X_test.index))

### 10.2 Label Encoding
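Best for binary categorical features (e.g. gender). For nominal features with more than two categories, prefer one-hot encoding: the integer codes produced by label encoding imply an order that does not exist.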

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Learn the category -> integer mapping from the training split only.
X_train['gender_encoded'] = le.fit_transform(X_train['gender'])
X_train.drop('gender', axis=1, inplace=True)

# Reuse the fitted mapping for validation and test.
# NOTE(review): transform() presumably fails on categories not seen in training - confirm for your data.
X_val['gender_encoded'] = le.transform(X_val['gender'])
X_val.drop('gender', axis=1, inplace=True)

X_test['gender_encoded'] = le.transform(X_test['gender'])
X_test.drop('gender', axis=1, inplace=True)


### 10.3 Ordinal Encoding
Best for ordinal categorical features (e.g. education level).

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# The explicit category list fixes the order: High School < Bachelor < Master < PhD.
encoder = OrdinalEncoder(categories=[['High School','Bachelor','Master','PhD']])
# Fit on the training split only, then reuse the encoder for val/test.
X_train['education_encoded'] = encoder.fit_transform(X_train[['education']])
X_train.drop('education', axis=1, inplace=True)

X_val['education_encoded'] = encoder.transform(X_val[['education']])
X_val.drop('education', axis=1, inplace=True)

X_test['education_encoded'] = encoder.transform(X_test[['education']])
X_test.drop('education', axis=1, inplace=True)

### 10.4 Frequency Encoding

Used for categorical columns with many unique values (high cardinality).

In [None]:
# Relative frequency of each category, learned from the training split only.
freq = X_train['category'].value_counts(normalize=True)
X_train['category_freq_enc'] = X_train['category'].map(freq)
X_train = X_train.drop(columns=['category'])

# Categories never seen in training are absent from `freq` and would map to
# NaN; encode them as 0.0, which is their observed training frequency.
X_val['category_freq_enc'] = X_val['category'].map(freq).fillna(0.0)
X_val = X_val.drop(columns=['category'])

X_test['category_freq_enc'] = X_test['category'].map(freq).fillna(0.0)
X_test = X_test.drop(columns=['category'])

### 10.5 MultiLabelBinarizer

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

def apply_multilabel_binarizer(df, column_name, separator='|', prefix=None):
    """Expand a delimited multi-label column into one binary column per label.

    Parameters:
    - df: DataFrame containing the data (not modified).
    - column_name: Column holding the labels as a delimited string.
    - separator: Delimiter between labels (default: '|').
    - prefix: Prefix of the new columns; defaults to ``column_name``.

    Returns:
    - (DataFrame, fitted MultiLabelBinarizer). In the returned frame the
      original column holds the cleaned list of labels, and one
      ``<prefix>_<label>`` indicator column is appended per distinct label.
    """
    label_prefix = column_name if prefix is None else prefix
    result = df.copy()

    def _clean(parts):
        # Trim whitespace and discard empty fragments (e.g. from missing values).
        return [part.strip() for part in parts if part.strip()]

    # Missing values become '' so they split into an empty label list.
    tokens = result[column_name].fillna('').str.split(separator)
    result[column_name] = tokens.apply(_clean)

    binarizer = MultiLabelBinarizer()
    indicators = binarizer.fit_transform(result[column_name])

    indicator_cols = [f'{label_prefix}_{label}' for label in binarizer.classes_]
    indicator_df = pd.DataFrame(indicators, columns=indicator_cols, index=result.index)

    return pd.concat([result, indicator_df], axis=1), binarizer

In [None]:
# df, mlb_model = apply_multilabel_binarizer(df, 'tags', separator='|')

## 11. Numerical Feature Scaling

Choose scaler depending on data distribution:


### 11.1 StandardScaler

Useful when features follow a **Gaussian distribution**.

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Fit the scaler on the training split only, then reuse it for val/test.
# NOTE(review): numeric_columns was computed from the full `df` earlier in the notebook;
# confirm it still matches the columns present in X_train (e.g. that it does not include the target).
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_val[numeric_columns] = scaler.transform(X_val[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

### 11.2 MinMaxScaler

Useful when features have **different scales** but known **min/max ranges**.

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# scikit-learn scalers need 2-D input, so select the column with double
# brackets; ravel() flattens the (n, 1) result back into a single column.
# Fit on the training split only, then reuse the fitted scaler for val/test.
X_train["name_of_num_col"] = scaler.fit_transform(X_train[["name_of_num_col"]]).ravel()
X_val["name_of_num_col"] = scaler.transform(X_val[["name_of_num_col"]]).ravel()
X_test["name_of_num_col"] = scaler.transform(X_test[["name_of_num_col"]]).ravel()

### 11.3 RobustScaler

Useful for data with **outliers**.

In [None]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
# scikit-learn scalers need 2-D input, so select the column with double
# brackets; ravel() flattens the (n, 1) result back into a single column.
# Fit on the training split only, then reuse the fitted scaler for val/test.
X_train['name_of_num_col'] = scaler.fit_transform(X_train[['name_of_num_col']]).ravel()
X_val['name_of_num_col'] = scaler.transform(X_val[['name_of_num_col']]).ravel()
X_test['name_of_num_col'] = scaler.transform(X_test[['name_of_num_col']]).ravel()

### 11.4 Log Transformation

For **skewed data** to make it more normal.

In [None]:
import numpy as np

# log1p(x) = log(1 + x): compresses a long right tail while mapping 0 to 0.
# The transform is stateless, so each split is converted independently.
X_train['name_of_num_col'] = X_train['name_of_num_col'].pipe(np.log1p)
X_val['name_of_num_col'] = X_val['name_of_num_col'].pipe(np.log1p)
X_test['name_of_num_col'] = X_test['name_of_num_col'].pipe(np.log1p)

## 12. Model Training

### Common Models — Short Description and Usage

### Supervised Learning

- **Logistic Regression**  
  Description: Linear model for binary or multiclass classification; fast, interpretable, assumes linear boundaries.  
  Usage:  
  ```python
  from sklearn.linear_model import LogisticRegression
  model = LogisticRegression(random_state=42)
  model.fit(X_train, y_train)
  ```

- **Decision Tree**  
  Description: Tree-based model capturing non-linear relationships; easy to visualize, prone to overfitting.  
  Usage:  
  ```python
  from sklearn.tree import DecisionTreeClassifier
  model = DecisionTreeClassifier(random_state=42, max_depth=5)
  model.fit(X_train, y_train)
  ```

- **Random Forest**  
  Description: Ensemble of decision trees (bagging); robust, reduces overfitting, good default choice.  
  Usage:  
  ```python
  from sklearn.ensemble import RandomForestClassifier
  model = RandomForestClassifier(random_state=42, n_estimators=100)
  model.fit(X_train, y_train)
  ```

- **Gradient Boosting (sklearn)**  
  Description: Sequential tree boosting; high accuracy, needs tuning, captures complex patterns.  
  Usage:  
  ```python
  from sklearn.ensemble import GradientBoostingClassifier
  model = GradientBoostingClassifier(random_state=42, n_estimators=100)
  model.fit(X_train, y_train)
  ```

- **XGBoost**  
  Description: Optimized gradient boosting; fast, scalable, excels on tabular data, requires tuning.  
  Usage:  
  ```python
  from xgboost import XGBClassifier
  model = XGBClassifier(random_state=42, n_estimators=100, eval_metric='logloss')
  model.fit(X_train, y_train)
  ```

- **Support Vector Machine (SVM)**  
  Description: Effective in high-dimensional spaces with kernel trick for non-linear boundaries; needs scaling.  
  Usage:  
  ```python
  from sklearn.svm import SVC
  model = SVC(kernel='rbf', C=1.0, probability=True)
  model.fit(X_train, y_train)
  ```

- **K-Nearest Neighbors (KNN)**  
  Description: Non-parametric, instance-based; simple, slow at prediction, sensitive to scaling.  
  Usage:  
  ```python
  from sklearn.neighbors import KNeighborsClassifier
  model = KNeighborsClassifier(n_neighbors=5)
  model.fit(X_train, y_train)
  ```

- **Naive Bayes (Gaussian)**  
  Description: Probabilistic, assumes feature independence and Gaussian-distributed continuous features; fast, works well for small datasets (for text/count features use `MultinomialNB` instead).  
  Usage:  
  ```python
  from sklearn.naive_bayes import GaussianNB
  model = GaussianNB()
  model.fit(X_train, y_train)
  ```

- **Multilayer Perceptron (MLP)**  
  Description: Feedforward neural network for non-linear mappings; needs tuning and scaling, computationally intensive.  
  Usage:  
  ```python
  from sklearn.neural_network import MLPClassifier
  model = MLPClassifier(hidden_layer_sizes=(100,), random_state=42, max_iter=300)
  model.fit(X_train, y_train)
  ```

- **Recurrent Neural Network (RNN)**  
  Description: Neural network for sequential data; captures temporal dependencies, suitable for time series or text.  
  Usage (Keras):  
  ```python
  from tensorflow.keras.models import Sequential
  from tensorflow.keras.layers import SimpleRNN, Dense
  model = Sequential([
      SimpleRNN(50, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu'),
      Dense(1, activation='sigmoid')
  ])
  model.compile(optimizer='adam', loss='binary_crossentropy')
  model.fit(X_train, y_train, epochs=10, batch_size=32)
  ```

- **Long Short-Term Memory (LSTM)**  
  Description: Advanced RNN variant; handles long-term dependencies, ideal for complex sequential data.  
  Usage (Keras):  
  ```python
  from tensorflow.keras.models import Sequential
  from tensorflow.keras.layers import LSTM, Dense
  model = Sequential([
      LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu'),
      Dense(1, activation='sigmoid')
  ])
  model.compile(optimizer='adam', loss='binary_crossentropy')
  model.fit(X_train, y_train, epochs=10, batch_size=32)
  ```

- **Linear Regression / Ridge / Lasso** (Regression)  
  Description: Linear models for continuous targets; Ridge/Lasso add regularization to prevent overfitting.  
  Usage:  
  ```python
  from sklearn.linear_model import Ridge
  model = Ridge(alpha=1.0)
  model.fit(X_train, y_train)
  ```

- **Support Vector Regressor (SVR)**  
  Description: Non-linear regression with kernels; robust but sensitive to scaling.  
  Usage:  
  ```python
  from sklearn.svm import SVR
  model = SVR(kernel='rbf', C=1.0)
  model.fit(X_train, y_train)
  ```

### Unsupervised Learning

- **K-Means (Clustering)**  
  Description: Partitions data into k clusters by centroid assignment; fast, assumes spherical clusters.  
  Usage:  
  ```python
  from sklearn.cluster import KMeans
  km = KMeans(n_clusters=3, random_state=42).fit(X)
  labels = km.labels_
  ```

- **DBSCAN (Clustering)**  
  Description: Density-based clustering; finds arbitrary-shaped clusters, handles noise, no need to specify k.  
  Usage:  
  ```python
  from sklearn.cluster import DBSCAN
  db = DBSCAN(eps=0.5, min_samples=5).fit(X)
  labels = db.labels_
  ```

- **Isolation Forest (Anomaly Detection)**  
  Description: Tree-based anomaly detection; isolates outliers by random splits, effective for high-dimensional data.  
  Usage:  
  ```python
  from sklearn.ensemble import IsolationForest
  iso = IsolationForest(contamination=0.1, random_state=42).fit(X)
  anomalies = iso.predict(X)  # -1 for outliers, 1 for inliers
  ```

- **Autoencoder (Dimensionality Reduction / Anomaly Detection)**  
  Description: Neural network for unsupervised feature learning; compresses data or detects anomalies via reconstruction error.  
  Usage (Keras):  
  ```python
  from tensorflow.keras.models import Sequential
  from tensorflow.keras.layers import Dense
  model = Sequential([
      Dense(32, activation='relu', input_shape=(X.shape[1],)),
      Dense(16, activation='relu'),
      Dense(32, activation='relu'),
      Dense(X.shape[1], activation='sigmoid')  # sigmoid output assumes X is scaled to [0, 1]
  ])
  model.compile(optimizer='adam', loss='mse')
  model.fit(X, X, epochs=10, batch_size=32)
  ```

- **PCA (Dimensionality Reduction)**  
  Description: Linear projection to principal components; used for compression or visualization.  
  Usage:  
  ```python
  from sklearn.decomposition import PCA
  pca = PCA(n_components=2).fit(X)
  X_reduced = pca.transform(X)
  ```

Notes: choose models based on problem type (classification/regression), data size, feature scaling, interpretability needs, and compute budget. Always validate with cross-validation and tune hyperparameters.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Baseline classifier: train on the training split, predict on validation.
# fit() returns the estimator itself, so the call can be chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_val)

# Report overall accuracy, then the confusion matrix and per-class
# precision/recall/F1 on the validation split.
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))


## 13. Model Evaluation Metrics

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

## Regression Metrics

- **Mean Squared Error (MSE)**  
  Description: Average of squared differences between predictions and actual values; penalizes larger errors heavily.  
  Usage:  
  ```python
  mse = mean_squared_error(y_true, y_pred)
  print(f"MSE: {mse:.4f}")
  ```

- **Root Mean Squared Error (RMSE)**  
  Description: Square root of MSE; interpretable in the same units as the target.  
  Usage:  
  ```python
  rmse = np.sqrt(mean_squared_error(y_true, y_pred))
  print(f"RMSE: {rmse:.4f}")
  ```

- **Mean Absolute Error (MAE)**  
  Description: Average of absolute differences; less sensitive to outliers than MSE.  
  Usage:  
  ```python
  mae = mean_absolute_error(y_true, y_pred)
  print(f"MAE: {mae:.4f}")
  ```

- **R² Score (Coefficient of Determination)**  
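  Description: Proportion of variance explained by the model; the best possible score is 1 (higher is better), 0 means no better than predicting the mean, and it can be negative for models that are worse than that.  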
  Usage:  
  ```python
  r2 = r2_score(y_true, y_pred)
  print(f"R² Score: {r2:.4f}")
  ```

- **Residual Plot**  
  Description: Visualizes prediction errors (residuals) to assess model fit; ideally, residuals are randomly scattered around 0.  
  Usage:  
  ```python
  residuals = y_true - y_pred
  plt.figure(figsize=(8, 5))
  plt.scatter(y_pred, residuals, alpha=0.5)
  plt.axhline(y=0, color='r', linestyle='--')
  plt.xlabel('Predicted Values')
  plt.ylabel('Residuals')
  plt.title('Residual Plot')
  plt.show()
  ```

## Classification Metrics

- **Accuracy**  
  Description: Proportion of correct predictions; good for balanced datasets.  
  Usage:  
  ```python
  accuracy = accuracy_score(y_true, y_pred)
  print(f"Accuracy: {accuracy:.4f}")
  ```

- **Precision**  
  Description: Proportion of true positives among positive predictions; useful when false positives are costly.  
  Usage:  
  ```python
  precision = precision_score(y_true, y_pred, average='weighted')  # Use 'binary' for binary classification
  print(f"Precision: {precision:.4f}")
  ```

- **Recall (Sensitivity)**  
  Description: Proportion of true positives identified; critical when false negatives are costly.  
  Usage:  
  ```python
  recall = recall_score(y_true, y_pred, average='weighted')  # Use 'binary' for binary classification
  print(f"Recall: {recall:.4f}")
  ```

- **F1 Score**  
  Description: Harmonic mean of precision and recall; balances both for imbalanced datasets.  
  Usage:  
  ```python
  f1 = f1_score(y_true, y_pred, average='weighted')  # Use 'binary' for binary classification
  print(f"F1 Score: {f1:.4f}")
  ```

- **ROC-AUC Score**  
  Description: Area under the ROC curve; measures model's ability to distinguish classes (binary/multiclass). Requires probability scores.  
  Usage:  
  ```python
  roc_auc = roc_auc_score(y_true, y_pred_proba, multi_class='ovr')  # y_pred_proba = model.predict_proba(X); for binary problems pass only the positive-class column, e.g. [:, 1]
  print(f"ROC-AUC: {roc_auc:.4f}")
  ```

- **Confusion Matrix**  
  Description: Table showing true vs. predicted class counts; helps visualize classification performance.  
  Usage:  
  ```python
  cm = confusion_matrix(y_true, y_pred)
  plt.figure(figsize=(6, 4))
  sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
  plt.xlabel('Predicted')
  plt.ylabel('True')
  plt.title('Confusion Matrix')
  plt.show()
  ```

## Handling Imbalanced Data

In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, TomekLinks, EditedNearestNeighbours
import matplotlib.pyplot as plt
import seaborn as sns

## Oversampling Techniques

- **Random Oversampling**  
  Description: Randomly duplicates samples from the minority class to balance the dataset; simple but risks overfitting.  
  Usage:  
  ```python
  ros = RandomOverSampler(random_state=42)
  X_resampled, y_resampled = ros.fit_resample(X, y)
  print("Class distribution after Random Oversampling:", pd.Series(y_resampled).value_counts())
  ```

- **SMOTE (Synthetic Minority Oversampling Technique)**  
  Description: Generates synthetic samples for the minority class by interpolating between existing samples; reduces overfitting compared to random oversampling.  
  Usage:  
  ```python
  smote = SMOTE(random_state=42)
  X_resampled, y_resampled = smote.fit_resample(X, y)
  print("Class distribution after SMOTE:", pd.Series(y_resampled).value_counts())
  ```

- **ADASYN (Adaptive Synthetic Sampling)**  
  Description: Similar to SMOTE but focuses synthetic samples on harder-to-classify regions (near class boundaries); good for complex datasets.  
  Usage:  
  ```python
  adasyn = ADASYN(random_state=42)
  X_resampled, y_resampled = adasyn.fit_resample(X, y)
  print("Class distribution after ADASYN:", pd.Series(y_resampled).value_counts())
  ```

## Undersampling Techniques

- **Random Undersampling**  
  Description: Randomly removes samples from the majority class to balance the dataset; simple but may discard valuable data.  
  Usage:  
  ```python
  rus = RandomUnderSampler(random_state=42)
  X_resampled, y_resampled = rus.fit_resample(X, y)
  print("Class distribution after Random Undersampling:", pd.Series(y_resampled).value_counts())
  ```

- **Cluster Centroids**  
  Description: Replaces majority class samples with centroids of clusters; preserves structure but may oversimplify data.  
  Usage:  
  ```python
  cc = ClusterCentroids(random_state=42)
  X_resampled, y_resampled = cc.fit_resample(X, y)
  print("Class distribution after Cluster Centroids:", pd.Series(y_resampled).value_counts())
  ```

- **Tomek Links**  
  Description: Removes majority class samples that form Tomek links (pairs of mutually nearest neighbors belonging to opposite classes); improves class separation.  
  Usage:  
  ```python
  tl = TomekLinks()
  X_resampled, y_resampled = tl.fit_resample(X, y)
  print("Class distribution after Tomek Links:", pd.Series(y_resampled).value_counts())
  ```

- **Edited Nearest Neighbors (ENN)**  
  Description: Removes majority class samples misclassified by their k-nearest neighbors; cleans noisy data.  
  Usage:  
  ```python
  enn = EditedNearestNeighbours()
  X_resampled, y_resampled = enn.fit_resample(X, y)
  print("Class distribution after ENN:", pd.Series(y_resampled).value_counts())
  ```