# Machine Learning Preprocessing Pipeline

This notebook provides a template for a standard machine learning preprocessing pipeline. It includes:
1. Setup & Imports
2. Loading data from CSV
3. Missing Value Analysis & Imputation
4. Basic Cleaning (duplicates, constant features, etc.)
5. Train/Validation/Test Split
6. Outlier Detection
7. Feature Selection
8. Next Steps (Modeling)

Feel free to modify paths, parameters, and classes as needed for your project.

## 1. Setup & Imports
Import all necessary libraries. If the custom helper modules are not installed or importable, define the helper classes/functions here instead.


In [ ]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn & imblearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE

# Custom modules (adjust import paths as needed)
# from stage2_imputer import Stage2Imputer
# from stage3_outlier_detection import OutlierDetector
# from stage4_scaling_transformation import NumericTransformer
# from split_and_baseline import SplitAndBaseline

# Display settings: show every column, cap printed rows at 100, and use
# seaborn's white-grid style for plots.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
sns.set(style="whitegrid")

## 2. Load Data
Read your raw CSV data into a pandas DataFrame.

In [ ]:
# TODO: Update the path to your CSV file
DATA_PATH = 'data/raw/your_data.csv'

# Read the raw CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(5)

### 2.1 Quick Profile
- Dimensions
- Data types
- Missing value summary

In [ ]:
# Shape: printed as the (rows, columns) tuple reported by pandas
print(f"Dataset shape: {df.shape}")

# Data types: one dtype per column, shown as the cell's output
df.dtypes

In [ ]:
# Missing values: fraction of missing entries per column, worst first
missing_fraction = df.isna().mean()
missing_summary = missing_fraction.sort_values(ascending=False)
# Show the ten columns with the highest missing fraction
missing_summary.head(10)

## 3. Missing Value Analysis & Imputation
Use a dedicated imputer (e.g., Stage2Imputer) or simple strategies here.

In [ ]:
# Example: Using Stage2Imputer (uncomment once imported)
# imputer = Stage2Imputer(max_missing_frac_drop=0.9, knn_neighbors=5, verbose=True)
# df_imputed = imputer.fit_transform(df)
# df_imputed.head()

# For demonstration, a simple approach:
df_imputed = df.copy()

# Numeric median imputation.
# The filled column is assigned back instead of calling
# df_imputed[col].fillna(..., inplace=True): that chained in-place form is
# deprecated in pandas 2.x and leaves df_imputed unchanged when
# copy-on-write is enabled.
for col in df_imputed.select_dtypes(include=[np.number]).columns:
    df_imputed[col] = df_imputed[col].fillna(df_imputed[col].median())

# Categorical mode imputation.
for col in df_imputed.select_dtypes(include=['object', 'category']).columns:
    modes = df_imputed[col].mode()
    # mode() is empty when every value in the column is missing; there is
    # nothing to impute with, so the column is left as-is instead of
    # failing on modes[0].
    if not modes.empty:
        df_imputed[col] = df_imputed[col].fillna(modes.iloc[0])

# Total remaining missing cells; zero unless some column was entirely missing
df_imputed.isna().sum().sum()

## 4. Basic Cleaning
- Remove duplicates
- Remove constant or near-constant features
- High cardinality checks, etc.

In [ ]:
df_clean = df_imputed.copy()

# 4.1 Drop duplicate rows
before_dup = len(df_clean)
df_clean = df_clean.drop_duplicates()
print(f"Dropped {before_dup - len(df_clean)} duplicate rows")

# 4.2 Remove constant / near-constant columns (e.g., > 99% same value)
# A column is near-constant when its most frequent value (NaN counted as a
# value) covers more than NEAR_CONSTANT_SHARE of the rows. Comparing the
# number of unique values with the row count instead would flag every
# low-cardinality column (binary flags, class labels) on any large table.
NEAR_CONSTANT_SHARE = 0.99
constant_cols = [
    col for col in df_clean.columns
    # max() of an empty value_counts() is NaN, so an empty frame flags nothing
    if df_clean[col].value_counts(normalize=True, dropna=False).max() > NEAR_CONSTANT_SHARE
]
df_clean = df_clean.drop(columns=constant_cols)
print(f"Dropped {len(constant_cols)} constant/near-constant columns: {constant_cols}")

# 4.3 Optional: Remove high-cardinality categorical features
# Columns are only reported here; uncomment the drop to actually remove them.
high_card_cols = [col for col in df_clean.select_dtypes(include=['object', 'category']).columns
                  if df_clean[col].nunique() > 100]
# df_clean.drop(columns=high_card_cols, inplace=True)
print(f"High-cardinality columns (consider review): {high_card_cols}")

df_clean.shape

## 5. Train/Validation/Test Split
Split your data into training, validation, and test sets, and optionally apply oversampling to the training set only.

In [ ]:
# Specify target column
TARGET = 'your_target_column'

X = df_clean.drop(columns=[TARGET])
y = df_clean[TARGET]

# A float target is treated as regression; anything else as classification.
# is_float_dtype recognises every float width (float16/32/64) and pandas'
# nullable float dtypes, whereas comparing the dtype with 'float' only
# matches float64.
is_classification = not pd.api.types.is_float_dtype(y)

# 80/20 train + temp split (stratified on the target for classification)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42,
    stratify=y if is_classification else None,
)

# 50/50 validation + test split from temp
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42,
    stratify=y_temp if is_classification else None,
)

# Optional: SMOTE oversampling on training (classification only).
# Only the training split is resampled; validation and test keep their
# original class distribution.
# NOTE(review): SMOTE interpolates between samples and therefore expects
# numeric features - encode categorical columns before this cell.
if is_classification:
    sm = SMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)

print(f"X_train: {X_train.shape}, X_val: {X_val.shape}, X_test: {X_test.shape}")

## 6. Outlier Detection
Use the `OutlierDetector` class or a custom approach to identify and treat outliers in the training set.

In [ ]:
# Example: Using OutlierDetector (uncomment once imported)
# outlier_detector = OutlierDetector(outlier_threshold=3, robust_covariance=True, cap_outliers=True)
# train_df = pd.concat([X_train, y_train], axis=1)
# train_clean = outlier_detector.fit_transform(train_df, numeric_cols=list(X_train.select_dtypes(include=[np.number]).columns))
# X_train_clean = train_clean.drop(columns=[TARGET])
# y_train_clean = train_clean[TARGET]

# For demonstration: flag rows using a simple Z-score threshold on the
# numeric features of the training set.
X_train_num = X_train.select_dtypes(include=[np.number]).copy()
col_means = X_train_num.mean()
col_stds = X_train_num.std()
# Absolute distance of every value from its column mean, in column std units
z_scores = X_train_num.sub(col_means).div(col_stds).abs()
# A row is an outlier when any of its numeric features exceeds 3 std units
outlier_mask = z_scores.gt(3).any(axis=1)
print(f"Detected {outlier_mask.sum()} outliers by simple Z-score method.")
# Optionally drop or cap these rows; here they are dropped from X and y alike
keep_rows = ~outlier_mask
X_train_clean = X_train.loc[keep_rows]
y_train_clean = y_train.loc[keep_rows]
print(f"After dropping: X_train_clean: {X_train_clean.shape}")

## 7. Feature Selection
- Remove low-variance features
- Remove strongly correlated features


In [ ]:
# Both filters below are fitted on the training set only and then applied
# unchanged to the validation and test sets.
# VarianceThreshold and DataFrame.corr() need numeric input, so they are
# restricted to numeric/boolean columns; other columns pass through untouched.

# 7.1 Remove low-variance features (VarianceThreshold)
numeric_cols = X_train_clean.select_dtypes(include=[np.number, 'bool']).columns
# The threshold is an absolute variance (0.01), not a percentage, so it is
# sensitive to each feature's scale.
selector = VarianceThreshold(threshold=0.01)
if len(numeric_cols) > 0:
    selector.fit(X_train_clean[numeric_cols])
    low_variance_cols = numeric_cols[~selector.get_support()].tolist()
else:
    # Nothing numeric to evaluate; fitting on zero features would raise.
    low_variance_cols = []
print(f"Low-variance columns to drop: {low_variance_cols}")
X_train_fs = X_train_clean.drop(columns=low_variance_cols)
X_val_fs = X_val.drop(columns=low_variance_cols)
X_test_fs = X_test.drop(columns=low_variance_cols)

# 7.2 Remove highly correlated features
corr_matrix = X_train_fs.select_dtypes(include=[np.number, 'bool']).corr().abs()
# Keep only the strict upper triangle so each pair is inspected once; of two
# features correlated above 0.9, the later column is the one dropped.
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr_cols = [col for col in upper_tri.columns if (upper_tri[col] > 0.9).any()]
print(f"Highly correlated columns to drop: {high_corr_cols}")
X_train_fs = X_train_fs.drop(columns=high_corr_cols)
X_val_fs = X_val_fs.drop(columns=high_corr_cols)
X_test_fs = X_test_fs.drop(columns=high_corr_cols)

print(f"Final feature set size: {X_train_fs.shape[1]} features")

## 8. Next Steps: Modeling
- Build and evaluate your machine learning models here.
- Example: logistic regression, random forest, etc.


In [ ]:
# Example placeholder:
# from sklearn.linear_model import LogisticRegression
# model = LogisticRegression()
# model.fit(X_train_fs, y_train_clean)
# preds = model.predict(X_val_fs)
# from sklearn.metrics import accuracy_score
# print("Validation accuracy:", accuracy_score(y_val, preds))