# End-to-End Predictive Analytics Pipeline

> **"The best way to learn predictive analytics is to build complete systems."**

## Learning Objectives
- Master the complete data science pipeline from raw data to deployment
- Learn advanced feature engineering and selection techniques
- Implement model evaluation and validation strategies
- Build production-ready ML systems
- Apply best practices for real-world ML projects


## 1. Complete Data Science Pipeline

### 1. Problem Definition
- Define business objective
- Identify success metrics
- Determine data requirements

### 2. Data Collection & Integration
- Gather data from multiple sources
- Handle different data formats
- Ensure data quality

### 3. Data Preprocessing
- Handle missing values
- Detect and treat outliers
- Encode categorical variables
- Scale numerical features

### 4. Feature Engineering
- Create new features
- Select relevant features
- Handle feature interactions

### 5. Model Development
- Choose appropriate algorithms
- Train and validate models
- Tune hyperparameters

### 6. Model Deployment
- Deploy to production
- Monitor performance
- Update models regularly


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

# Plot styling and reproducibility setup.
# The 'seaborn-v0_8' style sheet name only exists in matplotlib >= 3.6; older
# releases ship the same sheet under the name 'seaborn'. Use whichever name the
# installed matplotlib reports so this cell does not fail on an older install.
_preferred_style = 'seaborn-v0_8'
plt.style.use(_preferred_style if _preferred_style in plt.style.available else 'seaborn')
sns.set_palette("husl")

# Seed NumPy's global RNG so the random draws in later cells are repeatable.
np.random.seed(42)

print("Libraries imported successfully!")


In [None]:
# Build the synthetic regression dataset used throughout the pipeline.
# The sequence of np.random calls below fixes the generated values, so the
# calls must stay in this order for the data to be reproducible.
np.random.seed(42)

# Dataset dimensions
n_samples, n_features = 1000, 10

# Column labels: feature_0 ... feature_9
feature_names = ['feature_%d' % idx for idx in range(n_features)]

# Standard-normal feature matrix of shape (n_samples, n_features)
X = np.random.randn(n_samples, n_features)

# Linear target with Gaussian noise:
#   y = 2*feature_0 + 1.5*feature_1 - 0.8*feature_2 + noise
# Only the first three coefficients are non-zero.
true_coeffs = np.array([2.0, 1.5, -0.8] + [0.0] * (n_features - 3))
y = X @ true_coeffs + np.random.normal(loc=0, scale=0.5, size=n_samples)

# Blank out individual cells with probability 0.05 (after y was computed,
# so the target itself never contains NaN)
missing_mask = np.random.random(size=(n_samples, n_features)) < 0.05
X[missing_mask] = np.nan

# Shift feature_0 by N(0, 5) noise for rows selected with probability 0.02
outlier_mask = np.random.random(size=n_samples) < 0.02
X[outlier_mask, 0] = X[outlier_mask, 0] + np.random.normal(
    loc=0, scale=5, size=outlier_mask.sum()
)

# Assemble features and target into one DataFrame
df = pd.DataFrame(data=X, columns=feature_names)
df['target'] = y

# Textual summary of the generated data
print("Dataset Overview:")
print("=" * 50)
print(f"Shape: {df.shape}")
print("Missing values per column:")
print(df.isna().sum())
print("\nTarget statistics:")
print(df['target'].describe())

# Visualize the data in a 2x2 grid: target histogram, correlation heatmap,
# missing-value pattern, and box plots of the first five columns.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Target distribution
axes[0, 0].hist(df['target'], bins=30, alpha=0.7, edgecolor='black')
axes[0, 0].set_title('Target Variable Distribution')
axes[0, 0].set_xlabel('Target Value')
axes[0, 0].set_ylabel('Frequency')

# Correlations. df.corr() covers every column of df, including 'target', so
# the tick positions and labels are taken from the matrix itself; labeling
# with feature_names alone would leave the final row/column unlabeled.
correlation_matrix = df.corr()
# Pin the color range to [-1, 1] so the diverging colormap is centered on 0.
im = axes[0, 1].imshow(correlation_matrix, cmap='coolwarm', aspect='auto',
                       vmin=-1, vmax=1)
axes[0, 1].set_title('Feature Correlation Matrix')
axes[0, 1].set_xticks(range(len(correlation_matrix.columns)))
axes[0, 1].set_yticks(range(len(correlation_matrix.index)))
axes[0, 1].set_xticklabels(correlation_matrix.columns, rotation=45, ha='right')
axes[0, 1].set_yticklabels(correlation_matrix.index)
plt.colorbar(im, ax=axes[0, 1])

# Missing values heatmap: one row per df column, one column per sample
missing_data = df.isnull()
axes[1, 0].imshow(missing_data.T, cmap='viridis', aspect='auto')
axes[1, 0].set_title('Missing Values Pattern')
axes[1, 0].set_xlabel('Sample Index')
axes[1, 0].set_ylabel('Features')
# Name each row so a missing-value stripe can be traced back to its column
axes[1, 0].set_yticks(range(len(missing_data.columns)))
axes[1, 0].set_yticklabels(missing_data.columns)

# Feature distributions
df.iloc[:, :5].boxplot(ax=axes[1, 1])
axes[1, 1].set_title('Feature Distributions (First 5)')
axes[1, 1].set_ylabel('Value')

plt.tight_layout()
plt.show()


## 2. Data Preprocessing Pipeline

### Step 1: Handle Missing Values
- **Strategy**: Use median imputation for numerical features
- **Reasoning**: Median is robust to outliers


In [None]:
# Step 1: Handle Missing Values
# Replaces NaNs in every feature column with that column's median.
print("Step 1: Handling Missing Values")
print("=" * 40)

# Check missing values before imputation
print("Missing values before imputation:")
print(df.isnull().sum())

# Impute missing values with median
from sklearn.impute import SimpleImputer

# Separate features and target
X = df.drop(columns='target')
y = df['target']

# NOTE(review): the imputer is fitted on every row of df. If a train/test
# split is performed later, fit it on the training rows only so that test
# statistics do not leak into preprocessing.
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# Convert back to DataFrame, reusing X's own labels and index so the result
# stays row-aligned with y even if df does not have a default RangeIndex.
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)

print("\nMissing values after imputation:")
print(X_imputed_df.isnull().sum().sum())

# Step 2: Detect and Handle Outliers
# Flags rows of the imputed feature matrix with an Isolation Forest; the code
# below treats label -1 as an outlier and label 1 as a normal row.
print("\nStep 2: Detecting Outliers")
print("=" * 40)

from sklearn.ensemble import IsolationForest

# NOTE(review): contamination=0.1 asks the detector to flag about 10% of the
# rows, while the dataset cell earlier in this file perturbs rows with
# probability 0.02 -- confirm the higher rate is intended.
outlier_detector = IsolationForest(random_state=42, contamination=0.1)
outlier_detector.fit(X_imputed)
outlier_labels = outlier_detector.predict(X_imputed)

# Tally the rows labeled -1
n_outliers = (outlier_labels == -1).sum()
print(f"Number of outliers detected: {n_outliers}")

# Visualize outliers: scatter of the first two features with flagged rows in
# red (left) and box plots of the first five features (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
scatter_ax, box_ax = axes

# One scatter layer per label value: normal rows first, then outliers on top
for label_value, legend_text, point_style in (
    (1, 'Normal', {'alpha': 0.6}),
    (-1, 'Outliers', {'alpha': 0.8, 'color': 'red'}),
):
    selected_rows = X_imputed_df.iloc[outlier_labels == label_value]
    scatter_ax.scatter(selected_rows.iloc[:, 0],
                       selected_rows.iloc[:, 1],
                       label=legend_text, s=50, **point_style)

scatter_ax.set_xlabel('Feature 0')
scatter_ax.set_ylabel('Feature 1')
scatter_ax.set_title('Outlier Detection')
scatter_ax.legend()
scatter_ax.grid(True, alpha=0.3)

# Box plots of the first five columns, drawn before any rows are removed
X_imputed_df.iloc[:, :5].boxplot(ax=box_ax)
box_ax.set_title('Feature Distributions with Outliers')
box_ax.set_ylabel('Value')

plt.tight_layout()
plt.show()

# Remove outliers: keep only the rows the detector labeled 1, applying the
# same boolean selection to features and target so they stay aligned.
X_clean = X_imputed_df.loc[outlier_labels == 1]
y_clean = y.loc[outlier_labels == 1]

print(f"Dataset shape after removing outliers: {X_clean.shape}")
print(f"Removed {X_imputed_df.shape[0] - X_clean.shape[0]} outliers")
