In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression

# Global notebook configuration (the two statements are independent of each other).
# Plot appearance: apply the seaborn-derived white-grid style sheet to all figures.
plt.style.use('seaborn-v0_8-whitegrid')
# Seed NumPy's legacy global RNG so the random masks and random choices drawn
# in later cells are reproducible across runs.
np.random.seed(42)


In [None]:
# Fetch the California housing data bundle from scikit-learn.
housing = fetch_california_housing()

# Wrap the raw arrays: features become a named-column DataFrame, the target a named Series.
X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = pd.Series(housing.target, name='MedHouseValue')

# Summarise what was loaded in a single print call (one item per output line).
print(
    f"Dataset shape: {X.shape}",
    f"Features: {list(X.columns)}",
    f"Target variable: {y.name}",
    "\nFeature statistics:",
    sep="\n",
)
# Last expression of the cell: rendered by Jupyter as a rich table.
X.describe().round(2)


In [None]:
# Check for missing values: count NaNs in every feature column.
missing_values = X.isnull().sum()
print("Missing values per column:")
print(missing_values)

# Visualize feature distributions: one histogram (with KDE overlay) per feature.
# The number of grid rows is derived from the number of columns so the cell
# keeps working if features are added (a fixed 3x3 grid fails past 9 columns).
n_grid_cols = 3
n_grid_rows = max(1, -(-len(X.columns) // n_grid_cols))  # ceiling division

plt.figure(figsize=(15, 10))
for i, col in enumerate(X.columns):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.histplot(X[col], kde=True)
    plt.title(f'Distribution of {col}')
# Compute the layout once, after every subplot exists; calling it inside the
# loop recomputed the layout on each iteration for no benefit.
plt.tight_layout()
plt.show()


In [None]:
# Work on a copy so the pristine feature frame `X` is left untouched.
X_messy = X.copy()
n_samples = len(X_messy)

# 1. Missing values: each of the first four columns independently gets
#    roughly 5% of its entries blanked out via a random boolean mask.
for col in X_messy.columns[:4]:
    mask = np.random.rand(n_samples) < 0.05
    X_messy.loc[mask, col] = np.nan

# 2. Outliers: 20 distinct random rows get a MedInc of 2.5x the current maximum.
outlier_idx = np.random.choice(n_samples, 20, replace=False)
extreme_income = 2.5 * X_messy['MedInc'].max()
X_messy.loc[outlier_idx, 'MedInc'] = extreme_income

# 3. Categorical column (for demonstration purposes): a randomly assigned region label.
regions = ['coastal', 'inland', 'valley', 'mountain']
X_messy['Region'] = np.random.choice(regions, size=n_samples)

# 4. Put AveRooms on a different scale (x10) to demonstrate scaling issues.
X_messy['AveRooms'] = X_messy['AveRooms'].mul(10)

# Report the modified dataset (one item per output line).
print(
    f"Modified dataset shape: {X_messy.shape}",
    "\nMissing values per column:",
    X_messy.isnull().sum(),
    "\nSample of the messy dataset:",
    sep="\n",
)
# Last expression of the cell: rendered by Jupyter as a rich table.
X_messy.head()


In [None]:
# Figure 1: MedInc boxplots, original data (left) vs. data with injected outliers (right).
plt.figure(figsize=(12, 5))
medinc_panels = [
    (X, 'Original MedInc Distribution'),
    (X_messy, 'MedInc with Outliers'),
]
for slot, (frame, panel_title) in enumerate(medinc_panels, start=1):
    plt.subplot(1, 2, slot)
    sns.boxplot(x=frame['MedInc'])
    plt.title(panel_title)
plt.tight_layout()
plt.show()

# Figure 2: AveRooms histograms (with KDE), original (left) vs. rescaled (right).
plt.figure(figsize=(12, 5))
averooms_panels = [
    (X, 'Original AveRooms Distribution'),
    (X_messy, 'Scaled AveRooms Distribution'),
]
for slot, (frame, panel_title) in enumerate(averooms_panels, start=1):
    plt.subplot(1, 2, slot)
    sns.histplot(frame['AveRooms'], kde=True)
    plt.title(panel_title)
plt.tight_layout()
plt.show()


In [None]:
# Bar chart of how many rows fall into each Region category.
plt.figure(figsize=(8, 5))
region_ax = sns.countplot(data=X_messy, x='Region')
region_ax.set_title('Distribution of Region Categories')
# Tilt the category labels on the x axis by 45 degrees.
plt.setp(region_ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:
# Hold out 20% of the messy housing data for testing; the fixed seed keeps the split repeatable.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_messy, y, **split_options)

# Report the size of each partition.
print(
    f"Training set shape: {X_train.shape}",
    f"Testing set shape: {X_test.shape}",
    sep="\n",
)

# Per-column NaN counts in the training partition.
print("\nMissing values in training set:", X_train.isnull().sum(), sep="\n")


In [None]:
# 3.1 Simple Imputation (Mean, Median, Most Frequent, Constant)
#
# One strategy per column:
#   MedInc    -> mean
#   HouseAge  -> median
#   AveRooms  -> most frequent value
#   AveBedrms -> constant 0

# Build the three statistic-based imputers in one go, then the constant-fill one.
mean_imputer, median_imputer, mode_imputer = (
    SimpleImputer(strategy=strategy_name)
    for strategy_name in ('mean', 'median', 'most_frequent')
)
constant_imputer = SimpleImputer(strategy='constant', fill_value=0)

# Each imputer is fitted on, and applied to, a single-column frame of the training data.
medinc_frame = X_train[['MedInc']]
houseage_frame = X_train[['HouseAge']]
averooms_frame = X_train[['AveRooms']]
avebedrms_frame = X_train[['AveBedrms']]

X_train_mean_imputed = mean_imputer.fit_transform(medinc_frame)
X_train_median_imputed = median_imputer.fit_transform(houseage_frame)
X_train_mode_imputed = mode_imputer.fit_transform(averooms_frame)
X_train_constant_imputed = constant_imputer.fit_transform(avebedrms_frame)

# Sanity check on MedInc: compare the mean and the NaN count before and after imputation.
medinc_raw = X_train['MedInc']
print(f"Original MedInc mean: {medinc_raw.mean()}")
print(f"After mean imputation: {X_train_mean_imputed.mean()}")
print(f"Missing values before imputation: {medinc_raw.isnull().sum()}")
print(f"Missing values after imputation: {np.isnan(X_train_mean_imputed).sum()}")


In [None]:
# 3.2 KNN Imputation (more advanced)
# Each missing entry is filled in from the k nearest neighbouring rows.

# Create the KNN imputer
knn_imputer = KNNImputer(n_neighbors=5)

# Select numerical columns for KNN imputation (the categorical Region column is excluded)
numerical_cols = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
X_train_knn = X_train[numerical_cols].copy()

# Apply KNN imputation (fit on the training data and transform it)
X_train_knn_imputed = knn_imputer.fit_transform(X_train_knn)

# Convert back to a DataFrame for comparison. The original row index is kept:
# X_train carries the shuffled index produced by train_test_split, and without
# `index=` the new frame would get a fresh 0..n-1 index that no longer lines
# up with X_train / y_train in any label-aligned operation (concat, join, ...).
X_train_knn_imputed_df = pd.DataFrame(
    X_train_knn_imputed,
    columns=numerical_cols,
    index=X_train_knn.index,
)

# Guard: imputation must not add, drop or reorder rows.
assert X_train_knn_imputed_df.index.equals(X_train.index)

# Compare original vs KNN imputed values for MedInc
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
sns.histplot(X_train['MedInc'].dropna(), kde=True, color='blue')
plt.title('Original MedInc (without NaNs)')

plt.subplot(1, 2, 2)
sns.histplot(X_train_knn_imputed_df['MedInc'], kde=True, color='green')
plt.title('KNN Imputed MedInc')

plt.tight_layout()
plt.show()

print(f"Original MedInc mean: {X_train['MedInc'].mean()}")
print(f"After KNN imputation mean: {X_train_knn_imputed_df['MedInc'].mean()}")
print(f"Missing values before imputation: {X_train['MedInc'].isnull().sum()}")
print(f"Missing values after imputation: {X_train_knn_imputed_df['MedInc'].isnull().sum()}")


In [None]:
# Start from the KNN-imputed training data (no NaNs left in the numeric columns).
X_train_cleaned = X_train_knn_imputed_df.copy()

# Three scalers to compare:
#   4.1 StandardScaler - mean=0, std=1
#   4.2 MinMaxScaler   - scales to a range, [0, 1] by default
#   4.3 RobustScaler   - uses quantiles, more robust to outliers
std_scaler, minmax_scaler, robust_scaler = StandardScaler(), MinMaxScaler(), RobustScaler()

# Fit each scaler on the cleaned training data and transform it.
X_train_std_scaled = std_scaler.fit_transform(X_train_cleaned)
X_train_minmax_scaled = minmax_scaler.fit_transform(X_train_cleaned)
X_train_robust_scaled = robust_scaler.fit_transform(X_train_cleaned)

# Wrap the scaled arrays in DataFrames so columns can be addressed by name.
X_train_std_df, X_train_minmax_df, X_train_robust_df = (
    pd.DataFrame(scaled_values, columns=numerical_cols)
    for scaled_values in (X_train_std_scaled, X_train_minmax_scaled, X_train_robust_scaled)
)

# 2x2 grid of MedInc boxplots (the column with injected outliers), one panel per variant.
scaling_panels = [
    ('Original Data', X_train_cleaned),
    ('StandardScaler', X_train_std_df),
    ('MinMaxScaler', X_train_minmax_df),
    ('RobustScaler', X_train_robust_df),
]
plt.figure(figsize=(15, 10))
for slot, (panel_title, panel_frame) in enumerate(scaling_panels, start=1):
    plt.subplot(2, 2, slot)
    sns.boxplot(x=panel_frame['MedInc'])
    plt.title(panel_title)
plt.tight_layout()
plt.show()


In [None]:
# Print mean / std / min / max of MedInc for the unscaled data and for each scaler's output.
# Each entry pairs the heading to print with the frame whose MedInc column is summarised.
medinc_reports = [
    ("Original MedInc stats:", X_train_cleaned),
    ("\nStandardScaler MedInc stats:", X_train_std_df),
    ("\nMinMaxScaler MedInc stats:", X_train_minmax_df),
    ("\nRobustScaler MedInc stats:", X_train_robust_df),
]
for heading, report_frame in medinc_reports:
    medinc_values = report_frame['MedInc']
    print(heading)
    print(f"Mean: {medinc_values.mean():.4f}")
    print(f"Std: {medinc_values.std():.4f}")
    print(f"Min: {medinc_values.min():.4f}")
    print(f"Max: {medinc_values.max():.4f}")


In [None]:
# 5.1 Label Encoding (for ordinal categories)
# This assigns a numeric value to each category, respecting the category order.

# Create a sample ordinal feature (for demonstration)
X_train_cat = X_train.copy()
quality_map = {'low': 0, 'medium': 1, 'high': 2}
X_train_cat['HousingQuality'] = np.random.choice(['low', 'medium', 'high'], size=len(X_train_cat))

# Encode through the explicit mapping. LabelEncoder is not used here: it
# numbers classes in sorted (alphabetical) order, giving high=0, low=1,
# medium=2, which breaks the low < medium < high ordering the codes are meant
# to carry. scikit-learn also documents it as an encoder for target labels
# rather than input features.
X_train_cat['HousingQuality_encoded'] = X_train_cat['HousingQuality'].map(quality_map)

# Display the encoding
print("Label Encoding Results:")
print(pd.crosstab(X_train_cat['HousingQuality'], X_train_cat['HousingQuality_encoded']))

# 5.2 One-Hot Encoding (for nominal categories)
# This creates binary columns for each category

# Apply one-hot encoding to the 'Region' column
onehot_encoder = OneHotEncoder(sparse_output=False)
region_encoded = onehot_encoder.fit_transform(X_train_cat[['Region']])

# Create a DataFrame with the encoded values. It must reuse X_train_cat's
# index: X_train_cat keeps the shuffled index from the train/test split, and
# pd.concat(axis=1) aligns rows by index label. With a default 0..n-1 index the
# one-hot rows would be attached to the wrong samples and extra all-NaN rows
# would appear.
region_encoded_df = pd.DataFrame(
    region_encoded,
    columns=onehot_encoder.get_feature_names_out(['Region']),
    index=X_train_cat.index,
)

# Display the first few rows of the one-hot encoded data
print("\nOne-Hot Encoding Results (first 5 rows):")
print(region_encoded_df.head())

# Check how original categories map to one-hot encoded columns
print("\nCategory mapping:")
for category in onehot_encoder.categories_[0]:
    print(f"Category '{category}' → Column 'Region_{category}'")

# Combine with original data (rows are matched on the shared index)
X_train_with_onehot = pd.concat([X_train_cat, region_encoded_df], axis=1)

# Guard: the combination must neither add rows nor leave any one-hot cell empty.
assert len(X_train_with_onehot) == len(X_train_cat)
assert not X_train_with_onehot[region_encoded_df.columns].isnull().any().any()

print("\nDataFrame with one-hot encoded 'Region' (first 5 rows, selected columns):")
print(X_train_with_onehot[['Region', 'Region_coastal', 'Region_inland', 'Region_mountain', 'Region_valley']].head())


In [None]:
# Feature selection runs on the imputed (NaN-free) numeric training data.
X_select = X_train_cleaned.copy()

# Univariate selection: score every feature against the target with
# f_regression and keep the 4 highest-scoring ones.
selector = SelectKBest(score_func=f_regression, k=4)
X_selected = selector.fit_transform(X_select, y_train)

# Per-feature scores, p-values and the boolean "kept" mask from the fitted selector.
scores, p_values = selector.scores_, selector.pvalues_
support_mask = selector.get_support()

# Tabulate the results, best-scoring feature first.
feature_scores = pd.DataFrame({
    'Feature': X_select.columns,
    'Score': scores,
    'p-value': p_values,
    'Selected': support_mask,
}).sort_values('Score', ascending=False)

print("Feature Selection Results:")
print(feature_scores)

# Horizontal bar chart of the scores, in the same (descending) order as the table.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_scores, x='Score', y='Feature')
plt.title('Feature Importance Scores')
plt.tight_layout()
plt.show()

# Names of the features the selector kept.
selected_features = X_select.columns[support_mask]
print(f"\nSelected features: {', '.join(selected_features)}")


In [None]:
# Start again from the raw (messy) train/test partitions; the pipeline does all the cleaning.
X_train_original, X_test_original = X_train.copy(), X_test.copy()

# Column groups, routed to different preprocessing branches below.
numeric_features = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
categorical_features = ['Region']

# Numeric branch: KNN imputation followed by robust scaling.
numeric_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', RobustScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories not seen during fit are ignored at transform time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column group.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Final pipeline: preprocessing, then keep the 5 best features according to f_regression.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(f_regression, k=5)),
])

# Learn every transformation on the training data only, then reuse the fitted
# pipeline (no refitting) on the test data.
X_train_preprocessed = pipeline.fit_transform(X_train_original, y_train)
X_test_preprocessed = pipeline.transform(X_test_original)

# Shapes before and after preprocessing, one per output line.
print(
    f"Original training data shape: {X_train_original.shape}",
    f"Preprocessed training data shape: {X_train_preprocessed.shape}",
    f"Original test data shape: {X_test_original.shape}",
    f"Preprocessed test data shape: {X_test_preprocessed.shape}",
    sep="\n",
)


In [None]:
# Extending the pipeline with a model (Linear Regression)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# End-to-end pipeline: preprocessing -> top-5 feature selection -> linear regression.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(f_regression, k=5)),
    ('model', LinearRegression()),
])

# Fit on the training partition only.
full_pipeline.fit(X_train_original, y_train)

# Predict on both partitions with the fitted pipeline.
y_train_pred, y_test_pred = (
    full_pipeline.predict(features)
    for features in (X_train_original, X_test_original)
)

# Evaluate: mean squared error and R² for each partition.
train_mse, test_mse = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

print("Model Performance:")
print(f"Training MSE: {train_mse:.4f}")
print(f"Test MSE: {test_mse:.4f}")
print(f"Training R²: {train_r2:.4f}")
print(f"Test R²: {test_r2:.4f}")

# Actual-vs-predicted scatter per partition; the dashed red diagonal marks perfect predictions.
prediction_panels = [
    ('Training Set: Actual vs Predicted', y_train, y_train_pred),
    ('Test Set: Actual vs Predicted', y_test, y_test_pred),
]
plt.figure(figsize=(12, 5))
for slot, (panel_title, actual, predicted) in enumerate(prediction_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.scatter(actual, predicted, alpha=0.5)
    diagonal = [actual.min(), actual.max()]
    plt.plot(diagonal, diagonal, 'r--')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(panel_title)
plt.tight_layout()
plt.show()


In [None]:
# Create an imbalanced classification dataset for demonstration
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from collections import Counter

# Generate imbalanced binary classification data (class weights 90% / 10%)
X_imb, y_imb = make_classification(
    n_samples=1000, n_features=10, n_informative=5, n_redundant=3,
    n_classes=2, weights=[0.9, 0.1], random_state=42
)

# Split the data. `stratify=y_imb` keeps the class proportions the same in the
# training and test partitions; with a small minority class, an unstratified
# random split can noticeably over- or under-represent it on either side.
X_imb_train, X_imb_test, y_imb_train, y_imb_test = train_test_split(
    X_imb, y_imb, test_size=0.2, random_state=42, stratify=y_imb
)

# Check class distribution
train_class_counts = Counter(y_imb_train)
print("Class distribution in training set:")
print(f"Class 0 (majority): {train_class_counts[0]} samples ({train_class_counts[0]/len(y_imb_train):.1%})")
print(f"Class 1 (minority): {train_class_counts[1]} samples ({train_class_counts[1]/len(y_imb_train):.1%})")

# Visualize class imbalance
plt.figure(figsize=(10, 5))
plt.bar(['Class 0 (Majority)', 'Class 1 (Minority)'], [train_class_counts[0], train_class_counts[1]])
plt.title('Class Distribution in Training Set')
plt.ylabel('Number of Samples')
plt.show()


In [None]:
# Resample the imbalanced training data three different ways and tally the classes each time.

# 1. Random Oversampling
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_imb_train, y_imb_train)

# 2. SMOTE (Synthetic Minority Over-sampling Technique)
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_imb_train, y_imb_train)

# 3. Random Undersampling
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_imb_train, y_imb_train)

# Per-class sample counts after each technique.
ros_class_counts, smote_class_counts, rus_class_counts = (
    Counter(labels) for labels in (y_ros, y_smote, y_rus)
)

# One bar chart per variant, left to right; only the first panel carries the y-axis label.
resampling_panels = [
    ('Original Data', train_class_counts),
    ('Random Oversampling', ros_class_counts),
    ('SMOTE', smote_class_counts),
    ('Random Undersampling', rus_class_counts),
]
plt.figure(figsize=(15, 5))
for slot, (panel_title, class_counts) in enumerate(resampling_panels, start=1):
    plt.subplot(1, 4, slot)
    plt.bar(['Class 0', 'Class 1'], [class_counts[0], class_counts[1]])
    plt.title(panel_title)
    if slot == 1:
        plt.ylabel('Number of Samples')
plt.tight_layout()
plt.show()

# Text summary of the same counts.
print("Sample counts after resampling:")
print(f"Original - Class 0: {train_class_counts[0]}, Class 1: {train_class_counts[1]}, Total: {len(y_imb_train)}")
print(f"Random Oversampling - Class 0: {ros_class_counts[0]}, Class 1: {ros_class_counts[1]}, Total: {len(y_ros)}")
print(f"SMOTE - Class 0: {smote_class_counts[0]}, Class 1: {smote_class_counts[1]}, Total: {len(y_smote)}")
print(f"Random Undersampling - Class 0: {rus_class_counts[0]}, Class 1: {rus_class_counts[1]}, Total: {len(y_rus)}")


In [None]:
# Let's visualize the effect of SMOTE in feature space
# We'll focus on just 2 features for visualization

# Use PCA to reduce dimensions for visualization
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the original training features and project both the
# original and the SMOTE-resampled data with that same fitted projection.
pca = PCA(n_components=2)
X_imb_train_2d = pca.fit_transform(X_imb_train)
X_smote_2d = pca.transform(X_smote)

# Marker appearance per class: class 0 small and faint, class 1 larger and more opaque.
class_markers = {
    0: dict(alpha=0.5, s=10),
    1: dict(alpha=0.8, s=30),
}

# (panel title, projected points, class labels, legend text per class)
projection_panels = [
    (
        'Original Imbalanced Data',
        X_imb_train_2d,
        y_imb_train,
        {0: 'Class 0 (Majority)', 1: 'Class 1 (Minority)'},
    ),
    (
        'After SMOTE Oversampling',
        X_smote_2d,
        y_smote,
        {0: 'Class 0', 1: 'Class 1 (with synthetic samples)'},
    ),
]

# Plot original vs SMOTE data side by side.
plt.figure(figsize=(15, 6))
for slot, (panel_title, points, class_labels, legend_text) in enumerate(projection_panels, start=1):
    plt.subplot(1, 2, slot)
    for class_id in (0, 1):
        members = points[class_labels == class_id]
        plt.scatter(
            members[:, 0], members[:, 1],
            label=legend_text[class_id], **class_markers[class_id]
        )
    plt.title(panel_title)
    plt.xlabel('PCA Feature 1')
    plt.ylabel('PCA Feature 2')
    plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Set up cross-validation with our pipeline
from sklearn.model_selection import cross_val_score, KFold

# Back to the housing data: cross-validate on the full messy dataset. All
# preprocessing lives inside the pipeline that is handed to cross_val_score.
X_full = X_messy.copy()

# Preprocessing -> top-5 feature selection -> linear regression.
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(f_regression, k=5)),
    ('model', LinearRegression()),
])

# Five shuffled folds with a fixed seed so the fold assignment is repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One R² score per fold.
cv_scores = cross_val_score(preprocessing_pipeline, X_full, y, cv=kf, scoring='r2')

# Per-fold scores followed by their mean and standard deviation.
print("Cross-Validation R² Scores:")
for fold_number, fold_score in enumerate(cv_scores, start=1):
    print(f"Fold {fold_number}: {fold_score:.4f}")
mean_r2 = np.mean(cv_scores)
print(f"\nMean R²: {mean_r2:.4f}")
print(f"Standard Deviation: {np.std(cv_scores):.4f}")

# Bar per fold, with the mean drawn as a dashed red reference line.
fold_ids = range(1, len(cv_scores) + 1)
plt.figure(figsize=(10, 5))
plt.bar(fold_ids, cv_scores)
plt.axhline(y=mean_r2, color='r', linestyle='--', label=f'Mean R² = {mean_r2:.4f}')
plt.xlabel('Fold')
plt.ylabel('R² Score')
plt.title('Cross-Validation Results')
plt.xticks(fold_ids)
plt.legend()
plt.show()
