In [3]:
# --- Standard Library ---
import time
import warnings

# --- Core Data / Plotting ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Preprocessing ---
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, PowerTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# --- Imbalance Handling ---
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline # Special pipeline for samplers

# --- Modeling ---
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import miceforest as mf

# --- Evaluation ---
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# --- Settings ---
# Silence every warning category for the remainder of the session.
warnings.simplefilter("ignore")
# Never truncate the column list when a wide DataFrame is displayed.
pd.options.display.max_columns = None

print("--- All libraries imported successfully. ---")


--- All libraries imported successfully. ---


In [4]:
# --- Function : Upgraded Memory Reducer from EDA---

def _narrowest_signed_int_bits(lo, hi):
    """Return the width (8, 16, 32 or 64) of the narrowest signed int holding [lo, hi].

    Returns None when even int64 cannot hold the range (this includes NaN
    bounds, for which every comparison is False).
    """
    for bits in (8, 16, 32, 64):
        limit = 2 ** (bits - 1)
        # Inclusive lower bound, exclusive upper bound == the exact iinfo range.
        if -limit <= lo and hi < limit:
            return bits
    return None


def reduce_memory_usage(df):
    """
    Downcast the numeric columns of a DataFrame to reduce memory usage.

    The DataFrame is modified in place and also returned.

    - The 'class' column is always skipped.
    - Signed integer columns are narrowed to the smallest signed integer type
      that holds their min/max (boundary values such as 127 or -128 included).
      Nullable integer columns stay nullable.
    - Float columns whose non-missing values are all whole numbers are
      converted to the smallest nullable integer type (e.g. Int8) so that
      missing values are preserved.
    - Other float columns are downcast to float32 when their range fits
      (this can lose precision beyond ~7 significant digits), else float64.
    - Every other dtype (object, bool, category, datetime, unsigned, ...) is
      left untouched instead of being pushed through the float logic.
    - Float columns that are entirely missing, or whole-number columns too
      large for int64, are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise.

    Returns
    -------
    pandas.DataFrame
        The same object, with downcast column dtypes.
    """
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f'\nInitial memory usage of the dataframe: {start_mem:.2f} MB')

    for col in df.columns:
        # Skip the categorical 'class' column
        if col == 'class':
            continue

        col_type = df[col].dtype

        if pd.api.types.is_signed_integer_dtype(col_type):
            c_min = df[col].min()
            c_max = df[col].max()
            # Empty or all-missing column: there is no range to size against.
            if pd.isna(c_min) or pd.isna(c_max):
                continue

            bits = _narrowest_signed_int_bits(int(c_min), int(c_max))
            # Keep nullable columns nullable so existing <NA> values survive.
            is_nullable = isinstance(col_type, pd.api.extensions.ExtensionDtype)
            df[col] = df[col].astype(f'Int{bits}' if is_nullable else f'int{bits}')

        elif pd.api.types.is_float_dtype(col_type):
            values = df[col].dropna().to_numpy(dtype='float64')
            if values.size == 0:
                # All-missing column: converting it would only add overhead.
                continue

            c_min = float(values.min())
            c_max = float(values.max())

            # Vectorised "all non-NaN values are whole numbers" check;
            # +/-inf is not a whole number.
            is_whole = bool(np.isfinite(values).all() and (values == np.floor(values)).all())
            bits = _narrowest_signed_int_bits(c_min, c_max) if is_whole else None

            if bits is not None:
                print(f"Column '{col}' is float but contains only whole numbers. Converting to nullable int.")
                # Pandas nullable integer dtypes preserve missing values.
                df[col] = df[col].astype(f'Int{bits}')
            elif is_whole:
                # Whole numbers beyond the int64 range: an integer cast would
                # raise, so the column is left as it is.
                continue
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                # "Real" float column (has decimals) whose range fits float32.
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f'Memory usage after optimization: {end_mem:.2f} MB')
    # Guard the percentage against a zero-sized starting footprint.
    reduction = (100 * (start_mem - end_mem) / start_mem) if start_mem else 0.0
    print(f'Reduced by: {reduction:.2f}%')
    return df

print("--- `reduce_memory_usage` function defined. ---")

--- `reduce_memory_usage` function defined. ---


In [None]:

# --- Function : Our Business Cost Function ---

def total_cost(y_true, y_pred, fp_cost=10, fn_cost=500):
    '''
    Calculates the total business cost of a set of binary predictions.

    Cost 1 (False Positive) = 10   (default ``fp_cost``)
    Cost 2 (False Negative) = 500  (default ``fn_cost``)

    False positives and false negatives are counted directly from the label
    arrays, so the result does not depend on which classes happen to be
    present (all-negative or all-positive inputs need no special casing).

    Parameters
    ----------
    y_true, y_pred : array-like of 0/1 labels with the same number of elements.
        Lists, NumPy arrays and pandas Series (including nullable integer
        Series without missing values) are accepted.
    fp_cost, fn_cost : numbers
        Cost charged per false positive / false negative.

    Returns
    -------
    ``fp_cost * FP + fn_cost * FN``. If the inputs are empty, differ in
    length, contain missing values or contain labels other than 0/1, a
    message is printed and ``np.inf`` is returned so the offending model can
    never be selected as the cheapest one.
    '''
    try:
        # Going through float turns nullable/object label containers into plain
        # NumPy arrays and makes missing values fail loudly right here.
        true_arr = np.asarray(y_true, dtype=float).ravel()
        pred_arr = np.asarray(y_pred, dtype=float).ravel()
    except (TypeError, ValueError) as e:
        print(f"Error in confusion matrix calculation: {e}. Returning high cost.")
        return np.inf

    problem = None
    if true_arr.size == 0:
        problem = "no samples to evaluate"
    elif true_arr.shape != pred_arr.shape:
        problem = f"inconsistent number of samples: {true_arr.size} vs {pred_arr.size}"
    elif not (np.isin(true_arr, (0, 1)).all() and np.isin(pred_arr, (0, 1)).all()):
        problem = "labels must be binary 0/1"

    if problem is not None:
        print(f"Error in confusion matrix calculation: {problem}. Returning high cost.")
        return np.inf

    fp = int(np.count_nonzero((true_arr == 0) & (pred_arr == 1)))
    fn = int(np.count_nonzero((true_arr == 1) & (pred_arr == 0)))

    cost = (fp_cost * fp) + (fn_cost * fn)
    return cost

print("--- `total_cost` function defined. ---")

--- `total_cost` function defined. ---


In [6]:
# --- 1. Load Data ---
# The literal string 'na' marks missing values in this CSV.
file_path = "https://raw.githubusercontent.com/avnyadav/sensor-fault-detection/main/aps_failure_training_set1.csv"
try:
    df = pd.read_csv(file_path, na_values='na')
except Exception as e:
    print(f"Error loading data: {e}")
    # Nothing downstream can run without the data, so re-raise.
    raise
else:
    print(f"--- Successfully loaded data from URL. Initial shape: {df.shape} ---")

# --- 2. Reduce Memory ---
# Uses the `reduce_memory_usage` helper defined earlier in this notebook.
df = reduce_memory_usage(df)

# --- 3. Encode Target Variable ---
print("\n--- Encoding target variable 'class' ---")
if 'class' in df.columns:
    initial_class_counts = df['class'].value_counts()
    print(f"Initial class counts:\n{initial_class_counts}")

    # 'neg' -> 0, 'pos' -> 1; any other label becomes NaN
    df['class'] = df['class'].map({'neg': 0, 'pos': 1})

    if not df['class'].isna().any():
        # Nullable Int8, consistent with the other integer-like columns
        df['class'] = df['class'].astype('Int8')
        print(f"Encoded class counts:\n{df['class'].value_counts()}")
    else:
        print("Warning: Found NaN values in 'class' column after mapping. Check for unexpected values.")
else:
    print("Error: 'class' column not found.")

--- Successfully loaded data from URL. Initial shape: (36188, 171) ---

Initial memory usage of the dataframe: 48.73 MB
Column 'ab_000' is float but contains only whole numbers. Converting to nullable int.
Column 'ac_000' is float but contains only whole numbers. Converting to nullable int.
Column 'ad_000' is float but contains only whole numbers. Converting to nullable int.
Column 'ae_000' is float but contains only whole numbers. Converting to nullable int.
Column 'af_000' is float but contains only whole numbers. Converting to nullable int.
Column 'ag_000' is float but contains only whole numbers. Converting to nullable int.
Column 'ag_001' is float but contains only whole numbers. Converting to nullable int.
Column 'ag_002' is float but contains only whole numbers. Converting to nullable int.
Column 'ag_003' is float but contains only whole numbers. Converting to nullable int.
Column 'ag_004' is float but contains only whole numbers. Converting to nullable int.
Column 'ag_005' is f

In [7]:

# --- 4. Initial Feature Cleanup ---
print("\n--- Starting initial feature cleanup ---")

# Separate the features (X) from the target (y)
if 'class' not in df.columns:
    raise ValueError("Cannot proceed without 'class' column for X/y split.")
X = df.drop(columns='class')
y = df['class']

print(f"Initial number of features: {X.shape[1]}")

# --- 4a. Drop High-NaN Columns (> 70% missing) ---
missing_percentage = (X.isna().sum() / len(X)) * 100
cols_to_drop_nan = missing_percentage.loc[missing_percentage > 70].index

if len(cols_to_drop_nan) == 0:
    print("No columns had > 70% missing values.")
else:
    X = X.drop(columns=cols_to_drop_nan)
    print(f"Dropped {len(cols_to_drop_nan)} columns with > 70% missing values.")
    print(f"Columns dropped: {list(cols_to_drop_nan)}")

# --- 4b. Drop Constant Columns (Zero Variance) ---
# nunique() ignores NaN, so a count of 0 means "all missing" and 1 means "constant".
unique_counts = X.nunique()
cols_to_drop_constant = unique_counts.loc[unique_counts <= 1].index

if len(cols_to_drop_constant) == 0:
    print("No constant columns found.")
else:
    X = X.drop(columns=cols_to_drop_constant)
    print(f"Dropped {len(cols_to_drop_constant)} columns with constant values (zero variance).")
    print(f"Columns dropped: {list(cols_to_drop_constant)}")

# --- 5. Final Check ---
print("\n--- Cleanup complete ---")
print(f"Final shape of X: {X.shape}")
print(f"Final shape of y: {y.shape}")

# Every surviving feature is numeric; keep the names for the ColumnTransformer.
numeric_features = X.columns.tolist()
print(f"Total numeric features remaining: {len(numeric_features)}")


--- Starting initial feature cleanup ---
Initial number of features: 170
Dropped 7 columns with > 70% missing values.
Columns dropped: ['ab_000', 'bn_000', 'bo_000', 'bp_000', 'bq_000', 'br_000', 'cr_000']
Dropped 1 columns with constant values (zero variance).
Columns dropped: ['cd_000']

--- Cleanup complete ---
Final shape of X: (36188, 162)
Final shape of y: (36188,)
Total numeric features remaining: 162


# Analysis of Cell 2 Achievements

## What We Achieved

* **Data Loaded:** We successfully loaded the full dataset from the URL, which started with **36,188 rows** and **171 columns** (170 features + 1 target).

* **Memory Optimized:** The `reduce_memory_usage` function worked as intended. Log messages such as *"Column 'ab_000' is float but contains only whole numbers. Converting to nullable int."* confirm that whole-number float columns (e.g. `ab_000`, `ad_000`, `ae_000`) were correctly converted to nullable integer types.
    * This stores "integer-like" data more efficiently while correctly preserving `NaN` values.
    * Memory usage was reduced from 48.73 MB to 30.58 MB (a **37.25% reduction**).

* **Target Encoded:** We've successfully converted the `class` column from 'neg'/'pos' to binary **0/1**.

* **Initial Pruning:** We "pruned" our feature set `X` by removing 8 problematic columns:
    * **High-NaN Columns ( $>70\%$):** Dropped 7 columns (`['ab_000', 'bn_000', 'bo_000', 'bp_000', 'bq_000', 'br_000', 'cr_000']`). These are considered noise, not signal.
    * **Constant Column (Zero Variance):** Dropped 1 column (`['cd_000']`). This column had no predictive power.

In [8]:
# --- 6. Create Train-Test Split (The "Data Leakage Barrier") ---
print("\n--- Splitting data into Train and Test sets ---")

# stratify=y keeps the rare positive class at the same proportion
# in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- 7. Verify the Split ---
print("\n--- Verification of stratified split: ---")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

# Class distribution of the training partition
train_counts = y_train.value_counts()
print("\n--- Training Set Class Distribution ---")
print(train_counts)
print(f"Positive class percentage in train: {(train_counts[1] / len(y_train) * 100):.2f}%")

# Class distribution of the test partition
test_counts = y_test.value_counts()
print("\n--- Test Set Class Distribution ---")
print(test_counts)
print(f"Positive class percentage in test: {(test_counts[1] / len(y_test) * 100):.2f}%")


--- Splitting data into Train and Test sets ---

--- Verification of stratified split: ---
X_train shape: (28950, 162)
X_test shape: (7238, 162)

--- Training Set Class Distribution ---
class
0    28150
1      800
Name: count, dtype: Int64
Positive class percentage in train: 2.76%

--- Test Set Class Distribution ---
class
0    7038
1     200
Name: count, dtype: Int64
Positive class percentage in test: 2.76%


## Stratification:

* Training Set: Has **800 'pos'** samples (80% of the total 1000).

* Test Set: Has **200 'pos'** samples (20% of the total 1000).

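* The **2.76% positive class ratio** is identical in both sets, so the test set has the same class balance as the training set. (Stratification guarantees matching class proportions only; it does not guarantee that the feature distributions are identical.)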

In [9]:
# --- 8. Define Phase 1 Pipelines (Pipelines 1, 2, 3) ---
print("--- Defining Preprocessing & Modeling Pipelines ---")

# --- Define the Model we will use as our "Judge" ---
# We use scale_pos_weight for imbalance, as it's often faster
# and a good alternative to SMOTE. Let's calculate it.
# scale_pos_weight = total_negatives / total_positives
train_counts = y_train.value_counts()
scale_pos_weight_value = train_counts[0] / train_counts[1]
print(f"Calculated 'scale_pos_weight' for XGBoost: {scale_pos_weight_value:.2f}")

# Template for the constant "judge" model.
# We set eval_metric to 'logloss' to avoid a common warning.
# NOTE: this object is only a template. sklearn's Pipeline.fit does NOT clone
# its steps, so each pipeline below receives its own clone(); sharing this one
# instance would make every later fit overwrite the model inside the earlier
# pipelines.
constant_model = XGBClassifier(
    scale_pos_weight=scale_pos_weight_value,
    random_state=42,
    eval_metric='logloss'
)

# === Pipeline 1: The "Reference" (Control Group) ===
# Hypothesis: Filling NaNs with 0 is the best strategy.
# Steps: 1. Impute(0) -> 2. Scale
preprocessor_1_ref = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
            ('scaler', RobustScaler())])
        , numeric_features)
    ],
    remainder='passthrough'
)

# === Pipeline 2: "EDA-Informed" (Median + Skew Correction) ===
# Hypothesis: Fixing skew is more important.
# Steps: 1. Impute(Median) -> 2. Transform(Yeo-Johnson) -> 3. Scale
preprocessor_2_eda = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('transformer', PowerTransformer(method='yeo-johnson')),
            ('scaler', RobustScaler()) ])
        , numeric_features)
    ],
    remainder='passthrough'
)

# === Pipeline 3: "EDA-Informed" (KNN + Skew Correction) ===
# Hypothesis: Smart imputer (KNN) after fixing skew is best.
# Steps: 1. Transform(Yeo-Johnson) -> 2. Impute(KNN) -> 3. Scale
preprocessor_3_knn = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('transformer', PowerTransformer(method='yeo-johnson')),
            ('imputer', KNNImputer(n_neighbors=5)),
            ('scaler', RobustScaler())
        ]), numeric_features)
    ],
    remainder='passthrough'
)

# --- Now, create the *full* pipelines (Prep + Model) ---
# NOTE: We are using XGBoost's built-in `scale_pos_weight`
# which is simpler and faster than adding a SMOTE step.
# This is a common and robust way to handle imbalance.
# Each pipeline gets an independent, unfitted copy of the judge model
# (same hyper-parameters) so it stays usable after the others are fitted.

pipeline_1 = Pipeline(steps=[
    ('preprocessor', preprocessor_1_ref),
    ('model', clone(constant_model))
])

pipeline_2 = Pipeline(steps=[
    ('preprocessor', preprocessor_2_eda),
    ('model', clone(constant_model))
])

pipeline_3 = Pipeline(steps=[
    ('preprocessor', preprocessor_3_knn),
    ('model', clone(constant_model))
])

print("\n--- Successfully defined 3 complete pipelines: ---")
print("1. pipeline_1 (Reference: Impute 0 -> Scale)")
print("2. pipeline_2 (EDA-Median: Impute Median -> Transform -> Scale)")
print("3. pipeline_3 (EDA-KNN: Transform -> Impute KNN -> Scale)")
print("\nAll pipelines end with XGBClassifier using scale_pos_weight.")

--- Defining Preprocessing & Modeling Pipelines ---
Calculated 'scale_pos_weight' for XGBoost: 35.19

--- Successfully defined 3 complete pipelines: ---
1. pipeline_1 (Reference: Impute 0 -> Scale)
2. pipeline_2 (EDA-Median: Impute Median -> Transform -> Scale)
3. pipeline_3 (EDA-KNN: Transform -> Impute KNN -> Scale)

All pipelines end with XGBClassifier using scale_pos_weight.


# Analysis of Cell 9 Output

## What We Achieved

* **Pipelines Built:** We have successfully defined our first three experimental pipelines, encapsulating our competing hypotheses.
    * **Hypothesis 1 (Reference):** `pipeline_1` is ready to test if `Impute(0) -> Scale` is the best path.
    * **Hypothesis 2 (EDA-Median):** `pipeline_2` is ready to test if `Impute(Median) -> Transform -> Scale` is better.
    * **Hypothesis 3 (EDA-KNN):** `pipeline_3` is ready to test if `Transform -> Impute(KNN) -> Scale` is the best.

* **Imbalance Handled:** We've made a strategic choice to use XGBoost's built-in `scale_pos_weight` parameter (calculated at $35.19$). This is a powerful and computationally efficient way to force the model to pay $35.19\text{x}$ more attention to the rare positive class, directly addressing our core problem (instead of using SMOTE).

---

## Key Findings

* All our "test tubes" are labeled and sitting in the rack. We are now ready to run the actual experiment by fitting these pipelines to the training data and using them to predict on the test data.

In [10]:
# We will store the results of our experiments in this dictionary
phase_1_results = {}


def evaluate_pipeline(number, result_key, description, pipeline, note=None):
    """Fit one pipeline on the training split and score it on the test split.

    Prints the timing, the business cost and the classification report, and
    records the cost in ``phase_1_results[result_key]``. Any exception is
    reported and the recorded cost becomes ``inf`` so the remaining
    experiments still run.

    Parameters
    ----------
    number : int
        Pipeline number used in the log messages.
    result_key : str
        Key under which the cost is stored in ``phase_1_results``.
    description : str
        Human-readable summary of the preprocessing steps (log header only).
    pipeline : estimator
        Object exposing ``fit`` and ``predict``.
    note : str, optional
        Extra line printed right after the header.

    Returns
    -------
    tuple
        ``(cost, y_pred)``; ``cost`` is ``inf`` and ``y_pred`` is ``None`` when
        the step that produces them was never reached.
    """
    print(f"--- Training Pipeline {number} ({description}) ---")
    if note is not None:
        print(note)
    start_time = time.time()

    cost, y_pred = float('inf'), None
    try:
        # Train the pipeline
        pipeline.fit(X_train, y_train)

        # Get predictions on the test set
        y_pred = pipeline.predict(X_test)

        # Calculate the total cost and store the result
        cost = total_cost(y_test, y_pred)
        phase_1_results[result_key] = cost

        end_time = time.time()
        print(f"Pipeline {number} finished in {end_time - start_time:.2f} seconds.")
        print(f"COST for Pipeline {number}: ${cost:,.0f}")

        print(f"\nClassification Report for Pipeline {number}:")
        # We must convert y_test (nullable int) to a standard int for the report
        print(classification_report(y_test.astype(int), y_pred, target_names=['Class 0 (neg)', 'Class 1 (pos)']))

    except Exception as e:
        print(f"Error training Pipeline {number}: {e}")
        phase_1_results[result_key] = float('inf') # Assign high cost on failure

    print("\n" + "="*80 + "\n")
    return cost, y_pred


# NOTE(review): the pipelines are compared on the held-out test set, so the
# winning cost is an optimistic estimate; consider a validation split or
# cross-validation for the selection step.

# --- 1. Fit and Evaluate Pipeline 1 (Reference) ---
cost_1, y_pred_1 = evaluate_pipeline(
    1, 'Pipeline 1 (Reference)', "Reference: Impute 0 -> Scale", pipeline_1
)

# --- 2. Fit and Evaluate Pipeline 2 (EDA-Median) ---
cost_2, y_pred_2 = evaluate_pipeline(
    2, 'Pipeline 2 (EDA-Median)', "EDA-Median: Impute Median -> Transform -> Scale", pipeline_2
)

# --- 3. Fit and Evaluate Pipeline 3 (EDA-KNN) ---
cost_3, y_pred_3 = evaluate_pipeline(
    3, 'Pipeline 3 (EDA-KNN)', "EDA-KNN: Transform -> Impute KNN -> Scale", pipeline_3,
    note="NOTE: This pipeline will take the longest due to KNN Imputer."
)

print("--- Comparison of Pipeline 1, 2, and 3 ---")
print(phase_1_results)

--- Training Pipeline 1 (Reference: Impute 0 -> Scale) ---
Pipeline 1 finished in 9.02 seconds.
COST for Pipeline 1: $19,820

Classification Report for Pipeline 1:
               precision    recall  f1-score   support

Class 0 (neg)       0.99      1.00      0.99      7038
Class 1 (pos)       0.83      0.81      0.82       200

     accuracy                           0.99      7238
    macro avg       0.91      0.90      0.91      7238
 weighted avg       0.99      0.99      0.99      7238



--- Training Pipeline 2 (EDA-Median: Impute Median -> Transform -> Scale) ---
Pipeline 2 finished in 14.10 seconds.
COST for Pipeline 2: $20,350

Classification Report for Pipeline 2:
               precision    recall  f1-score   support

Class 0 (neg)       0.99      1.00      0.99      7038
Class 1 (pos)       0.82      0.80      0.81       200

     accuracy                           0.99      7238
    macro avg       0.91      0.90      0.90      7238
 weighted avg       0.99      0.99      