In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# --- Preprocessing ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, PowerTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# --- Imbalance Handling ---
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline # Special pipeline for samplers

# --- Modeling ---
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import miceforest as mf

# --- Evaluation ---
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# --- Settings ---
# Install a catch-all "ignore" filter so no warning category is shown.
# NOTE(review): this also hides deprecation/convergence warnings — confirm that is intended.
warnings.simplefilter("ignore")
# None removes the limit on how many columns pandas displays.
pd.options.display.max_columns = None

print("--- All libraries imported successfully. ---")


--- All libraries imported successfully. ---


In [4]:
# --- Function : Upgraded Memory Reducer from EDA---

def _smallest_signed_int_bits(lo, hi):
    """Return the narrowest signed width (8, 16, 32 or 64) whose range
    contains both `lo` and `hi`, or None when even 64 bits is not enough.

    Bounds are compared as exact Python ints and are inclusive, so a value
    sitting exactly on a type limit (e.g. 127 or -128) still fits that type.
    """
    lo, hi = int(lo), int(hi)  # callers only pass whole-number values
    for bits in (8, 16, 32, 64):
        limit = 1 << (bits - 1)
        if -limit <= lo and hi <= limit - 1:
            return bits
    return None


def reduce_memory_usage(df, skip_cols=('class',)):
    """
    Downcast the numeric columns of `df` in place to reduce memory usage.

    Rules applied per column:
    - Columns listed in `skip_cols` are never touched.
    - Non-numeric columns (object/string, bool, datetime, category, ...) are
      left unchanged.
    - Integer columns are cast to the narrowest signed integer type that
      holds their min/max; nullable integer columns stay nullable.
    - Float columns whose non-missing values are all finite whole numbers
      are converted to a nullable integer type (e.g. 'Int32') so that
      missing values are preserved.
      NOTE(review): nullable integers carry a validity mask, so the widest
      targets may not be smaller than the float they replace — confirm the
      trade-off is acceptable.
    - Remaining float columns are cast to float32 when their range fits.
      This is a lossy cast (float32 has less precision than float64).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    skip_cols : str or iterable of column labels, default ('class',)
        Columns to leave untouched.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, after downcasting.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(skip_cols, str):
        skip_cols = (skip_cols,)
    skip_cols = set(skip_cols)

    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f'\nInitial memory usage of the dataframe: {start_mem:.2f} MB')

    for col in df.columns:
        if col in skip_cols:
            continue

        col_type = df[col].dtype
        is_int = pd.api.types.is_integer_dtype(col_type)
        is_float = pd.api.types.is_float_dtype(col_type)
        if not (is_int or is_float):
            # bool / datetime / category / string columns: nothing to downcast
            continue

        is_nullable = isinstance(col_type, pd.api.extensions.ExtensionDtype)
        current_bytes = getattr(col_type, 'itemsize', 8)
        non_null = df[col].dropna()

        if is_int:
            if non_null.empty:
                continue
            bits = _smallest_signed_int_bits(non_null.min(), non_null.max())
            # Only cast when the target is strictly narrower; this avoids
            # widening e.g. a uint8 column that needs int16.
            if bits is not None and bits // 8 < current_bytes:
                df[col] = df[col].astype(f'Int{bits}' if is_nullable else f'int{bits}')
            continue

        # --- Float column ---
        values = non_null.to_numpy(dtype='float64')

        # Vectorised whole-number test; inf and all-missing columns do not qualify.
        is_whole = (
            values.size > 0
            and bool(np.isfinite(values).all())
            and bool((values == np.trunc(values)).all())
        )
        # None when the values exceed the int64 range: keep those as floats.
        bits = _smallest_signed_int_bits(values.min(), values.max()) if is_whole else None

        if bits is not None:
            print(f"Column '{col}' is float but contains only whole numbers. Converting to nullable int.")
            df[col] = df[col].astype(f'Int{bits}')
            continue

        # "Real" float column (has decimals, inf, or is entirely missing).
        f32 = np.finfo(np.float32)
        fits_float32 = values.size == 0 or (
            f32.min <= values.min() and values.max() <= f32.max
        )
        if fits_float32 and current_bytes > 4:
            df[col] = df[col].astype('Float32' if is_nullable else np.float32)

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f'Memory usage after optimization: {end_mem:.2f} MB')
    # Guard the ratio so a zero-byte frame reports 0% instead of nan.
    reduction_pct = (100 * (start_mem - end_mem) / start_mem) if start_mem > 0 else 0.0
    print(f'Reduced by: {reduction_pct:.2f}%')
    return df

print("--- `reduce_memory_usage` function defined. ---")

--- `reduce_memory_usage` function defined. ---


In [None]:

# --- Function : Our Business Cost Function ---

def total_cost(y_true, y_pred, fp_cost=10, fn_cost=500, pos_label=1):
    '''
    Calculates the total business cost of a set of predictions.

    Cost 1 (False Positive) = 10   (default `fp_cost`)
    Cost 2 (False Negative) = 500  (default `fn_cost`)

    False positives and false negatives are counted directly from the two
    label arrays, so the result is well defined even when only one class is
    present in `y_true` and/or `y_pred`.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same number of samples as `y_true`.
    fp_cost : number, default 10
        Cost charged per false positive.
    fn_cost : number, default 500
        Cost charged per false negative.
    pos_label : scalar, default 1
        Label treated as the positive class; every other label counts as
        negative.

    Returns
    -------
    number
        fp_cost * FP + fn_cost * FN, or `np.inf` (after printing a message)
        when the two inputs do not have the same number of samples.
    '''
    # ravel() accepts column vectors such as an (n, 1) array.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if y_true.shape != y_pred.shape:
        # Keep the original best-effort contract: report and return a high cost.
        print(
            "Error in confusion matrix calculation: "
            f"y_true has {y_true.size} samples but y_pred has {y_pred.size}. "
            "Returning high cost."
        )
        return np.inf

    # Force a boolean mask so object-dtype inputs (e.g. nullable pandas ints)
    # behave the same as plain integer arrays.
    actual_pos = np.asarray(y_true == pos_label, dtype=bool)
    predicted_pos = np.asarray(y_pred == pos_label, dtype=bool)

    fp = np.sum(~actual_pos & predicted_pos)
    fn = np.sum(actual_pos & ~predicted_pos)

    cost = (fp_cost * fp) + (fn_cost * fn)
    return cost

print("--- `total_cost` function defined. ---")

--- `total_cost` function defined. ---


In [6]:
# --- 1. Load Data ---
file_path = "https://raw.githubusercontent.com/avnyadav/sensor-fault-detection/main/aps_failure_training_set1.csv"
try:
    # Treat the literal string 'na' as a missing value while parsing.
    df = pd.read_csv(file_path, na_values='na')
    print(f"--- Successfully loaded data from URL. Initial shape: {df.shape} ---")
except Exception as e:
    print(f"Error loading data: {e}")
    # Stop execution if data loading fails
    raise

# --- 2. Reduce Memory ---
# Uses the `reduce_memory_usage` helper defined earlier in this notebook.
df = reduce_memory_usage(df)

# --- 3. Encode Target Variable ---
print("\n--- Encoding target variable 'class' ---")

# Fail fast: every later step needs the target column, so continuing
# without it would only postpone the error.
if 'class' not in df.columns:
    raise ValueError("Error: 'class' column not found.")

initial_class_counts = df['class'].value_counts()
print(f"Initial class counts:\n{initial_class_counts}")

# Map 'neg' to 0 and 'pos' to 1. The result is built in a temporary Series so
# that `df` is left untouched if the mapping turns out to be incomplete.
class_mapping = {'neg': 0, 'pos': 1}
encoded_class = df['class'].map(class_mapping)

# Any label outside the mapping (including a missing label) becomes NaN.
unmapped_rows = encoded_class.isnull()
if unmapped_rows.any():
    unexpected_labels = df.loc[unmapped_rows, 'class'].unique().tolist()
    raise ValueError(
        "Found NaN values in 'class' column after mapping. "
        f"Unexpected values: {unexpected_labels}"
    )

# Use a nullable int type here as well, just for consistency
df['class'] = encoded_class.astype(pd.Int8Dtype())
print(f"Encoded class counts:\n{df['class'].value_counts()}")

--- Successfully loaded data from URL. Initial shape: (36188, 171) ---

Initial memory usage of the dataframe: 48.73 MB
Column 'ab_000' is float but contains only whole numbers. Converting to nullable int.
Column 'ac_000' is float but contains only whole numbers. Converting to nullable int.
Column 'ad_000' is float but contains only whole numbers. Converting to nullable int.
Column 'ae_000' is float but contains only whole numbers. Converting to nullable int.
Column 'af_000' is float but contains only whole numbers. Converting to nullable int.
Column 'ag_000' is float but contains only whole numbers. Converting to nullable int.
Column 'ag_001' is float but contains only whole numbers. Converting to nullable int.
Column 'ag_002' is float but contains only whole numbers. Converting to nullable int.
Column 'ag_003' is float but contains only whole numbers. Converting to nullable int.
Column 'ag_004' is float but contains only whole numbers. Converting to nullable int.
Column 'ag_005' is f

In [7]:

# --- 4. Initial Feature Cleanup ---
print("\n--- Starting initial feature cleanup ---")

# Separate the feature matrix (X) from the target vector (y); stop right away
# when the target column is absent.
if 'class' not in df.columns:
    raise ValueError("Cannot proceed without 'class' column for X/y split.")
X = df.drop(columns='class')
y = df['class']

print(f"Initial number of features: {len(X.columns)}")

# --- 4a. Drop High-NaN Columns (> 70% missing) ---
# Percentage of missing entries per feature, then the labels above the cut-off.
missing_percentage = X.isna().sum().div(len(X)).mul(100)
cols_to_drop_nan = missing_percentage.loc[missing_percentage.gt(70)].index

if cols_to_drop_nan.empty:
    print("No columns had > 70% missing values.")
else:
    X = X.drop(cols_to_drop_nan, axis=1)
    print(f"Dropped {len(cols_to_drop_nan)} columns with > 70% missing values.")
    print(f"Columns dropped: {list(cols_to_drop_nan)}")

# --- 4b. Drop Constant Columns (Zero Variance) ---
# A feature with at most one distinct non-missing value (zero if it is all NaN)
# cannot separate the classes.
unique_counts = X.nunique()
cols_to_drop_constant = unique_counts.loc[unique_counts.le(1)].index

if cols_to_drop_constant.empty:
    print("No constant columns found.")
else:
    X = X.drop(cols_to_drop_constant, axis=1)
    print(f"Dropped {len(cols_to_drop_constant)} columns with constant values (zero variance).")
    print(f"Columns dropped: {list(cols_to_drop_constant)}")

# --- 5. Final Check ---
print("\n--- Cleanup complete ---")
print(f"Final shape of X: {X.shape}")
print(f"Final shape of y: {y.shape}")

# Keep the surviving feature names for the ColumnTransformer.
# NOTE(review): they are assumed to be numeric — nothing in this cell checks dtypes.
numeric_features = X.columns.tolist()
print(f"Total numeric features remaining: {len(numeric_features)}")


--- Starting initial feature cleanup ---
Initial number of features: 170
Dropped 7 columns with > 70% missing values.
Columns dropped: ['ab_000', 'bn_000', 'bo_000', 'bp_000', 'bq_000', 'br_000', 'cr_000']
Dropped 1 columns with constant values (zero variance).
Columns dropped: ['cd_000']

--- Cleanup complete ---
Final shape of X: (36188, 162)
Final shape of y: (36188,)
Total numeric features remaining: 162


# Analysis of Cell 2 Achievements

## What We Achieved

* **Data Loaded:** We successfully loaded the full dataset from the URL, which started with **36,188 rows** and **171 columns** (170 features + 1 target).

* **Memory Optimized:** The `reduce_memory_usage` function worked as intended. Log messages such as *"Column 'ab_000' is float but contains only whole numbers. Converting to nullable int."* confirm that whole-number float columns (e.g. `ab_000`, `ad_000`, `ae_000`) were correctly converted to nullable integer types.
    * This stores "integer-like" data more efficiently while correctly preserving `NaN` values.
    * Memory usage was reduced from 48.73 MB to 30.58 MB (a **37.25% reduction**).

* **Target Encoded:** We've successfully converted the `class` column from 'neg'/'pos' to binary **0/1**.

* **Initial Pruning:** We "pruned" our feature set `X` by removing 8 problematic columns:
    * **High-NaN Columns ($>70\%$ missing):** Dropped 7 columns (`['ab_000', 'bn_000', 'bo_000', 'bp_000', 'bq_000', 'br_000', 'cr_000']`). With so few observed values, these columns are treated as noise rather than signal.
    * **Constant Column (Zero Variance):** Dropped 1 column (`['cd_000']`). This column had no predictive power.