<a href="https://colab.research.google.com/github/Madhuram99/predictive-maintenance-system/blob/main/predictive_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from IPython import get_ipython
from IPython.display import display
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif
import pickle
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
import random
# Constants
RANDOM_STATE = 42
TEST_SIZE = 0.2
MODEL_PATH = 'predictive_maintenance_model.pkl'
SCALER_PATH = 'scaler.pkl'



In [3]:
pip install faker pandas numpy

Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.1.0


In [4]:
class PredictiveMaintenanceModel:
    def __init__(self):
        self.model = None
        self.scaler = None
        self.feature_selector = None
        self.df = None
        self.selected_features = None

    def load_data(self, file_path):
        """Load and preprocess the data"""
        self.df = pd.read_csv(file_path)
        return self.df

    def preprocess_data(self):
        """Convert date to datetime and extract features"""
        # Convert date format (assuming format is dd-mm-yyyy)
        self.df['date'] = pd.to_datetime(self.df['date'],format='%m/%d/%Y', errors='coerce')
        self.df['date'] = self.df['date'].fillna(pd.NaT)
        # Extract temporal features
        self.df['day_of_week'] = self.df['date'].dt.dayofweek
        self.df['day_of_month'] = self.df['date'].dt.day
        self.df['month'] = self.df['date'].dt.month
        self.df['year'] = self.df['date'].dt.year

        # Extract device type from device ID (first character)
        self.df['device_type'] = self.df['device'].str[0]

        # One-hot encode device type
        self.df = pd.get_dummies(self.df, columns=['device_type'], prefix='device')

        # Drop original date and device columns
        self.df = self.df.drop(['date', 'device'], axis=1)

        return self.df

    def feature_engineering(self):
        """Create additional features from existing ones"""
        # Group by device (using the original device column before it was dropped)
        device_groups = self.df.groupby(self.df.index)  # Using index as proxy for device

        # Create rolling features for each metric
        metric_cols = [f'metric{i}' for i in range(1, 10)]
        for col in metric_cols:
            self.df[f'{col}_rolling_mean_7'] = self.df[col].rolling(window=7, min_periods=1).mean()
            self.df[f'{col}_rolling_std_7'] = self.df[col].rolling(window=7, min_periods=1).std()
            self.df[f'{col}_diff_1'] = self.df[col].diff(1)

        # Drop rows with NaN values from feature engineering
        self.df = self.df.dropna()

        return self.df

    def prepare_data(self):
        """Prepare features and target variable"""
        # Separate features and target
        X = self.df.drop('failure', axis=1)
        y = self.df['failure']

        # Handle class imbalance (assuming failures are rare)
        smote = SMOTE(random_state=RANDOM_STATE)
        X_res, y_res = smote.fit_resample(X, y)

        # Feature selection
        selector = SelectKBest(f_classif, k=20)
        X_selected = selector.fit_transform(X_res, y_res)
        self.selected_features = X.columns[selector.get_support()]

        # Split data into train and test sets
        # Using time-based split instead of random split
        split_idx = int(len(X_selected) * (1 - TEST_SIZE))
        X_train, X_test = X_selected[:split_idx], X_selected[split_idx:]
        y_train, y_test = y_res[:split_idx], y_res[split_idx:]

        # Scale features
        self.scaler = StandardScaler()
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        return X_train_scaled, X_test_scaled, y_train, y_test

    def train_model(self, X_train, y_train):
        """Train the predictive model"""
        # Using Gradient Boosting which often works well for imbalanced data
        self.model = GradientBoostingClassifier(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=5,
            random_state=RANDOM_STATE,
            subsample=0.8
        )

        self.model.fit(X_train, y_train)
        return self.model

    def evaluate_model(self, X_test, y_test):
        """Evaluate model performance"""
        if not self.model:
            raise ValueError("Model has not been trained yet")

        y_pred = self.model.predict(X_test)
        y_prob = self.model.predict_proba(X_test)[:, 1]

        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))
        print("\nROC AUC Score:", roc_auc_score(y_test, y_prob))

        return y_pred, y_prob

    def save_model(self):
        """Save model and scaler to disk"""
        if not self.model or not self.scaler:
            raise ValueError("Model or scaler not initialized")

        with open(MODEL_PATH, 'wb') as f:
            pickle.dump({
                'model': self.model,
                'scaler': self.scaler,
                'selected_features': self.selected_features
            }, f)

        print(f"Model saved to {MODEL_PATH}")

        with open(SCALER_PATH, 'wb') as f:
            pickle.dump(self.scaler, f)
        print(f"Scaler saved to {SCALER_PATH}")

    def load_saved_model(self):
        """Load saved model and scaler from disk"""
        try:
            with open(MODEL_PATH, 'rb') as f:
                saved_data = pickle.load(f)
                self.model = saved_data['model']
                self.scaler = saved_data['scaler']
                self.selected_features = saved_data['selected_features']

            print("Model loaded successfully")
            return True
        except FileNotFoundError:
            print("Saved model not found")
            return False

    def predict_failure(self, device_data):
        """Make predictions on new device data"""
        if not self.model or not self.scaler:
            raise ValueError("Model not loaded or trained")

        # Create DataFrame from input data
        device_df = pd.DataFrame([device_data])

        # Ensure all expected features are present
        for feature in self.selected_features:
            if feature not in device_df.columns:
                device_df[feature] = 0  # Fill missing features with 0

        # Select only the features used in training
        device_df = device_df[self.selected_features]

        # Scale features
        processed_data = self.scaler.transform(device_df)

        # Make prediction
        prediction = self.model.predict(processed_data)
        probability = self.model.predict_proba(processed_data)[:, 1]

        return prediction[0], probability[0]

In [5]:
def run_pipeline(data_path):
    """Run the complete pipeline"""
    # Initialize the model
    pm_model = PredictiveMaintenanceModel()

    # Load data
    df = pm_model.load_data(data_path)

    # Preprocess data
    df = pm_model.preprocess_data()

    # Feature engineering
    df = pm_model.feature_engineering()

    # Prepare data for training
    X_train, X_test, y_train, y_test = pm_model.prepare_data()
    print(f"Selected features: {list(pm_model.selected_features)}")

    # Train model
    model = pm_model.train_model(X_train, y_train)

    # Evaluate model
    y_pred, y_prob = pm_model.evaluate_model(X_test, y_test)

    # Save model
    pm_model.save_model()

    return pm_model


In [6]:
if __name__ == "__main__":
    # Run the pipeline
    trained_model = run_pipeline('predictive_ml.csv')

    # Example prediction
    example_data = {
        'metric1': 215630672,
        'metric2': 55,
        'metric3': 0,
        'metric4': 52,
        'metric5': 6,
        'metric6': 407438,
        'metric7': 0,
        'metric8': 0,
        'metric9': 7,
        'day_of_week': 0,
        'day_of_month': 1,
        'month': 1,
        'year': 2015,
        'device_S': 1,
        'device_W': 0,
        'device_Z': 0,
        'metric1_rolling_mean_7': 215630672,
        'metric1_rolling_std_7': 0,
        'metric1_diff_1': 0,
        'metric2_rolling_mean_7': 55,
        'metric2_rolling_std_7': 0,
        'metric2_diff_1': 0,
        'metric3_rolling_mean_7': 0,
        'metric3_rolling_std_7': 0,
        'metric3_diff_1': 0,
        'metric4_rolling_mean_7': 52,
        'metric4_rolling_std_7': 0,
        'metric4_diff_1': 0,
        'metric5_rolling_mean_7': 6,
        'metric5_rolling_std_7': 0,
        'metric5_diff_1': 0,
        'metric6_rolling_mean_7': 407438,
        'metric6_rolling_std_7': 0,
        'metric6_diff_1': 0,
        'metric7_rolling_mean_7': 0,
        'metric7_rolling_std_7': 0,
        'metric7_diff_1': 0,
        'metric8_rolling_mean_7': 0,
        'metric8_rolling_std_7': 0,
        'metric8_diff_1': 0,
        'metric9_rolling_mean_7': 7,
        'metric9_rolling_std_7': 0,
        'metric9_diff_1': 0
    }

    pred, prob = trained_model.predict_failure(example_data)
    print(f"\nExample Prediction: Failure={pred}, Probability={prob:.4f}")

Selected features: ['metric2', 'metric4', 'metric7', 'metric8', 'day_of_week', 'device_S', 'device_W', 'device_Z', 'metric2_rolling_mean_7', 'metric2_rolling_std_7', 'metric2_diff_1', 'metric4_rolling_mean_7', 'metric4_rolling_std_7', 'metric4_diff_1', 'metric7_rolling_mean_7', 'metric7_rolling_std_7', 'metric7_diff_1', 'metric8_rolling_mean_7', 'metric8_rolling_std_7', 'metric8_diff_1']
Confusion Matrix:
[[    0     0]
 [    5 28237]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      1.00      1.00     28242

    accuracy                           1.00     28242
   macro avg       0.50      0.50      0.50     28242
weighted avg       1.00      1.00      1.00     28242


ROC AUC Score: nan
Model saved to predictive_maintenance_model.pkl
Scaler saved to scaler.pkl

Example Prediction: Failure=0, Probability=0.0144


In [9]:
from google.colab import files
files.download('predictive_maintenance_model.pkl')  # Downloads the model
files.download('scaler.pkl')  # Downloads the scaler (if needed)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import pickle
import warnings
warnings.filterwarnings('ignore')

# Constants
RANDOM_STATE = 42
TEST_SIZE = 0.2
MODEL_PATH = 'predictive_maintenance_model.pkl'
SCALER_PATH = 'scaler.pkl'

class PredictiveMaintenanceModel:
    def __init__(self):
        self.model = None
        self.scaler = None
        self.selected_features = None
        self.df = None

    def load_data(self, file_path):
        """Load dataset and introduce synthetic failure cases."""
        self.df = pd.read_csv(file_path)

        # Introduce synthetic failures
        num_failures = int(0.05 * len(self.df))  # 5% failure cases
        failure_indices = np.random.choice(self.df.index, num_failures, replace=False)
        self.df.loc[failure_indices, 'failure'] = 1
        return self.df

    def preprocess_data(self):
        """Convert date to datetime and extract features."""
        self.df['date'] = pd.to_datetime(self.df['date'], errors='coerce')
        self.df['day_of_week'] = self.df['date'].dt.dayofweek
        self.df['month'] = self.df['date'].dt.month
        self.df['year'] = self.df['date'].dt.year
        # One-hot encode the 'device' column before dropping it
        self.df = pd.get_dummies(self.df, columns=['device'], prefix='device')
        self.df.drop(['date'], axis=1, inplace=True)
        return self.df

    def feature_engineering(self):
        """Generate rolling statistics, lag features, and interaction terms."""
        metric_cols = [col for col in self.df.columns if 'metric' in col]
        for col in metric_cols:
            self.df[f'{col}_rolling_mean_7'] = self.df[col].rolling(window=7, min_periods=1).mean()
            self.df[f'{col}_rolling_std_7'] = self.df[col].rolling(window=7, min_periods=1).std()
            self.df[f'{col}_diff_1'] = self.df[col].diff(1)
        self.df.dropna(inplace=True)
        return self.df

    def prepare_data(self):
        """Prepare dataset, apply SMOTE, and feature selection."""
        X = self.df.drop('failure', axis=1)
        y = self.df['failure']

        smote = SMOTE(random_state=RANDOM_STATE)
        X_res, y_res = smote.fit_resample(X, y)

        selector = SelectKBest(f_classif, k=min(20, X_res.shape[1]))
        X_selected = selector.fit_transform(X_res, y_res)
        self.selected_features = X.columns[selector.get_support()]

        X_train, X_test, y_train, y_test = train_test_split(X_selected, y_res, test_size=TEST_SIZE, random_state=RANDOM_STATE)

        self.scaler = StandardScaler()
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        return X_train_scaled, X_test_scaled, y_train, y_test

    def train_model(self, X_train, y_train):
        """Train ensemble models: Gradient Boosting and XGBoost."""
        self.model = XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=RANDOM_STATE)
        self.model.fit(X_train, y_train)
        return self.model

    def evaluate_model(self, X_test, y_test):
        """Evaluate model performance."""
        y_pred = self.model.predict(X_test)
        y_prob = self.model.predict_proba(X_test)[:, 1]
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))
        print("\nROC AUC Score:", roc_auc_score(y_test, y_prob))
        return y_pred, y_prob

    def save_model(self):
        """Save trained model and scaler."""
        with open(MODEL_PATH, 'wb') as f:
            pickle.dump({'model': self.model, 'scaler': self.scaler, 'selected_features': self.selected_features}, f)
        with open(SCALER_PATH, 'wb') as f:
            pickle.dump(self.scaler, f)
        print("Model and scaler saved.")

    def predict_failure(self, device_data):
        """Predict failure for new device data."""
        device_df = pd.DataFrame([device_data])
        for feature in self.selected_features:
            if feature not in device_df.columns:
                device_df[feature] = 0
        device_df = device_df[self.selected_features]
        processed_data = self.scaler.transform(device_df)
        prediction = self.model.predict(processed_data)
        probability = self.model.predict_proba(processed_data)[:, 1]
        return prediction[0], probability[0]

def run_pipeline(data_path):
    """Run the entire machine learning pipeline."""
    pm_model = PredictiveMaintenanceModel()
    df = pm_model.load_data(data_path)
    df = pm_model.preprocess_data()
    df = pm_model.feature_engineering()
    X_train, X_test, y_train, y_test = pm_model.prepare_data()
    model = pm_model.train_model(X_train, y_train)
    pm_model.evaluate_model(X_test, y_test)
    pm_model.save_model()
    return pm_model

if __name__ == "__main__":
    trained_model = run_pipeline('predictive_ml.csv')
    example_data = {'metric1': 215630672, 'metric2': 55, 'metric3': 0, 'day_of_week': 0, 'month': 1, 'year': 2015}
    pred, prob = trained_model.predict_failure(example_data)
    print(f"Example Prediction: Failure={pred}, Probability={prob:.4f}")


Confusion Matrix:
[[9922  359]
 [8624 1532]]

Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.97      0.69     10281
           1       0.81      0.15      0.25     10156

    accuracy                           0.56     20437
   macro avg       0.67      0.56      0.47     20437
weighted avg       0.67      0.56      0.47     20437


ROC AUC Score: 0.5583118697027853
Model and scaler saved.
Example Prediction: Failure=0, Probability=0.4723


In [10]:
from google.colab import files
files.download('predictive_maintenance_model.pkl')  # Downloads the model
files.download('scaler.pkl')  # Downloads the scaler (if needed)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>