In [14]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [15]:
# 1. Load dataset
# The CSV path is resolved relative to the kernel's working directory.
dataset_path = "synthetic_device_failure_dataset_v4.csv"
df = pd.read_csv(dataset_path)

In [16]:
# List the column names so the feature/target choice below can be checked against them.
df.columns


Index(['device_id', 'device_name', 'manufacturer', 'device_age_years',
       'usage_hours_per_week', 'maintenance_frequency_per_year',
       'last_maintenance_gap_days', 'error_logs_past_month', 'environment',
       'criticality_level', 'spare_parts_availability', 'failures_past_year',
       'manufacturer_support_rating', 'failure_within_year'],
      dtype='object')

In [17]:
# 2. Define features & target
# The label column is pulled out first; the feature matrix is everything else
# except the device_id column, which is left out of the model inputs.
y = df["failure_within_year"]
excluded_columns = ["device_id", "failure_within_year"]
X = df.drop(columns=excluded_columns)

In [18]:
# 3. Train-test split
# 20% of the rows are held out for evaluation. Stratifying on y keeps the
# class ratio the same in both partitions, and the fixed seed makes the
# split repeatable across runs.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [19]:
# Summary statistics for the numeric columns of the full dataset (not just the training split).
df.describe()

Unnamed: 0,device_id,device_age_years,usage_hours_per_week,maintenance_frequency_per_year,last_maintenance_gap_days,error_logs_past_month,failures_past_year,manufacturer_support_rating,failure_within_year
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,1500.5,6.042667,59.444,4.871,132.788333,9.770667,0.120333,3.118333,0.408
std,866.169729,2.498585,17.922461,3.183248,95.915349,4.639603,0.348177,1.08937,0.491545
min,1.0,0.0,5.0,0.0,20.0,0.0,0.0,1.0,0.0
25%,750.75,4.0,47.0,2.0,72.0,7.0,0.0,2.0,0.0
50%,1500.5,6.0,60.0,4.0,97.0,9.0,0.0,3.0,0.0
75%,2250.25,8.0,72.0,6.0,179.25,12.0,0.0,4.0,1.0
max,3000.0,16.0,127.0,12.0,404.0,31.0,3.0,5.0,1.0


In [22]:
# Preview the first rows to see the categorical values alongside the numeric ones.
df.head()


Unnamed: 0,device_id,device_name,manufacturer,device_age_years,usage_hours_per_week,maintenance_frequency_per_year,last_maintenance_gap_days,error_logs_past_month,environment,criticality_level,spare_parts_availability,failures_past_year,manufacturer_support_rating,failure_within_year
0,1,Ventilator,Fujifilm,7,27,4,117,7,Ward,Medium,Poor,0,2,0
1,2,CT Scanner,Siemens,6,86,4,107,13,ICU,Low,Moderate,0,5,0
2,3,Ultrasound,GE Healthcare,8,84,2,202,17,ICU,Low,Good,0,5,1
3,4,CT Scanner,Mindray,5,66,6,72,12,Diagnostic Center,Medium,Moderate,0,1,0
4,5,CT Scanner,GE Healthcare,6,75,6,74,16,ICU,Medium,Poor,0,3,0


In [23]:
# 4. Identify categorical & numeric columns
categorical_cols = [
    "device_name",
    "manufacturer",
    "environment",
    "criticality_level",
    "spare_parts_availability",
]

# Every feature that is not listed as categorical is treated as numeric;
# the original column order of X is kept.
numeric_cols = []
for column in X.columns:
    if column not in categorical_cols:
        numeric_cols.append(column)

# Preprocessor: one-hot encode the categoricals, standardize the numerics.
# handle_unknown="ignore" keeps the encoder from raising on categories that
# were not present when it was fitted.
one_hot = OneHotEncoder(handle_unknown="ignore")
scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", one_hot, categorical_cols),
        ("num", scaler, numeric_cols),
    ]
)


In [24]:
# =======================
# 1) Logistic Regression
# =======================
# Pipeline stores the step objects it is given without copying them, so passing
# `preprocessor` directly would make every model pipeline share (and refit) one
# ColumnTransformer instance. clone() gives this pipeline its own unfitted copy
# with the same configuration.
logreg = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    # class_weight="balanced" reweights the classes inversely to their frequency.
    ("clf", LogisticRegression(max_iter=500, class_weight="balanced", random_state=42)),
])
logreg.fit(X_train, y_train)

# Evaluate on the held-out split only.
y_pred_log = logreg.predict(X_test)
print("\n=== Logistic Regression ===")
print(classification_report(y_test, y_pred_log))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))



=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       355
           1       0.89      0.87      0.88       245

    accuracy                           0.90       600
   macro avg       0.90      0.90      0.90       600
weighted avg       0.90      0.90      0.90       600

Confusion Matrix:
 [[329  26]
 [ 32 213]]


In [25]:
# =======================
# 2) Random Forest
# =======================
# Pipeline stores the step objects it is given without copying them, so passing
# `preprocessor` directly would make every model pipeline share (and refit) one
# ColumnTransformer instance. clone() gives this pipeline its own unfitted copy
# with the same configuration.
rf = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("clf", RandomForestClassifier(
        n_estimators=200, max_depth=10, random_state=42, class_weight="balanced"
    )),
])
rf.fit(X_train, y_train)

# Evaluate on the held-out split only.
y_pred_rf = rf.predict(X_test)
print("\n=== Random Forest ===")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))



=== Random Forest ===
              precision    recall  f1-score   support

           0       0.89      0.92      0.90       355
           1       0.88      0.83      0.85       245

    accuracy                           0.89       600
   macro avg       0.88      0.88      0.88       600
weighted avg       0.88      0.89      0.88       600

Confusion Matrix:
 [[328  27]
 [ 42 203]]


In [26]:
# =======================
# 3) Gradient Boosting
# =======================
# Pipeline stores the step objects it is given without copying them, so passing
# `preprocessor` directly would make every model pipeline share (and refit) one
# ColumnTransformer instance. clone() gives this pipeline its own unfitted copy
# with the same configuration, which matters here because this pipeline is the
# one that gets persisted to disk.
gb = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    # Library defaults apart from the fixed seed.
    ("clf", GradientBoostingClassifier(random_state=42)),
])
gb.fit(X_train, y_train)

# Evaluate on the held-out split only.
y_pred_gb = gb.predict(X_test)
print("\n=== Gradient Boosting ===")
print(classification_report(y_test, y_pred_gb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gb))



=== Gradient Boosting ===
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       355
           1       0.90      0.87      0.88       245

    accuracy                           0.91       600
   macro avg       0.90      0.90      0.90       600
weighted avg       0.91      0.91      0.91       600

Confusion Matrix:
 [[330  25]
 [ 31 214]]


In [27]:
# Save Gradient Boosting model (best one)
# The whole pipeline (preprocessor + classifier) is persisted, so raw feature
# rows can be passed straight to the reloaded object.
# joblib is imported in the notebook's import cell.
joblib.dump(gb, "best_model_gb.joblib")
print("Saved Gradient Boosting model to best_model_gb.joblib")


Saved Gradient Boosting model to best_model_gb.joblib


In [28]:
# Load trained model (pandas and joblib come from the notebook's import cell).
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts
# from a source you trust.
model = joblib.load("best_model_gb.joblib")

# Example new device (dictionary of feature values)
new_device = {
    "device_id": 5001,
    "device_name": "CT Scanner",
    "manufacturer": "Siemens",
    "device_age_years": 7,
    "usage_hours_per_week": 80,
    "maintenance_frequency_per_year": 2,
    "last_maintenance_gap_days": 120,
    "error_logs_past_month": 15,
    "environment": "Diagnostic Center",
    "criticality_level": "High",
    "spare_parts_availability": "Moderate",
    "failures_past_year": 1,
    "manufacturer_support_rating": 3
}

# Convert to a single-row DataFrame shaped like the training features.
# device_id was excluded from X when the model was trained, so it is dropped
# here as well.
new_df = pd.DataFrame([new_device]).drop(columns=["device_id"])

# Make predictions: the hard label, plus the probability in column 1 of
# predict_proba (the second class, i.e. label 1 for this 0/1 target).
pred_class = model.predict(new_df)[0]
pred_prob = model.predict_proba(new_df)[0, 1]

print("Predicted failure (0=no, 1=yes):", pred_class)
print("Failure probability:", round(pred_prob, 3))


Predicted failure (0=no, 1=yes): 1
Failure probability: 0.977


In [29]:
# Sanity-check the persisted artifact: reload it from disk and confirm it is
# the full preprocessing + classifier pipeline rather than a bare estimator.
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts
# from a source you trust.
model = joblib.load("best_model_gb.joblib")
print(type(model))                # should be sklearn.pipeline.Pipeline
print(model.named_steps.keys())   # should include 'preprocessor' and 'clf' (or similar)

# Fail loudly if the artifact is not what the prediction cell expects.
assert isinstance(model, Pipeline), f"expected a Pipeline, got {type(model)!r}"
assert {"preprocessor", "clf"} <= set(model.named_steps), "pipeline is missing expected steps"


<class 'sklearn.pipeline.Pipeline'>
dict_keys(['preprocessor', 'clf'])
