# Maintenance Cost Prediction - Model Development

This notebook documents the end-to-end process of building a machine learning model to predict maintenance costs for industrial machinery.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Set visualization style
sns.set(style="whitegrid")
%matplotlib inline

## 1. Load and Explore Data

In [None]:
# Read the generated dataset from its CSV file (path is relative to the working directory)
data_path = 'data/maintenance_data.csv'
df = pd.read_csv(data_path)

# Report the (rows, columns) shape, then preview the first rows as the cell output
print(f"Dataset Shape: {df.shape}")
df.head()

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Histogram of the 'Maintenance_Cost' column with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='Maintenance_Cost', kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Maintenance Costs')
plt.show()

In [None]:
# Scatter of 'Age' against 'Maintenance_Cost', with points colored by 'Maintenance_Type'
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x='Age',
    y='Maintenance_Cost',
    hue='Maintenance_Type',
    data=df,
    ax=ax,
)
ax.set_title('Machine Age vs Maintenance Cost')
plt.show()

## 3. Model Training and Evaluation

In [None]:
# Column groups used by the model; each group gets its own preprocessing step below
numeric_features = ['Age', 'Usage_Hours', 'Last_Maintenance_Days', 'Technician_Experience']
categorical_features = ['Maintenance_Type', 'Part_Replacement']

# Target is 'Maintenance_Cost'; predictors are every column except the target and 'Machine_ID'
y = df['Maintenance_Cost']
X = df.drop(columns=['Machine_ID', 'Maintenance_Cost'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing: standardize the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' lets the encoder accept categories it did not see during fit.
# NOTE: columns of X not listed in either group are dropped by ColumnTransformer's
# default remainder setting.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

# Chain preprocessing and a 100-tree Random Forest into one estimator, then fit it
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', forest),
])
model.fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"R2 Score: {r2:.4f}")

## 4. Feature Importance

In [None]:
# Extract feature importance from the fitted Random Forest (one value per transformed column)
regressor = model.named_steps['regressor']
importances = regressor.feature_importances_

# Build names in the same order the ColumnTransformer emits its columns:
# the numeric features first, then the one-hot columns generated by the 'cat' encoder.
encoder = model.named_steps['preprocessor'].named_transformers_['cat']
feature_names = (numeric_features +
                 list(encoder.get_feature_names_out(categorical_features)))

# Most important feature first; pd.Series raises if the two lengths disagree
feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
feat_imp.plot(kind='barh', ax=ax)
# barh draws the first row at the bottom of the chart; flip the axis so the
# most important feature appears at the top, matching the sort order above.
ax.invert_yaxis()
ax.set_title('Feature Importance for Maintenance Cost Prediction')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()