# AI-Assisted Production Process Optimization

This notebook explores how production parameters affect defect probability using explainable ML. The dataset is simulated: it mimics a real production line based on industrial assumptions.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

# Use seaborn's "whitegrid" theme for every plot drawn after this point.
sns.set_theme(style="whitegrid")

In [None]:
# CSV location, given relative to the notebook's working directory.
DATA_PATH = "../data/production_data.csv"
df = pd.read_csv(DATA_PATH)
# Show the first rows as a quick sanity check of the load.
df.head()

## Quick checks
- Binary classification target: `defect`
- Expected defect drivers: higher temperature combined with higher line speed, night shifts, and less operator experience.
- We'll remove outliers (IQR), one-hot encode `shift`, then train Logistic Regression and Random Forest.

In [None]:
def remove_outliers_iqr(frame, numeric_cols, whisker_width=1.5):
    """Return a copy of *frame* without rows that are IQR outliers.

    For each column in *numeric_cols* the fences are
    ``Q1 - whisker_width * IQR`` and ``Q3 + whisker_width * IQR`` (both
    inclusive). A row is dropped when its value in any screened column lies
    outside that column's fences.

    All fences are computed on the unfiltered input, so the result does not
    depend on the order of *numeric_cols*. Missing values are not treated as
    outliers: rows with NaN in a screened column are kept so that a later
    imputation step can handle them.

    Args:
        frame: Input DataFrame; it is not modified.
        numeric_cols: Names of the numeric columns to screen.
        whisker_width: Multiple of the IQR used for the fences.

    Returns:
        A new DataFrame with the surviving rows and a fresh 0..n-1 index.

    Raises:
        KeyError: If a name in *numeric_cols* is not a column of *frame*.
    """
    keep = pd.Series(True, index=frame.index)
    for col in numeric_cols:
        values = frame[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - whisker_width * iqr
        upper = q3 + whisker_width * iqr
        # NaN compares False against any bound, so it must be allowed
        # explicitly or missing values would be discarded as "outliers".
        keep = keep & (values.between(lower, upper) | values.isna())
    # Positional boolean mask: robust even if the index has duplicate labels.
    return frame.loc[keep.to_numpy()].reset_index(drop=True)

# Numeric columns screened by the IQR filter.
numeric_cols = ["temperature", "line_speed", "operator_experience", "machine_age"]
df_clean = remove_outliers_iqr(df, numeric_cols)
# Summary statistics of the filtered data.
df_clean.describe()

In [None]:
# Column groups, each routed to its own preprocessing branch below.
numeric_features = ["temperature", "line_speed", "operator_experience", "machine_age"]
categorical_features = ["shift"]

# `defect` is the target; every other column stays in the feature frame.
y = df_clean["defect"]
X = df_clean.drop(columns=["defect"])

# Numeric branch: median-impute missing values, then standardise.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode; levels not seen during fitting are ignored at transform time.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine both branches; output columns follow this order (num, then cat).
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
def _with_preprocessing(estimator):
    """Wrap *estimator* in a pipeline behind the shared `preprocess` step."""
    return Pipeline(steps=[("preprocess", preprocess), ("model", estimator)])


# Linear baseline with class re-weighting.
log_reg = _with_preprocessing(
    LogisticRegression(max_iter=600, class_weight="balanced")
)

# Tree ensemble with a fixed seed, trained on all available cores.
rf = _with_preprocessing(
    RandomForestClassifier(
        n_estimators=300,
        max_depth=12,
        min_samples_split=4,
        random_state=42,
        n_jobs=-1,
    )
)

# Fit both pipelines on the training split (logistic regression first).
for pipeline in (log_reg, rf):
    pipeline.fit(X_train, y_train)

# Report ROC AUC and the per-class metrics on the held-out split.
for name, model in {"LogReg": log_reg, "RandomForest": rf}.items():
    # Column 1 of predict_proba is used as the score for the ROC AUC.
    proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, proba)
    print(name, "AUC:", round(auc, 3))
    print(classification_report(y_test, model.predict(X_test)))

In [None]:
# Feature importance (Random Forest)
# Take the encoder from the fitted forest pipeline itself, so the labels are
# tied to the model whose importances are plotted instead of to whatever
# fitted state the module-level `preprocess` object happens to hold.
fitted_preprocess = rf.named_steps["preprocess"]
encoder = fitted_preprocess.named_transformers_["cat"]["encoder"]
# Same order as the ColumnTransformer output: numeric columns, then one-hot columns.
feature_names = numeric_features + list(encoder.get_feature_names_out(categorical_features))
fi = pd.Series(rf.named_steps["model"].feature_importances_, index=feature_names).sort_values(ascending=False)
plt.figure(figsize=(8, 5))
# NOTE(review): newer seaborn releases appear to warn when `palette` is given
# without `hue` — confirm against the installed version before changing.
sns.barplot(x=fi.values, y=fi.index, palette="viridis")
plt.title("Feature importance")
plt.show()

In [None]:
# Temperature vs defect probability
# NOTE: predictions are made for every row of df_clean, which includes the
# rows the forest was trained on, so the predicted curve is partly in-sample.
df_plot = df_clean.copy()
df_plot["pred"] = rf.predict_proba(df_plot.drop(columns=["defect"]))[:, 1]
# Split the observed temperature range into 12 equal-width bins.
df_plot["temp_bin"] = pd.cut(df_plot["temperature"], bins=12)
# `observed` is passed explicitly because its default for categorical group
# keys differs between pandas versions; False keeps every bin in the result.
curve = df_plot.groupby("temp_bin", observed=False).agg(
    pred_prob=("pred", "mean"),
    actual_rate=("defect", "mean"),
)
# Plain float midpoints: mapping over the categorical index would give a
# categorical column, which is not a numeric x-axis.
curve["temp_center"] = [interval.mid for interval in curve.index]
curve.reset_index(inplace=True)

plt.figure(figsize=(8, 5))
sns.lineplot(data=curve, x="temp_center", y="pred_prob", label="Predicted defect prob")
sns.lineplot(data=curve, x="temp_center", y="actual_rate", label="Observed defect rate")
# Degree sign restored; the original label was mis-encoded as "Â°C".
plt.xlabel("Temperature (°C)")
plt.ylabel("Defect probability")
plt.title("Temperature vs defect probability")
plt.show()