In [None]:
# CNC Cycle Time Estimation

In [None]:
## 1. Load Libraries and Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math

In [None]:
# Read 'cleaned_cnc_data.csv' into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='cleaned_cnc_data.csv')
df.head(n=5)

In [None]:
## 2. Basic EDA
# Print the column names, dtypes and non-null counts of the loaded frame
df.info()
# Summary statistics (numeric columns by default); as the last expression
# in the cell, this is what the notebook renders as the cell output
df.describe()

In [None]:
# Visualizations
# Histogram of the target with a KDE overlay; the title is set on the Axes that histplot returns
sns.histplot(df['Cycle_Time_min'], kde=True).set_title('Distribution of Cycle Time')
plt.show()

In [None]:
# Cycle time against feature count, coloured by the Material_Steel column
sns.scatterplot(
    data=df,
    x='Feature_Count',
    y='Cycle_Time_min',
    hue='Material_Steel',
).set_title('Cycle Time vs Feature Count (highlighting Steel)')
plt.show()

In [None]:
# Pairwise relationships between the target and two of the raw inputs
sns.pairplot(
    data=df[[
        'Cycle_Time_min',
        'Volume_mm3',
        'Feature_Count',
    ]]
)
plt.show()

In [None]:
## 3. Feature Engineering
# Both ratios below divide by Volume_mm3. A zero volume would silently produce
# inf/NaN here and only surface later as an opaque error when the models are
# fitted, so fail fast with a message that points at the actual cause.
if (df['Volume_mm3'] == 0).any():
    raise ValueError(
        "Volume_mm3 contains zero values; cannot compute Cost_per_mm3 and "
        "Complexity_Index without dividing by zero."
    )

# Quoted cost per unit of volume
df['Cost_per_mm3'] = df['Quoted_Cost'] / df['Volume_mm3']
# Feature count per unit of volume
df['Complexity_Index'] = df['Feature_Count'] / df['Volume_mm3']

In [None]:
## 4. Model Training
# Name of the column the models are trained to predict
target = 'Cycle_Time_min'

# Model input columns; the order here is the column order of the design matrix
features = [
    # raw columns
    'Volume_mm3', 'Feature_Count', 'Quoted_Cost',
    # material indicator columns
    'Material_Brass', 'Material_Copper', 'Material_Plastic', 'Material_Steel',
    # engineered ratios
    'Cost_per_mm3', 'Complexity_Index',
]

In [None]:
# Design matrix (selected feature columns) and target vector
X, y = df[features], df[target]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # hold out 20% of the rows for evaluation
    random_state=42,  # fixed seed so the split is reproducible
)

In [None]:
# Linear Regression
# fit() returns the fitted estimator, so construction and training are chained
lr = LinearRegression().fit(X_train, y_train)
# Predictions on the held-out rows, evaluated later
y_pred_lr = lr.predict(X_test)

In [None]:
# Random Forest
# 100 trees with a fixed seed for reproducible results; fit() returns the fitted estimator
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
# Predictions on the held-out rows, evaluated later
y_pred_rf = rf.predict(X_test)

In [None]:
## 5. Evaluation
def evaluate_model(y_true, y_pred, model_name):
    """Print MAE and RMSE for a model and plot predicted against actual values.

    Parameters
    ----------
    y_true : array-like
        Observed target values (pandas Series, NumPy array or plain list).
    y_pred : array-like
        Predicted target values, in the same row order as ``y_true``.
    model_name : str
        Label used as the prefix of the printed metrics and in the plot title.

    Returns
    -------
    None
        The metrics are printed and the figure is shown as side effects.
    """
    # Work on plain arrays: this lets lists be passed in (the diagonal below
    # needs .min()/.max()), and ensures the scatter plot pairs values by
    # position exactly as the metrics do, rather than possibly by index label
    # when pandas objects with differing indexes are supplied.
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    mae = mean_absolute_error(actual, predicted)
    rmse = math.sqrt(mean_squared_error(actual, predicted))
    print(f"{model_name} MAE: {mae:.2f}")
    print(f"{model_name} RMSE: {rmse:.2f}")

    # Draw on an explicit, fresh Axes so repeated calls never overlay each other
    fig, ax = plt.subplots()
    sns.scatterplot(x=actual, y=predicted, ax=ax)
    ax.set_xlabel("Actual Cycle Time")
    ax.set_ylabel("Predicted Cycle Time")
    ax.set_title(f"{model_name} - Actual vs Predicted")

    # Dashed red y = x reference line spanning the range of the actual values;
    # points on this line are perfect predictions
    lowest, highest = actual.min(), actual.max()
    ax.plot([lowest, highest], [lowest, highest], 'r--')
    plt.show()

In [None]:
# Score both fitted models against the same held-out targets
evaluate_model(y_true=y_test, y_pred=y_pred_lr, model_name="Linear Regression")
evaluate_model(y_true=y_test, y_pred=y_pred_rf, model_name="Random Forest")

In [None]:
## 6. Feature Importance (Random Forest)
feature_names = X.columns
importances = rf.feature_importances_
# Label each importance with its column name and rank from most to least important
feat_imp = pd.Series(data=importances, index=feature_names).sort_values(ascending=False)

In [None]:
# Horizontal bars, one per feature, in the ranked order of feat_imp;
# title and x-label are set on the Axes that barplot returns
sns.barplot(x=feat_imp, y=feat_imp.index).set(
    title="Feature Importance (Random Forest)",
    xlabel="Importance Score",
)
plt.show()

In [None]:
## 7. Conclusion
# Derive the summary from the computed results rather than hard-coding it, so the
# printed conclusion cannot contradict the evaluation if the data or models change.
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

# The model with the lower test MAE is reported as the better one
if mae_rf < mae_lr:
    print("Random Forest produced better predictions for Cycle Time compared to Linear Regression.")
elif mae_lr < mae_rf:
    print("Linear Regression produced better predictions for Cycle Time compared to Random Forest.")
else:
    print("Random Forest and Linear Regression produced equally accurate predictions for Cycle Time.")
print(f"Test MAE - Linear Regression: {mae_lr:.2f}, Random Forest: {mae_rf:.2f}")

# feat_imp is sorted in descending order, so its first entries are the top-ranked features
top_features = " and ".join(feat_imp.index[:2])
print(f"{top_features} were the most important Random Forest features for machining cycle time.")