In [19]:
#-----------------------
# Import libraries
#-------------------------
import pandas as pd
import numpy as np

df = pd.read_csv("data.csv") 
df.head()
df.shape
df.info()
df.describe(include="all")
df.isnull().sum()
print(df["pipe_type"].unique())
print(df["waste_reason"].unique())
print(df["operator"].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          300 non-null    object 
 1   pipe_type     300 non-null    object 
 2   weight_kg     285 non-null    object 
 3   length_m      296 non-null    float64
 4   downtime_min  205 non-null    float64
 5   waste_kg      198 non-null    float64
 6   waste_reason  234 non-null    object 
 7   operator      300 non-null    object 
dtypes: float64(3), object(5)
memory usage: 18.9+ KB
['PE100-110mm' 'PE80-50mm' 'PE100-160mm' 'PE80-20mm']
['Mechanical Failure' 'Raw Material' 'Overheating' 'Calibration Error' nan]
['three' 'two' 'one']


In [20]:
#-----------------------
# Data cleaning and analys
#-------------------------
from sklearn.preprocessing import StandardScaler

numeric_cols = ["weight_kg", "length_m", "downtime_min", "waste_kg"]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")
df = df.dropna(subset=["pipe_type", "operator"])


for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())
df = df[(df["weight_kg"] > 0) & (df["weight_kg"] < 2000)]
df = df[(df["length_m"] > 0) & (df["length_m"] < 3000)]
df = df[(df["waste_kg"] >= 0) & (df["waste_kg"] < 100)]
df = df[(df["downtime_min"] >= 0) & (df["downtime_min"] < 500)]
df["waste_reason"] = df["waste_reason"].replace("", "Unknown")

if "date" in df.columns:
    df = df.drop(columns=["date"])
df = pd.get_dummies(df, columns=["pipe_type", "waste_reason", "operator"], drop_first=True)

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
df.head()

Unnamed: 0,weight_kg,length_m,downtime_min,waste_kg,pipe_type_PE100-160mm,pipe_type_PE80-20mm,pipe_type_PE80-50mm,waste_reason_Mechanical Failure,waste_reason_Overheating,waste_reason_Raw Material,operator_three,operator_two
0,-0.427615,1.483363,-0.564792,0.542764,False,False,False,True,False,False,True,False
1,0.48818,-0.529274,-0.564792,-0.579484,False,False,True,False,False,True,True,False
3,0.065554,0.477726,0.381854,-0.579484,True,False,False,True,False,False,False,True
4,0.200644,-1.239135,-0.564792,1.018211,False,False,False,True,False,False,True,False
5,-0.771618,-1.048633,-0.564792,-0.579484,False,False,True,False,False,True,False,False


In [44]:
#-----------------------
#predict
#-------------------------
from sklearn.model_selection import train_test_split

df["waste_flag"] = np.where(df["waste_kg"] > 0, 1, 0)
df["waste_ratio"] = df["waste_kg"] / df["weight_kg"]
df["length_weight_ratio"] = df["length_m"] / df["weight_kg"]

X = df.drop(columns=["waste_kg", "waste_flag"])
y = df["waste_flag"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((227, 13), (57, 13), (227,), (57,))

In [23]:
#-----------------------
# train models
#-------------------------
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42, class_weight="balanced"),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
}

for name, model in models.items():
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    
    print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
    print("Precision:", round(precision_score(y_test, y_pred), 3))
    print("Recall:", round(recall_score(y_test, y_pred), 3))
    print("F1 Score:", round(f1_score(y_test, y_pred), 3))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.596
Precision: 0.333
Recall: 0.438
F1 Score: 0.378

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.66      0.70        41
           1       0.33      0.44      0.38        16

    accuracy                           0.60        57
   macro avg       0.54      0.55      0.54        57
weighted avg       0.63      0.60      0.61        57

Accuracy: 0.842
Precision: 0.889
Recall: 0.5
F1 Score: 0.64

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.98      0.90        41
           1       0.89      0.50      0.64        16

    accuracy                           0.84        57
   macro avg       0.86      0.74      0.77        57
weighted avg       0.85      0.84      0.83        57

Accuracy: 0.965
Precision: 1.0
Recall: 0.875
F1 Score: 0.933

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.0

In [24]:
#-----------------------
# GridSearchCV 
# My favorite part
#-------------------------
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from imblearn.ensemble import BalancedRandomForestClassifier

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}
xgb_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0]
}
rf_grid = GridSearchCV(
    BalancedRandomForestClassifier(random_state=42),
    rf_params,
    cv=cv,
    scoring="f1",
    n_jobs=-1
)

rf_grid.fit(X_train_res, y_train_res)
scale_pos = (y_train_res == 0).sum() / (y_train_res == 1).sum()
xgb_grid = GridSearchCV(
    XGBClassifier(random_state=42, eval_metric="logloss", scale_pos_weight=scale_pos),
    xgb_params,
    cv=cv,
    scoring="f1",
    n_jobs=-1
)
xgb_grid.fit(X_train_res, y_train_res)
best_models = {
    "Balanced Random Forest (Tuned)": rf_grid.best_estimator_,
    "XGBoost (Tuned)": xgb_grid.best_estimator_
}

for name, model in best_models.items():
    y_pred = model.predict(X_test)
    print("Best Params:", model.get_params())
    print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
    print("Precision:", round(precision_score(y_test, y_pred), 3))
    print("Recall:", round(recall_score(y_test, y_pred), 3))
    print("F1 Score:", round(f1_score(y_test, y_pred), 3))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

Best Params: {'bootstrap': False, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 300, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'replacement': True, 'sampling_strategy': 'all', 'verbose': 0, 'warm_start': False}
Accuracy: 0.86
Precision: 0.9
Recall: 0.562
F1 Score: 0.692

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.98      0.91        41
           1       0.90      0.56      0.69        16

    accuracy                           0.86        57
   macro avg       0.88      0.77      0.80        57
weighted avg       0.86      0.86      0.85        57

Best Params: {'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsampl

In [28]:
#-----------------------
# Prepare data with Standardize and predict model
# this part was complix
#-------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

numeric_cols = ["weight_kg", "length_m", "downtime_min", "waste_kg"]
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")
df = df.dropna(subset=["downtime_min"])

if "waste_ratio" not in df.columns:
    df["waste_ratio"] = df["waste_kg"] / df["weight_kg"]

if "length_weight_ratio" not in df.columns:
    df["length_weight_ratio"] = df["length_m"] / df["weight_kg"]

if "date" in df.columns:
    df = df.drop(columns=["date"])

categorical_cols = [c for c in ["pipe_type", "waste_reason", "operator"] if c in df.columns]
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

features = df.drop(columns=["downtime_min"])
target = df["downtime_min"]
numeric_cols_reg = [c for c in ["weight_kg", "length_m", "waste_ratio", "length_weight_ratio"] if c in features.columns]
scaler = StandardScaler()
features[numeric_cols_reg] = scaler.fit_transform(features[numeric_cols_reg])

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((227, 14), (57, 14), (227,), (57,))

In [43]:
#-----------------------
# Check and fix data
# one week to fix data
# این بخش کد خودم ننوشتم از سایت استک اورفلو کمکم کردن
#-------------------------
Q1 = df["downtime_min"].quantile(0.25)
Q3 = df["downtime_min"].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df_clean = df[(df["downtime_min"] >= lower_bound) & (df["downtime_min"] <= upper_bound)]

skewness = df_clean["downtime_min"].skew()
print("Skewness of downtime_min:", skewness)

if abs(skewness) > 1:
    import numpy as np
    df_clean["downtime_min_log"] = np.log1p(df_clean["downtime_min"])
    target_col = "downtime_min_log"
else:
    target_col = "downtime_min"

features = df_clean.drop(columns=["downtime_min","waste_kg","waste_flag"], errors="ignore")
target = df_clean[target_col]
numeric_cols_reg = [c for c in ["weight_kg", "length_m", "waste_ratio", "length_weight_ratio"] if c in features.columns]
scaler = StandardScaler()
features[numeric_cols_reg] = scaler.fit_transform(features[numeric_cols_reg])

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
print("Data cleaned:", X_train.shape, X_test.shape)

Skewness of downtime_min: 2.4772415072561125
Data cleaned: (192, 13) (48, 13)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["downtime_min_log"] = np.log1p(df_clean["downtime_min"])


In [38]:
#-----------------------
# Train model
#-------------------------
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

rf = RandomForestRegressor(n_estimators=300, max_depth=10, random_state=42)
xgb = XGBRegressor(n_estimators=300, max_depth=5, learning_rate=0.1, random_state=42, eval_metric="rmse")

models = {"Random Forest Regressor": rf, "XGBoost Regressor": xgb}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_orig = np.expm1(y_pred)
    y_test_orig = np.expm1(y_test)    
    print("R2 Score:", round(r2_score(y_test_orig, y_pred_orig),3))
    print("Mean Absolute Error:", round(mean_absolute_error(y_test_orig, y_pred_orig),3))
    print("Mean Squared Error:", round(mean_squared_error(y_test_orig, y_pred_orig),3))


R2 Score: 0.997
Mean Absolute Error: 0.008
Mean Squared Error: 0.001
R2 Score: 0.995
Mean Absolute Error: 0.009
Mean Squared Error: 0.001
