In [1]:
import warnings
from pathlib import Path

import pandas as pd
# Install a blanket "ignore" filter so no warning is shown for the rest of the session.
# NOTE(review): this hides every warning category, including any the modelling
# libraries might raise while fitting — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Load dataset

In [2]:
# Read the feature table from the parquet file in the working directory, then
# leave the frame as the cell's last expression so Jupyter renders it.
df = pd.read_parquet(path="df_features.parquet")
df

Unnamed: 0,shipment_id,total_detections,avg_confidence,total_damage_area,dent_count,defect_rate,is_high_risk,risk_score,reliability_flag
0,005fee03-03d7-4aa4-8f84-8b13018b2877,2,0.89400,0.007224,1,0.003612,0,0.006458,1
1,0079a599-b9f6-4fff-a74c-2cad09b1142e,2,0.83550,0.009506,1,0.004753,0,0.007942,1
2,0085237f-e8dc-48bf-984f-c1bc28e5bcd5,2,0.83750,0.006832,1,0.003416,0,0.005721,1
3,008a8e02-1738-4a2a-9029-6e22aae02c98,3,0.84600,0.016585,2,0.005528,1,0.014031,0
4,00a3d0d4-a430-49ae-8cd8-d838ea5a0059,2,0.85400,0.010993,1,0.005497,0,0.009388,1
...,...,...,...,...,...,...,...,...,...
921,ffa123eb-e566-4344-ae73-d2c4ff23b326,4,0.87775,0.016754,2,0.004189,1,0.014706,0
922,ffd542e9-aebb-4593-bdc7-9efdaf91a30f,1,0.85100,0.012563,1,0.012563,1,0.010691,0
923,ffd64a02-744f-4020-9972-42d3284d1b85,2,0.87900,0.013600,2,0.006800,1,0.011954,0
924,fff52921-6d3d-4e84-9058-98be3c755254,2,0.91150,0.011580,1,0.005790,1,0.010555,0


## Define features and target

In [3]:
# Label column the classifiers are trained to predict.
TARGET = "is_high_risk"

# Model input columns; every other column of df is left out of the feature matrix.
FEATURES = [
    "total_detections", "avg_confidence", "total_damage_area",
    "dent_count", "defect_rate",
]

# Feature matrix (DataFrame) and label vector (Series), selected by column name.
X = df.loc[:, FEATURES]
y = df.loc[:, TARGET]

# Machine learning models

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
import joblib


# Hold out 20% of the rows for evaluation. The split is stratified on y so both
# partitions keep the same class balance, and seeded so it is repeatable.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Candidate classifiers. The estimators with a stochastic component share
# RANDOM_STATE so a Restart & Run All reproduces the same fitted models.
models = {
    "LogisticRegression": LogisticRegression(solver='lbfgs', max_iter=300, n_jobs=-1),
    "XGBoost": XGBClassifier(
        n_estimators=200, n_jobs=-1, learning_rate=0.01, random_state=RANDOM_STATE
    ),
    "DecisionTree": DecisionTreeClassifier(
        min_samples_leaf=5, max_depth=5, min_samples_split=4, random_state=RANDOM_STATE
    ),
}

# Directory the model-loading cell reads "<name>.pkl" files from. Created here so
# the notebook also runs from a clean checkout; existing files are overwritten.
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Fit, evaluate and persist each model. `results` maps the model name to its
# classification report as a nested dict, so metrics can be compared in code.
# NOTE(review): the recorded reports all show ~0.99 accuracy — confirm the target
# is not derived directly from these features (possible leakage).
results = {}
for name, model in models.items():
    print("="*60)
    print(f"{name} Classification report:")

    # Scaling and the classifier are bundled so that the scaler is fitted on the
    # training split only and travels with the model when it is saved.
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", model)
    ])

    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = pipeline.predict(X_test)
    print(classification_report(y_test, y_pred))
    results[name] = classification_report(y_test, y_pred, output_dict=True)

    # Persist the whole fitted pipeline under the file name the loading cell expects.
    joblib.dump(pipeline, MODEL_DIR / f"{name}.pkl")

LogisticRegression Classification report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        68
           1       1.00      0.99      1.00       118

    accuracy                           0.99       186
   macro avg       0.99      1.00      0.99       186
weighted avg       0.99      0.99      0.99       186

XGBoost Classification report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        68
           1       0.98      1.00      0.99       118

    accuracy                           0.99       186
   macro avg       0.99      0.99      0.99       186
weighted avg       0.99      0.99      0.99       186

DecisionTree Classification report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99        68
           1       0.99      1.00      1.00       118

    accuracy                           0.99       186
   macro avg       

# Loaded models

In [5]:
# Names of the persisted models; each is expected at models/<name>.pkl.
model_names = ["LogisticRegression", "XGBoost", "DecisionTree"]

# Deserialize every saved model into a name -> object mapping, confirming each
# load as it completes.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a
# trusted source.
loaded_models = {}
for name in model_names:
    model_path = f"models/{name}.pkl"
    loaded_models[name] = joblib.load(model_path)
    print(f"✓ Loaded {name}")

✓ Loaded LogisticRegression
✓ Loaded XGBoost
✓ Loaded DecisionTree
