In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report


In [2]:
# 1. Load & Initial Cleaning
data_dir = Path(r"E:\decision intelligent\task\3\Predicting On-Time Delivery (Intermediate)\dataset")
raw_file = data_dir / "DataCoSupplyChainDataset.csv"

df = pd.read_csv(raw_file, encoding='latin1', low_memory=False)

df = df.rename(columns={
    'order date (DateOrders)':       'order_date',
    'shipping date (DateOrders)':    'ship_date',
    'Days for shipping (real)':      'shipping_real_days',
    'Days for shipment (scheduled)': 'shipping_sched_days'
})
df['order_date'] = pd.to_datetime(df['order_date'], errors='coerce')
df['ship_date']  = pd.to_datetime(df['ship_date'],  errors='coerce')
df = df.dropna(subset=['order_date','ship_date','shipping_real_days','shipping_sched_days'])

In [3]:
# 2. Feature Engineering
df['lead_time']  = df['shipping_real_days']
df['order_wday'] = df['order_date'].dt.weekday
df['order_hour'] = df['order_date'].dt.hour

obj_cols = df.select_dtypes(include=['object']).columns.tolist()
high_card = [c for c in obj_cols if df[c].nunique() > 50]
df = df.drop(columns=high_card)

low_card = [c for c in obj_cols if c not in high_card]
df = pd.get_dummies(df, columns=low_card, drop_first=True)


In [4]:
# 3. Label Definition
df['on_time'] = (df['shipping_real_days'] <= df['shipping_sched_days']).astype(int)

In [5]:
# 4. Chronological Train/Test Split
df = df.sort_values('order_date')
cutoff = df['order_date'].quantile(0.80)

train = df[df['order_date'] <= cutoff].reset_index(drop=True)
test  = df[df['order_date']  > cutoff].reset_index(drop=True)

drop_cols = ['order_date','ship_date','lead_time','on_time']
X_train_df, y_train = train.drop(columns=drop_cols), train['on_time']
X_test_df,  y_test  = test .drop(columns=drop_cols),  test ['on_time']


In [6]:
# 5. Build Sparse Feature Matrices (CAST TO FLOAT TO AVOID OBJECT DTYPES)
X_train_arr = X_train_df.astype(np.float64).fillna(0).values
X_test_arr  = X_test_df.astype(np.float64).fillna(0).values

X_train_sp = csr_matrix(X_train_arr)
X_test_sp  = csr_matrix(X_test_arr)

In [7]:
# 6. Model Training
# 6.1 Logistic Regression on full sparse
lr = LogisticRegression(solver='saga', max_iter=1000)
lr.fit(X_train_sp, y_train)

# 6.2 Random Forest on 50% subsample
small = train.sample(frac=0.5, random_state=42)
X_small = small.drop(columns=drop_cols)
y_small = small['on_time']

rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_small, y_small)



In [8]:
# 7. Evaluation of LR & RF
print("=== Logistic Regression ===")
print(classification_report(y_test, lr.predict(X_test_sp)))

print("=== Random Forest (50% subsample) ===")
print(classification_report(y_test, rf.predict(X_test_df)))

feat_imp = pd.Series(rf.feature_importances_, index=X_small.columns)
print("\nTop 10 RF features:\n", feat_imp.sort_values(ascending=False).head(10))

=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.57      0.85      0.69     20751
           1       0.42      0.15      0.22     15352

    accuracy                           0.55     36103
   macro avg       0.50      0.50      0.45     36103
weighted avg       0.51      0.55      0.49     36103

=== Random Forest (50% subsample) ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20751
           1       1.00      1.00      1.00     15352

    accuracy                           1.00     36103
   macro avg       1.00      1.00      1.00     36103
weighted avg       1.00      1.00      1.00     36103


Top 10 RF features:
 Delivery Status_Late delivery        0.305475
Late_delivery_risk                   0.263465
shipping_real_days                   0.177951
Delivery Status_Shipping on time     0.068246
shipping_sched_days                  0.064293
Shipping Mode_Standard Class

In [9]:
# 8. Inspect Sparse Matrix Size
print(f"\nSparse train matrix shape: {X_train_sp.shape}")


Sparse train matrix shape: (144416, 181)


In [10]:
# 9. Dimensionality Reduction with TruncatedSVD
svd = TruncatedSVD(n_components=50, random_state=42)
X_train_red = svd.fit_transform(X_train_sp)
X_test_red  = svd.transform(X_test_sp)
print(f"Reduced train matrix shape: {X_train_red.shape}")

Reduced train matrix shape: (144416, 50)


In [11]:
# 10. Fast Classifiers on Reduced Data
sgd = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
sgd.fit(X_train_sp, y_train)
print("\n=== SGD on full sparse ===")
print(classification_report(y_test, sgd.predict(X_test_sp)))

lr_red = LogisticRegression(max_iter=500, solver='lbfgs')
lr_red.fit(X_train_red, y_train)
print("\n=== Logistic Regression on 50-dim SVD features ===")
print(classification_report(y_test, lr_red.predict(X_test_red)))


=== SGD on full sparse ===
              precision    recall  f1-score   support

           0       0.57      1.00      0.73     20751
           1       0.00      0.00      0.00     15352

    accuracy                           0.57     36103
   macro avg       0.29      0.50      0.36     36103
weighted avg       0.33      0.57      0.42     36103



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== Logistic Regression on 50-dim SVD features ===
              precision    recall  f1-score   support

           0       0.68      0.68      0.68     20751
           1       0.57      0.57      0.57     15352

    accuracy                           0.63     36103
   macro avg       0.62      0.62      0.62     36103
weighted avg       0.63      0.63      0.63     36103



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
