In [5]:
import pandas as pd
import numpy as np

# Load the dataset
file_path = "../../data/dynamic_supply_chain_logistics_dataset.csv"
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,timestamp,vehicle_gps_latitude,vehicle_gps_longitude,fuel_consumption_rate,eta_variation_hours,traffic_congestion_level,warehouse_inventory_level,loading_unloading_time,handling_equipment_availability,order_fulfillment_status,...,iot_temperature,cargo_condition_status,route_risk_level,customs_clearance_time,driver_behavior_score,fatigue_monitoring_score,disruption_likelihood_score,delay_probability,risk_classification,delivery_time_deviation
0,2021-01-01 00:00:00,40.375568,-77.014318,5.136512,4.998009,5.927586,985.716862,4.951392,0.481294,0.761166,...,0.5744,0.777263,1.182116,0.502006,0.033843,0.978599,0.506152,0.885291,Moderate Risk,9.110682
1,2021-01-01 01:00:00,33.507818,-117.036902,5.101512,0.984929,1.591992,396.700206,1.030379,0.62078,0.196594,...,-9.753493,0.091839,9.611988,0.966774,0.201725,0.918586,0.980784,0.544178,High Risk,8.175281
2,2021-01-01 02:00:00,30.02064,-75.269224,5.090803,4.972665,8.787765,832.408935,4.220229,0.810933,0.152742,...,-6.491034,0.253529,6.570431,0.945627,0.264045,0.394215,0.998633,0.803322,High Risk,1.283594
3,2021-01-01 03:00:00,36.649223,-70.190529,8.219558,3.095064,0.045257,0.573283,0.530186,0.008525,0.811885,...,-0.151276,0.877576,0.548952,4.674035,0.362885,0.905444,0.99332,0.025977,High Risk,9.304897
4,2021-01-01 04:00:00,30.001279,-70.012195,5.000075,3.216077,8.004851,914.925067,3.62089,0.020083,0.053659,...,2.429448,0.262081,8.861443,3.445429,0.016957,0.258702,0.912433,0.991122,High Risk,7.752484


In [6]:
# @title 1st option - build from scratch
class LinearRegressionScratch:
    """
    Custom implementation of linear regression using gradient descent.
    """
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.coef_ = None
        self.intercept_ = None
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations

    def fit(self, X, y):
        """
        Fits the linear regression model to the given data using gradient descent.
        Args:
            X: A numpy array of shape (n_samples, n_features) representing the input data.
            y: A numpy array of shape (n_samples,) representing the target values.
        """
        # Initialize coefficients (slope) and intercept to zero or random small values
        self.coef_ = np.dot(np.linalg.inv(np.dot(X.T, X)), np.dot(X.T, y))
        self.intercept_ = np.mean(y - np.dot(X, self.coef_))

        m = len(y)  # number of training examples

        for _ in range(self.n_iterations):
            # Calculate the predictions
            y_pred = self.predict(X)

            # Compute the residuals (errors)
            error = (y_pred - y)

            # Calculate the gradient for intercept (slope) and coefficients (slope)
            intercept_gradient = (1/m) * np.sum(error)
            coef_gradient = (1/m) * np.dot(X.T, error)

            # Update the parameters using the gradients
            self.intercept_ -= self.learning_rate * intercept_gradient
            self.coef_ -= self.learning_rate * coef_gradient

    def predict(self, X):
        """
        Predicts the target values for new data.
        Args:
            X: A numpy array of shape (n_samples, n_features) representing the new input data.
        Returns:
            A numpy array of shape (n_samples,) representing the predicted target values.
        """
        return np.dot(X, self.coef_) + self.intercept_

In [7]:
# Cleaning Data
numerical_list = [x for x in df.columns if df[x].dtype in ('int64','float64')]
print(numerical_list)

['vehicle_gps_latitude', 'vehicle_gps_longitude', 'fuel_consumption_rate', 'eta_variation_hours', 'traffic_congestion_level', 'warehouse_inventory_level', 'loading_unloading_time', 'handling_equipment_availability', 'order_fulfillment_status', 'weather_condition_severity', 'port_congestion_level', 'shipping_costs', 'supplier_reliability_score', 'lead_time_days', 'historical_demand', 'iot_temperature', 'cargo_condition_status', 'route_risk_level', 'customs_clearance_time', 'driver_behavior_score', 'fatigue_monitoring_score', 'disruption_likelihood_score', 'delay_probability', 'delivery_time_deviation']


In [8]:
# Cleaning Data
from sklearn.preprocessing import MinMaxScaler

df_cleaned = df.drop(columns=['risk_classification'])
df_cleaned = df_cleaned.drop(columns=['timestamp'])

# Handle missing values (remove rows with missing data)
df_cleaned = df_cleaned.dropna()

# Detect and remove outliers using the IQR method
for i in numerical_list:
    Q1 = df_cleaned[i].quantile(0.25)
    Q3 = df_cleaned[i].quantile(0.75)
    IQR = Q3 - Q1
    df_cleaned = df_cleaned[df_cleaned[i] <= (Q3+(1.5*IQR))]
    df_cleaned = df_cleaned[df_cleaned[i] >= (Q1-(1.5*IQR))]
    df_cleaned = df_cleaned.reset_index(drop=True)
# Define acceptable range
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Normalize numerical features using Min-Max Scaling
scaler = MinMaxScaler()
df_cleaned[df_cleaned.columns] = scaler.fit_transform(df_cleaned[df_cleaned.columns])

df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25245 entries, 0 to 25244
Data columns (total 24 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   vehicle_gps_latitude             25245 non-null  float64
 1   vehicle_gps_longitude            25245 non-null  float64
 2   fuel_consumption_rate            25245 non-null  float64
 3   eta_variation_hours              25245 non-null  float64
 4   traffic_congestion_level         25245 non-null  float64
 5   warehouse_inventory_level        25245 non-null  float64
 6   loading_unloading_time           25245 non-null  float64
 7   handling_equipment_availability  25245 non-null  float64
 8   order_fulfillment_status         25245 non-null  float64
 9   weather_condition_severity       25245 non-null  float64
 10  port_congestion_level            25245 non-null  float64
 11  shipping_costs                   25245 non-null  float64
 12  supplier_reliabili

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
#Prepare Data
features = ['fuel_consumption_rate', 'delay_probability', 'traffic_congestion_level', 'route_risk_level', 'loading_unloading_time']
X = df_cleaned[features].to_numpy()
y = df_cleaned['delivery_time_deviation'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

lin_reg_scratch = LinearRegressionScratch() # Import and initialize scikit-learn class for linear regression models
lin_reg_scratch.fit(X_train, y_train) # Train regression model

y_pred = lin_reg_scratch.predict(X_test)

# Evaluation Metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print evaluation results
print("\nModel Evaluation:")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R²): {r2:.4f}")


Model Evaluation:
Mean Absolute Error (MAE): 0.3177
Mean Squared Error (MSE): 0.1305
Root Mean Squared Error (RMSE): 0.3612
R-squared (R²): -0.0792
