In [6]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd  
import seaborn as sns
from pmdarima import auto_arima
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from geopy.extra.rate_limiter import RateLimiter
from geopy.distance import geodesic
from time import sleep
import gdown
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from geopy.exc import GeocoderTimedOut
from tqdm import tqdm
import pandas as pd
import statsmodels.api as sm

%matplotlib inline

## **Emissions equation calculation**

In [2]:
# Read emissions source data (XPO) from the shared Google Drive spreadsheet.
url = "https://docs.google.com/spreadsheets/d/1-bG7NK6B8KqrC3SlxRfmRmZI71SoBOY9/edit?usp=share_link&ouid=101656653267954010446&rtpof=true&sd=true"
# The Drive file id is the path segment that follows "/d/" in the share link;
# it is plugged into the direct-download URL below.
file_id = url.split("/d/")[1].split("/")[0]
download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
# Local file name the workbook is saved under (relative to the working directory).
output = "MIT_Copy of XPO_RXO input CO2_v2_2020_2 - Eco_transit_data (1)_OUT_258.xlsx"
# Download with gdown only when no local copy exists, so re-running the notebook
# does not hit the network every time. Delete the local file to force a refresh.
if not os.path.exists(output):
    gdown.download(download_url, output, quiet=False)
# Read through `output` rather than repeating the file name, so the path is
# defined in exactly one place.
df = pd.read_excel(output)
df.head()

Downloading...
From: https://drive.google.com/uc?export=download&id=1-bG7NK6B8KqrC3SlxRfmRmZI71SoBOY9
To: /Users/tejveero/Library/CloudStorage/GoogleDrive-tejveerobr@gmail.com/My Drive/SCM.800/Cost Inference/MIT_Copy of XPO_RXO input CO2_v2_2020_2 - Eco_transit_data (1)_OUT_258.xlsx
100%|██████████| 3.25M/3.25M [00:00<00:00, 12.3MB/s]


Unnamed: 0,line_number,sid,error_type,error_code,error_message,calculation_module,transport_date,cargo_weight,cargo_unit,cargo_tons_per_teu,...,pre_location_code,post_transport_mode,post_location_type,post_location_code,destination_location_type,destination_location_code,Connection,main_cooled_transport,freight_weight,weight_unit
0,2,102600068675,,,,calculation_mod,2024-07-01,2.285375,ton,10,...,,,,,City,GROVEPORT,Direct,no,2285.374758,kg
1,3,102600068770,,,,calculation_mod,2024-07-01,5.443104,ton,10,...,,,,,City,HILLSBORO,Direct,no,5443.104,kg
2,4,102600068774,,,,calculation_mod,2024-07-01,5.443104,ton,10,...,,,,,City,HILLSBORO,Direct,no,5443.104,kg
3,5,102600068792,,,,calculation_mod,2024-07-01,6.80388,ton,10,...,,,,,City,VACAVILLE,Direct,no,6803.88,kg
4,6,102600068936,,,,calculation_mod,2024-07-01,3.728526,ton,10,...,,,,,City,LOUISVILLE,Direct,no,3728.52624,kg


In [3]:
# Read the emissions workbook into its own frame.
# The file name is spelled out here instead of being taken from `output`:
# a later cell reassigns `output` to the RXO shipment workbook, so re-running
# this cell after that one would otherwise silently load the wrong file.
emissions_xlsx = "MIT_Copy of XPO_RXO input CO2_v2_2020_2 - Eco_transit_data (1)_OUT_258.xlsx"
df_raw = pd.read_excel(emissions_xlsx)

# Define common processing function
def process_emissions_data(df, mode):
    """Fit and print a linear CO2-emissions equation for one transport mode.

    Steps:
      1. Keep the rows of ``df`` whose 'transport_mode' equals ``mode`` and
         drop rows with a missing value in any modelling column.
      2. Remove IQR outliers (1.5 * IQR fences) one column at a time; each
         column's fences are computed on the rows that survived the
         previous columns.
      3. Derive imperial-unit columns (cargo weight and CO2 are multiplied by
         2204.62, i.e. treated as metric tonnes; distance is converted from
         km to miles).
      4. Fit ``co2 [lbs] ~ cargo weight [lbs] + distance [mi]`` without an
         intercept term and print the equation and its R² score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'transport_mode', 'cargo_weight',
        'co2_equivalent_[t]_ttw',
        'co2_p_tkm_[GrammePerTonneKilometers]_ttw' and 'distances_[km]'.
        The frame is not modified.
    mode : str
        Value of 'transport_mode' to select (the notebook uses 'Air' and 'Road').

    Returns
    -------
    None
        Results are printed only.

    Raises
    ------
    ValueError
        If no rows are left for ``mode`` after filtering and outlier removal.
    """
    # Conversion factors, defined once so derived factors stay consistent.
    lbs_per_tonne = 2204.62
    miles_per_km = 0.621371

    # Numeric columns used for modelling; also the columns screened for outliers.
    value_cols = [
        'cargo_weight',
        'co2_equivalent_[t]_ttw',
        'co2_p_tkm_[GrammePerTonneKilometers]_ttw',
        'distances_[km]',
    ]

    # Filter by transport mode, keep only the necessary columns, drop nulls.
    subset = df.loc[df['transport_mode'] == mode, ['transport_mode'] + value_cols].dropna()

    # Remove outliers column by column (inclusive fences, as with >= / <=).
    for col in value_cols:
        q1 = subset[col].quantile(0.25)
        q3 = subset[col].quantile(0.75)
        iqr = q3 - q1
        subset = subset[subset[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]

    # Fail with a clear message instead of a cryptic error from the regressor.
    if subset.empty:
        raise ValueError(
            f"No usable rows for transport mode {mode!r} after filtering and outlier removal"
        )

    # Convert units. `assign` builds a new frame, so no columns are written
    # onto a filtered slice (which triggers pandas' SettingWithCopyWarning).
    subset = subset.assign(**{
        'distances_[mi]': subset['distances_[km]'] * miles_per_km,
        'cargo_weight_[lbs]': subset['cargo_weight'] * lbs_per_tonne,
        'co2_equivalent_[lbs]_ttw': subset['co2_equivalent_[t]_ttw'] * lbs_per_tonne,
        # 1 tonne-km = lbs_per_tonne * miles_per_km lb-miles (~1369.887).
        # The previous hard-coded divisor 1369.55 did not match the two
        # factors above. This column is informational; it is not a regressor.
        'co2_p_lbmi_[gramme_per_lb_mile]_ttw': (
            subset['co2_p_tkm_[GrammePerTonneKilometers]_ttw']
            / (lbs_per_tonne * miles_per_km)
        ),
    })

    # Run linear regression (no intercept term is fitted).
    X = subset[['cargo_weight_[lbs]', 'distances_[mi]']]
    y = subset['co2_equivalent_[lbs]_ttw']
    model = LinearRegression(fit_intercept=False)
    model.fit(X, y)

    # Print results. With fit_intercept=False the reported intercept is 0.
    intercept = model.intercept_
    coefficients = model.coef_
    r2 = r2_score(y, model.predict(X))
    print(f"\n[{mode.upper()} MODE]")
    print(f"Sample size: {len(subset)}")
    print(f"c02 (in lbs) = {intercept:.4f}", end='')
    print(f" + {coefficients[0]:.4f} * cargo weight (in lbs)", end='')
    print(f" + {coefficients[1]:.4f} * distance (in miles)")
    print(f"\nR² Score: {r2:.4f}")

# Fit and report the emissions equation separately for each transport mode
for emissions_mode in ('Air', 'Road'):
    process_emissions_data(df_raw, mode=emissions_mode)


[AIR MODE]
Sample size: 891
c02 (in lbs) = 0.0000 + 4.0776 * cargo weight (in lbs) + 0.8784 * distance (in miles)

R² Score: 0.8840

[ROAD MODE]
Sample size: 3818
c02 (in lbs) = 0.0000 + 0.0195 * cargo weight (in lbs) + 0.1353 * distance (in miles)

R² Score: 0.2905


## **Transportation cost equation calculation**

RXO (TL/LTL/AIR) File

In [3]:
# Read TL/LTL/AIR transportation cost data (RXO) from the shared Google Drive spreadsheet.
url = "https://docs.google.com/spreadsheets/d/1-fBHT-JIuSZ0uWpEOKKzrfg8UEiyhGT-/edit?usp=share_link&ouid=101656653267954010446&rtpof=true&sd=true"
# The Drive file id is the path segment that follows "/d/" in the share link;
# it is plugged into the direct-download URL below.
file_id = url.split("/d/")[1].split("/")[0]
download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
# Local file name the workbook is saved under (relative to the working directory).
output = "ShipmentLevelDetails_RXO2024.xlsx"
# Download with gdown only when no local copy exists, so re-running the notebook
# does not hit the network every time. Delete the local file to force a refresh.
if not os.path.exists(output):
    gdown.download(download_url, output, quiet=False)
# Read through `output` rather than repeating the file name, so the path is
# defined in exactly one place.
df = pd.read_excel(output)
df

Downloading...
From: https://drive.google.com/uc?export=download&id=1-fBHT-JIuSZ0uWpEOKKzrfg8UEiyhGT-
To: /Users/tejveero/Library/CloudStorage/GoogleDrive-tejveerobr@gmail.com/My Drive/SCM.800/Cost Inference/ShipmentLevelDetails_RXO2024.xlsx
100%|██████████| 1.51M/1.51M [00:00<00:00, 16.0MB/s]


Unnamed: 0,StartDateLocal,Date Hierarchy - Year,Date Hierarchy - Month,EndDateLocal,ShipmentNumber,NumberOfOrders,NumberOfStops,BusinessUnit,BusinessSegment,Division,...,LinehaulBlendedCost,FuelBlendedCost,AccessorialBlendedCost,TotalBlendedAPCost,ClientProvidedServiceProviderCode,HaulType,Insert User,IsGlobalYesOrNo,ShipmentType,SecureResources
0,2024-01-03 00:00:00,2024.0,Jan,2024-01-04,1.026001e+11,1.0,2.0,,NORTH AMERICA,,...,2766.0,117.75,,2883.75,,Short Haul,MDUNCAN,No,TRANSPORT,SECURE RESOURCES_ACCEPTED
1,2024-01-05 00:00:00,2024.0,Jan,2024-01-09,1.026001e+11,1.0,2.0,COM,GLOBAL,,...,0.0,,3486.5,3486.50,,Long Haul,JAUYER,Yes,TRANSPORT,SECURE RESOURCES_PICKUP NOTIFICATION
2,2024-01-05 00:00:00,2024.0,Jan,2024-01-09,1.026001e+11,1.0,2.0,COM,GLOBAL,,...,0.0,,2237.0,2237.00,,Long Haul,JAUYER,Yes,TRANSPORT,SECURE RESOURCES_PICKUP NOTIFICATION
3,2024-01-05 00:00:00,2024.0,Jan,2024-01-09,1.026001e+11,1.0,2.0,COM,GLOBAL,,...,0.0,,989.0,989.00,,Long Haul,JAUYER,Yes,TRANSPORT,SECURE RESOURCES_PICKUP NOTIFICATION
4,2024-01-09 00:00:00,2024.0,Jan,2024-01-09,1.026001e+11,2.0,4.0,COM,NORTH AMERICA,,...,1190.0,,,1190.00,,Short Haul,JAUYER,No,TRANSPORT,SECURE RESOURCES_ACCEPTED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4202,2024-12-31 00:00:00,2024.0,Dec,2025-01-03,1.026001e+11,1.0,2.0,COM,GLOBAL,,...,1.0,,,1.00,,Long Haul,JAUYER,Yes,TRANSPORT,SECURE RESOURCES_PICKUP NOTIFICATION
4203,2024-12-31 00:00:00,2024.0,Dec,2025-01-03,1.026001e+11,1.0,2.0,COM,GLOBAL,,...,1.0,,,1.00,,Long Haul,JAUYER,Yes,TRANSPORT,SECURE RESOURCES_PICKUP NOTIFICATION
4204,2024-12-31 00:00:00,2024.0,Dec,2025-01-03,1.026001e+11,1.0,2.0,COM,GLOBAL,,...,1.0,,,1.00,,Long Haul,JAUYER,Yes,TRANSPORT,SECURE RESOURCES_PICKUP NOTIFICATION
4205,,,,NaT,,,,,,,...,,,,,,,,,,


In [4]:
# 1. Restrict the shipment table to the fields used by the cost model
columns_to_keep = [
    'TotalMiles',
    'ModeNumber',
    'TotalWeight_KG',
    'TotalBlendedAPCost',
    'SourcePostalCode',
    'DestinationPostalCode',
    'LinehaulBlendedCost',
]

# 2. Discard every row that has a missing value in any of those fields
df_cleaned = df[columns_to_keep].dropna()

# 3. Keep only rows whose key numeric fields are all strictly positive
numeric_cols = ['TotalMiles', 'TotalWeight_KG', 'TotalBlendedAPCost', 'LinehaulBlendedCost']
df_cleaned = df_cleaned[(df_cleaned[numeric_cols] > 0).all(axis=1)]

# 4. One independent frame per ModeNumber value, split before outlier cleaning
df_air, df_ltl, df_tl = (
    df_cleaned[df_cleaned['ModeNumber'] == mode_label].copy()
    for mode_label in ('AIR', 'LTL', 'TL')
)

# 5. Function to remove outliers and convert weight to lbs
def clean_and_convert(df_mode):
    """Remove IQR outliers and add a weight-in-pounds column.

    Outliers are removed one column at a time with 1.5 * IQR fences
    (inclusive); each column's fences are computed on the rows that survived
    the previous columns. A 'TotalWeight_LBS' column is then derived from
    'TotalWeight_KG'.

    Parameters
    ----------
    df_mode : pandas.DataFrame
        Must contain 'TotalWeight_KG', 'TotalMiles', 'TotalBlendedAPCost'
        and 'LinehaulBlendedCost'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows (original index labels kept)
        plus the 'TotalWeight_LBS' column.
    """
    kg_to_lbs = 2.20462
    outlier_cols = ['TotalWeight_KG', 'TotalMiles', 'TotalBlendedAPCost', 'LinehaulBlendedCost']

    kept = df_mode
    for col in outlier_cols:
        q1 = kept[col].quantile(0.25)
        q3 = kept[col].quantile(0.75)
        iqr = q3 - q1
        # `between` is inclusive on both ends, matching a >= / <= fence check.
        kept = kept[kept[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]

    # `assign` returns a new frame, so the column is never written onto a
    # filtered slice (which triggers pandas' SettingWithCopyWarning).
    return kept.assign(TotalWeight_LBS=kept['TotalWeight_KG'] * kg_to_lbs)

# 6. Apply outlier removal and the kg -> lbs conversion to every mode-specific frame
df_air, df_ltl, df_tl = map(clean_and_convert, (df_air, df_ltl, df_tl))

In [12]:
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import r2_score

# Helper function to run regression and print results
def run_regression(df_mode, mode_name):
    """Fit LinehaulBlendedCost ~ TotalMiles + TotalWeight_LBS and print the result.

    Parameters
    ----------
    df_mode : pandas.DataFrame
        Must contain 'TotalMiles', 'TotalWeight_LBS' and 'LinehaulBlendedCost'.
    mode_name : str
        Label shown in the printed header.

    Prints the sample size, the fitted equation (intercept plus one
    coefficient per regressor) and the in-sample R² score. Returns None.
    """
    features = df_mode[['TotalMiles', 'TotalWeight_LBS']]
    # Target is the linehaul cost; 'TotalBlendedAPCost' was the alternative considered.
    target = df_mode['LinehaulBlendedCost']

    fitted = LinearRegression().fit(features, target)
    miles_coef, weight_coef = fitted.coef_
    score = r2_score(target, fitted.predict(features))

    print(f"\n[{mode_name} MODE]")
    print(f"Sample size: {len(df_mode)}")
    print(
        f"Cost equation = {fitted.intercept_:.4f}"
        f" + {miles_coef:.4f} * TotalMiles"
        f" + {weight_coef:.4f} * TotalWeight_LBS"
    )
    print(f"R² Score: {score:.4f}")

# Fit and report the cost equation for every shipping mode
for regression_label, regression_frame in (("AIR", df_air), ("LTL", df_ltl), ("TL", df_tl)):
    run_regression(regression_frame, regression_label)


[AIR MODE]
Sample size: 268
Cost equation = 755.5454 + 0.2035 * TotalMiles + -0.0024 * TotalWeight_LBS
R² Score: 0.0584

[LTL MODE]
Sample size: 402
Cost equation = 111.0641 + 0.0488 * TotalMiles + 0.1000 * TotalWeight_LBS
R² Score: 0.3998

[TL MODE]
Sample size: 1918
Cost equation = 1073.3559 + 1.7653 * TotalMiles + 0.0028 * TotalWeight_LBS
R² Score: 0.3298


In [None]:
# Helper function to run regression using statsmodels
def run_ols_statsmodel(df_mode, mode_name):
    """Fit the cost regression with statsmodels OLS and print its full summary.

    Parameters
    ----------
    df_mode : pandas.DataFrame
        Must contain 'TotalMiles', 'TotalWeight_LBS' and 'LinehaulBlendedCost'.
    mode_name : str
        Label shown in the printed header.

    Prints the header, the sample size and the statsmodels summary table.
    Returns None.
    """
    target = df_mode['LinehaulBlendedCost']
    # add_constant supplies the constant column that serves as the intercept term
    design = sm.add_constant(df_mode[['TotalMiles', 'TotalWeight_LBS']])
    results = sm.OLS(target, design).fit()

    report = (
        f"\n[{mode_name} MODE]",
        f"Sample size: {len(df_mode)}",
        results.summary(),
    )
    for entry in report:
        print(entry)

In [17]:
# Print the statsmodels OLS summary for the LTL subset.
# NOTE(review): this passes df_ltl but labels the output "ROAD", whereas
# run_regression labels the same frame "LTL" — confirm the intended label.
run_ols_statsmodel(df_ltl, "ROAD")


[ROAD MODE]
Sample size: 402
                             OLS Regression Results                            
Dep. Variable:     LinehaulBlendedCost   R-squared:                       0.400
Model:                             OLS   Adj. R-squared:                  0.397
Method:                  Least Squares   F-statistic:                     132.9
Date:                 Wed, 14 May 2025   Prob (F-statistic):           5.80e-45
Time:                         13:00:01   Log-Likelihood:                -2387.1
No. Observations:                  402   AIC:                             4780.
Df Residuals:                      399   BIC:                             4792.
Df Model:                            2                                         
Covariance Type:             nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
co

In [18]:
# Print the statsmodels OLS summary for the AIR subset.
run_ols_statsmodel(df_air, "AIR")


[AIR MODE]
Sample size: 268
                             OLS Regression Results                            
Dep. Variable:     LinehaulBlendedCost   R-squared:                       0.058
Model:                             OLS   Adj. R-squared:                  0.051
Method:                  Least Squares   F-statistic:                     8.220
Date:                 Wed, 14 May 2025   Prob (F-statistic):           0.000344
Time:                         13:02:25   Log-Likelihood:                -2346.6
No. Observations:                  268   AIC:                             4699.
Df Residuals:                      265   BIC:                             4710.
Df Model:                            2                                         
Covariance Type:             nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
con

In [None]:
# Print the statsmodels OLS summary for the TL subset.
run_ols_statsmodel(df_tl, "TL")