In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from scipy import stats


In [16]:
df=pd.read_csv('C:\\Users\\ommah\\Downloads\\detailed_dummy_lca_dataset_with_patterns.csv')
print(df.head())
print(df.shape)

             Process Stage    Technology Time Period       Location  \
0  Raw Material Extraction  Conventional   2010-2014  North America   
1              End-of-Life      Advanced   2020-2025         Europe   
2  Raw Material Extraction      Emerging   2020-2025  South America   
3            Manufacturing      Emerging   2015-2019  North America   
4                      Use  Conventional   2015-2019  North America   

        Functional Unit Raw Material Type  Raw Material Quantity (kg or unit)  \
0  1 kg Aluminium Sheet        Copper Ore                               2.185   
1  1 kg Aluminium Sheet        Copper Ore                               3.686   
2  1 kg Aluminium Sheet      Copper Scrap                               1.869   
3      1 kg Copper Wire        Copper Ore                               2.149   
4  1 m2 Aluminium Panel   Aluminium Scrap                               3.234   

  Energy Input Type  Energy Input Quantity (MJ) Transport Mode  ...  \
0       Electri

In [17]:
df.columns

Index(['Process Stage', 'Technology', 'Time Period', 'Location',
       'Functional Unit', 'Raw Material Type',
       'Raw Material Quantity (kg or unit)', 'Energy Input Type',
       'Energy Input Quantity (MJ)', 'Transport Mode',
       'Transport Distance (km)', 'Fuel Type', 'Emissions to Air CO2 (kg)',
       'Emissions to Air SOx (kg)', 'Emissions to Air NOx (kg)',
       'Emissions to Air Particulate Matter (kg)',
       'Emissions to Water BOD (kg)', 'Emissions to Water Heavy Metals (kg)',
       'Greenhouse Gas Emissions (kg CO2-eq)', 'Recycled Content (%)',
       'Reuse Potential (%)', 'End-of-Life Treatment', 'Recovery Rate (%)'],
      dtype='object')

In [18]:
print(df.shape)
print(df.isnull().sum())

(10000, 23)
Process Stage                               0
Technology                                  0
Time Period                                 0
Location                                    0
Functional Unit                             0
Raw Material Type                           0
Raw Material Quantity (kg or unit)          0
Energy Input Type                           0
Energy Input Quantity (MJ)                  0
Transport Mode                              0
Transport Distance (km)                     0
Fuel Type                                   0
Emissions to Air CO2 (kg)                   0
Emissions to Air SOx (kg)                   0
Emissions to Air NOx (kg)                   0
Emissions to Air Particulate Matter (kg)    0
Emissions to Water BOD (kg)                 0
Emissions to Water Heavy Metals (kg)        0
Greenhouse Gas Emissions (kg CO2-eq)        0
Recycled Content (%)                        0
Reuse Potential (%)                         0
End-of-Life Treatment 

In [19]:
print(df.shape)
df = df.drop_duplicates()
print(df.shape)

(10000, 23)
(10000, 23)


In [20]:
print(df.dtypes)
print(df.describe())

Process Stage                                object
Technology                                   object
Time Period                                  object
Location                                     object
Functional Unit                              object
Raw Material Type                            object
Raw Material Quantity (kg or unit)          float64
Energy Input Type                            object
Energy Input Quantity (MJ)                  float64
Transport Mode                               object
Transport Distance (km)                     float64
Fuel Type                                    object
Emissions to Air CO2 (kg)                   float64
Emissions to Air SOx (kg)                   float64
Emissions to Air NOx (kg)                   float64
Emissions to Air Particulate Matter (kg)    float64
Emissions to Water BOD (kg)                 float64
Emissions to Water Heavy Metals (kg)        float64
Greenhouse Gas Emissions (kg CO2-eq)        float64
Recycled Con

In [21]:
df_enhanced = df.copy()
df_enhanced['Energy_per_Material'] = df_enhanced['Energy Input Quantity (MJ)'] / (df_enhanced['Raw Material Quantity (kg or unit)'] + 1)
emission_cols = [col for col in df_enhanced.columns if 'Emissions to Air' in col]
df_enhanced['Total_Air_Emissions'] = df_enhanced[emission_cols].sum(axis=1)
water_emission_cols = [col for col in df_enhanced.columns if 'Emissions to Water' in col]
df_enhanced['Total_Water_Emissions'] = df_enhanced[water_emission_cols].sum(axis=1)
circularity_cols = ['Recycled Content (%)', 'Reuse Potential (%)', 'Recovery Rate (%)']
available_circularity_cols = [col for col in circularity_cols if col in df_enhanced.columns]
if len(available_circularity_cols) >= 2:
    df_enhanced['Circularity_Score'] = df_enhanced[available_circularity_cols].mean(axis=1)
df_enhanced['Transport_Intensity'] = df_enhanced['Transport Distance (km)'] / (df_enhanced['Raw Material Quantity (kg or unit)'] + 1)
df_enhanced['GHG_per_Material'] = df_enhanced['Greenhouse Gas Emissions (kg CO2-eq)'] / (df_enhanced['Raw Material Quantity (kg or unit)'] + 1)
df_enhanced['Time_Period_Numeric'] = pd.to_numeric(df_enhanced['Time Period'].str.extract('(\d{4})')[0], errors='coerce')
print(df.shape)
print(df_enhanced.shape)
df=df_enhanced

(10000, 23)
(10000, 30)


In [22]:
df_encoded = df.copy()
label_encoders = {}
categorical_cols = ['Process Stage', 'Technology', 'Location', 'Raw Material Type', 
                          'Energy Input Type', 'Transport Mode', 'Fuel Type','Time Period' ,'Functional Unit' ,'End-of-Life Treatment']
for col in categorical_cols:
    # Optionally label encode if needed elsewhere (not required strictly for XGBoost native categorical support)
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le
    # Convert the column to pandas categorical dtype for XGBoost
    df_encoded[col] = df_encoded[col].astype('category')
print(df_encoded.dtypes)  # To verify categories
df=df_encoded


Process Stage                               category
Technology                                  category
Time Period                                 category
Location                                    category
Functional Unit                             category
Raw Material Type                           category
Raw Material Quantity (kg or unit)           float64
Energy Input Type                           category
Energy Input Quantity (MJ)                   float64
Transport Mode                              category
Transport Distance (km)                      float64
Fuel Type                                   category
Emissions to Air CO2 (kg)                    float64
Emissions to Air SOx (kg)                    float64
Emissions to Air NOx (kg)                    float64
Emissions to Air Particulate Matter (kg)     float64
Emissions to Water BOD (kg)                  float64
Emissions to Water Heavy Metals (kg)         float64
Greenhouse Gas Emissions (kg CO2-eq)         f

In [23]:
# For 'outlier' will implement ICR if performance is not up to the mark 
print(df.columns)
print(df.dtypes)

Index(['Process Stage', 'Technology', 'Time Period', 'Location',
       'Functional Unit', 'Raw Material Type',
       'Raw Material Quantity (kg or unit)', 'Energy Input Type',
       'Energy Input Quantity (MJ)', 'Transport Mode',
       'Transport Distance (km)', 'Fuel Type', 'Emissions to Air CO2 (kg)',
       'Emissions to Air SOx (kg)', 'Emissions to Air NOx (kg)',
       'Emissions to Air Particulate Matter (kg)',
       'Emissions to Water BOD (kg)', 'Emissions to Water Heavy Metals (kg)',
       'Greenhouse Gas Emissions (kg CO2-eq)', 'Recycled Content (%)',
       'Reuse Potential (%)', 'End-of-Life Treatment', 'Recovery Rate (%)',
       'Energy_per_Material', 'Total_Air_Emissions', 'Total_Water_Emissions',
       'Circularity_Score', 'Transport_Intensity', 'GHG_per_Material',
       'Time_Period_Numeric'],
      dtype='object')
Process Stage                               category
Technology                                  category
Time Period                               

In [24]:

# target_cols = ['Recycled Content (%)', 'Reuse Potential (%)', 'Recovery Rate (%)']

# X = df.drop(columns=target_cols)
# y = df[target_cols]

# X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.25, random_state=42, shuffle=True
# )
# for target in target_cols:
#     print(f"\nTraining model for target: {target}")
    
#     xgb_model = XGBRegressor(
#         n_estimators=300,  
#         max_depth=7,
#         learning_rate=0.03,  
#         subsample=0.7,
#         colsample_bytree=0.7,
#         reg_alpha=0.1,
#         reg_lambda=1,
#         random_state=42,
#         min_child_weight=50,
#         enable_categorical=True,
#         objective='reg:squarederror',
#         n_jobs=-1
#     )
    
#     # Fit model on categorical feature aware DataFrame
#     xgb_model.fit(X_train, y_train[target])
    
#     # Predictions
#     y_pred_train = xgb_model.predict(X_train)
#     y_pred_test = xgb_model.predict(X_test)
    
#     # Evaluation function
#     def evaluate_model(y_true, y_pred, dataset_name):
#         mse = mean_squared_error(y_true, y_pred)
#         rmse = np.sqrt(mse)
#         mae = mean_absolute_error(y_true, y_pred)
#         r2 = r2_score(y_true, y_pred)
#         mbe = np.mean(y_pred - y_true)
        
#         print(f"\n{dataset_name} Performance for {target}:")
#         print(f"  RMSE: {rmse}")
#         print(f"  MAE:  {mae}")
#         print(f"  R²:   {r2}")
#         print(f"Mean Bias Error: {mbe}")
    
#     # Evaluate
#     evaluate_model(y_train[target], y_pred_train, "Training")
#     evaluate_model(y_test[target], y_pred_test, "Test")


In [88]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Define target columns
target_cols = ['Recycled Content (%)', 'Reuse Potential (%)', 'Recovery Rate (%)']

# Prepare features and targets
X = df.drop(columns=target_cols)
y = df[target_cols]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, shuffle=True
)

# Define individual XGBRegressor instances for each target
model_recycled_content = XGBRegressor(
    n_estimators=200, 
    max_depth=4, 
    learning_rate=0.04, 
    reg_alpha=0.1,
    reg_lambda=1, 
    random_state=42,
    enable_categorical=True, 
    objective='reg:squarederror', 
    n_jobs=-1
)

model_reuse_potential = XGBRegressor(
    n_estimators=400, 
    max_depth=5, 
    learning_rate=0.01,
    reg_alpha=0.1,
    reg_lambda=1, 
    random_state=42,
    enable_categorical=True, 
    objective='reg:squarederror', 
    n_jobs=-1
)

model_recovery_rate = XGBRegressor(
    n_estimators=200, 
    max_depth=4, 
    learning_rate=0.03,
    reg_alpha=0.1,
    reg_lambda=1, 
    random_state=42,
    enable_categorical=True, 
    objective='reg:squarederror', 
    n_jobs=-1
)


def evaluate_model(y_true, y_pred, dataset_name, target):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mbe = np.mean(y_pred - y_true)
    
    print(f"\n{dataset_name} Performance for {target}:")
    print(f"  RMSE: {rmse}")
    print(f"  MAE:  {mae}")
    print(f"  R²:   {r2}")
    print(f"Mean Bias Error: {mbe}")

# Train, predict and evaluate each model separately
model_recycled_content.fit(X_train, y_train['Recycled Content (%)'])
y_train_pred = model_recycled_content.predict(X_train)
y_test_pred = model_recycled_content.predict(X_test)
# evaluate_model(y_train['Recycled Content (%)'], y_train_pred, "Training", 'Recycled Content (%)')
evaluate_model(y_test['Recycled Content (%)'], y_test_pred, "Test", 'Recycled Content (%)')

model_reuse_potential.fit(X_train, y_train['Reuse Potential (%)'])
y_train_pred = model_reuse_potential.predict(X_train)
y_test_pred = model_reuse_potential.predict(X_test)
# evaluate_model(y_train['Reuse Potential (%)'], y_train_pred, "Training", 'Reuse Potential (%)')
evaluate_model(y_test['Reuse Potential (%)'], y_test_pred, "Test", 'Reuse Potential (%)')

model_recovery_rate.fit(X_train, y_train['Recovery Rate (%)'])
y_train_pred = model_recovery_rate.predict(X_train)
y_test_pred = model_recovery_rate.predict(X_test)
# evaluate_model(y_train['Recovery Rate (%)'], y_train_pred, "Training", 'Recovery Rate (%)')
evaluate_model(y_test['Recovery Rate (%)'], y_test_pred, "Test", 'Recovery Rate (%)')



Test Performance for Recycled Content (%):
  RMSE: 7.10883364925621
  MAE:  5.528830919296265
  R²:   0.9451283466856302
Mean Bias Error: 0.2668598503570558

Test Performance for Reuse Potential (%):
  RMSE: 7.997447371504131
  MAE:  6.302456839664459
  R²:   0.8730064194518077
Mean Bias Error: -0.275353311870575

Test Performance for Recovery Rate (%):
  RMSE: 3.355649229048651
  MAE:  2.64380835647583
  R²:   0.9648631652658597
Mean Bias Error: 0.055326072296142594


In [89]:
# After training each model, save it as a JSON file
model_recycled_content.save_model('model_recycled_content.json')
model_reuse_potential.save_model('model_reuse_potential.json')
model_recovery_rate.save_model('model_recovery_rate.json')

print("Models saved successfully.")

# To load the models later:
# from xgboost import XGBRegressor

# loaded_model_recycled_content = XGBRegressor()
# loaded_model_recycled_content.load_model('model_recycled_content.json')

# loaded_model_reuse_potential = XGBRegressor()
# loaded_model_reuse_potential.load_model('model_reuse_potential.json')

# loaded_model_recovery_rate = XGBRegressor()
# loaded_model_recovery_rate.load_model('model_recovery_rate.json')

# print("Models loaded successfully.")


Models saved successfully.
