In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from scipy import stats


In [2]:

# Go one level up, then into data folder
file_path = os.path.join("..", "data", "lca_dataset.csv")

if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
    display(df.head())
else:
    print("File not found at:", file_path)


File loaded successfully!


Unnamed: 0,Process Stage,Technology,Time Period,Location,Functional Unit,Raw Material Type,Raw Material Quantity (kg or unit),Energy Input Type,Energy Input Quantity (MJ),Processing Method,...,GHG_per_Material,Time_Period_Numeric,Total_Cost,Circularity_Score,Circular_Economy_Index,Recycled Content (%),Resource Efficiency (%),Extended Product Life (years),Recovery Rate (%),Reuse Potential (%)
0,Transport,Conventional,2020-2025,South America,1 kg Copper Wire,Aluminium Scrap,1000,Electricity,2289.61,Conventional,...,1.08,2023,3763.25,50.66,0.51,74.52,74.77,26.8,9.0,0.09
1,Use,Emerging,2015-2019,Asia,1 m2 Aluminium Panel,Aluminium Ore,500,Electricity,7368.72,Emerging,...,1.05,2017,2063.83,26.93,0.27,10.0,11.83,46.4,93.64,25.18
2,Manufacturing,Advanced,2020-2025,North America,1 kg Copper Wire,Aluminium Scrap,1000,Coal,1586.35,Advanced,...,1.65,2023,2613.4,99.3,0.99,81.46,82.33,101.6,66.04,73.52
3,Use,Emerging,2010-2014,North America,1 m2 Aluminium Panel,Aluminium Ore,500,Natural Gas,7448.21,Emerging,...,6.46,2023,1995.41,23.49,0.23,10.0,12.3,69.3,85.74,39.43
4,Use,Conventional,2015-2019,South America,1 kg Aluminium Sheet,Aluminium Scrap,1000,Coal,1470.09,Conventional,...,2.13,2017,2565.81,100.0,1.0,75.85,76.84,23.1,62.67,90.2


In [3]:
df.shape

(25000, 45)

In [4]:
df.describe

<bound method NDFrame.describe of        Process Stage    Technology Time Period       Location  \
0          Transport  Conventional   2020-2025  South America   
1                Use      Emerging   2015-2019           Asia   
2      Manufacturing      Advanced   2020-2025  North America   
3                Use      Emerging   2010-2014  North America   
4                Use  Conventional   2015-2019  South America   
...              ...           ...         ...            ...   
24995  Manufacturing  Conventional   2010-2014  South America   
24996    End-of-Life      Emerging   2015-2019  South America   
24997            Use  Conventional   2020-2025  South America   
24998            Use  Conventional   2020-2025           Asia   
24999      Transport  Conventional   2010-2014           Asia   

            Functional Unit Raw Material Type  \
0          1 kg Copper Wire   Aluminium Scrap   
1      1 m2 Aluminium Panel     Aluminium Ore   
2          1 kg Copper Wire   Aluminiu

In [5]:
df.columns

Index(['Process Stage', 'Technology', 'Time Period', 'Location',
       'Functional Unit', 'Raw Material Type',
       'Raw Material Quantity (kg or unit)', 'Energy Input Type',
       'Energy Input Quantity (MJ)', 'Processing Method', 'Transport Mode',
       'Transport Distance (km)', 'Fuel Type', 'Metal Quality Grade',
       'Material Scarcity Level', 'Material Cost (USD)',
       'Processing Cost (USD)', 'Emissions to Air CO2 (kg)',
       'Emissions to Air SOx (kg)', 'Emissions to Air NOx (kg)',
       'Emissions to Air Particulate Matter (kg)',
       'Emissions to Water Acid Mine Drainage (kg)',
       'Emissions to Water Heavy Metals (kg)', 'Emissions to Water BOD (kg)',
       'Greenhouse Gas Emissions (kg CO2-eq)', 'Scope 1 Emissions (kg CO2-eq)',
       'Scope 2 Emissions (kg CO2-eq)', 'Scope 3 Emissions (kg CO2-eq)',
       'End-of-Life Treatment', 'Environmental Impact Score',
       'Metal Recyclability Factor', 'Energy_per_Material',
       'Total_Air_Emissions', 'Total

In [6]:
df.isna().sum()

Process Stage                                 0
Technology                                    0
Time Period                                   0
Location                                      0
Functional Unit                               0
Raw Material Type                             0
Raw Material Quantity (kg or unit)            0
Energy Input Type                             0
Energy Input Quantity (MJ)                    0
Processing Method                             0
Transport Mode                                0
Transport Distance (km)                       0
Fuel Type                                     0
Metal Quality Grade                           0
Material Scarcity Level                       0
Material Cost (USD)                           0
Processing Cost (USD)                         0
Emissions to Air CO2 (kg)                     0
Emissions to Air SOx (kg)                     0
Emissions to Air NOx (kg)                     0
Emissions to Air Particulate Matter (kg)

In [7]:
df.dtypes

Process Stage                                  object
Technology                                     object
Time Period                                    object
Location                                       object
Functional Unit                                object
Raw Material Type                              object
Raw Material Quantity (kg or unit)              int64
Energy Input Type                              object
Energy Input Quantity (MJ)                    float64
Processing Method                              object
Transport Mode                                 object
Transport Distance (km)                       float64
Fuel Type                                      object
Metal Quality Grade                            object
Material Scarcity Level                        object
Material Cost (USD)                           float64
Processing Cost (USD)                         float64
Emissions to Air CO2 (kg)                     float64
Emissions to Air SOx (kg)   

In [8]:
print(df.shape)
df = df.drop_duplicates()
print(df.shape)

(25000, 45)
(25000, 45)


In [9]:
df['Metal Quality Grade']

0           Low
1           Low
2          High
3          High
4        Medium
          ...  
24995      High
24996      High
24997       Low
24998       Low
24999      High
Name: Metal Quality Grade, Length: 25000, dtype: object

In [10]:
#Calculating median values for reference 
medians = df.select_dtypes(include=['int64', 'float64']).median()

print("Median values of numeric columns:")
print(medians)

Median values of numeric columns:
Raw Material Quantity (kg or unit)            1000.0000
Energy Input Quantity (MJ)                    6898.9150
Transport Distance (km)                       1020.2400
Material Cost (USD)                           1203.9000
Processing Cost (USD)                          899.0850
Emissions to Air CO2 (kg)                     2401.4550
Emissions to Air SOx (kg)                       17.6565
Emissions to Air NOx (kg)                       14.1255
Emissions to Air Particulate Matter (kg)         8.8280
Emissions to Water Acid Mine Drainage (kg)       3.8306
Emissions to Water Heavy Metals (kg)             2.2983
Emissions to Water BOD (kg)                      1.5322
Greenhouse Gas Emissions (kg CO2-eq)          4002.4250
Scope 1 Emissions (kg CO2-eq)                 2001.2150
Scope 2 Emissions (kg CO2-eq)                 1200.7250
Scope 3 Emissions (kg CO2-eq)                  836.6300
Environmental Impact Score                      66.6500
Metal Recyclab

In [11]:
df_encoded = df.copy()
label_encoders = {}
categorical_cols = ['Process Stage', 'Technology', 'Location', 'Raw Material Type', 
                          'Energy Input Type', 'Transport Mode', 'Fuel Type','Time Period' ,'Functional Unit' ,'End-of-Life Treatment','Processing Method','Metal Quality Grade','Material Scarcity Level']
for col in categorical_cols:
    # Optionally label encode if needed elsewhere (not required strictly for XGBoost native categorical support)
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le
    # Convert the column to pandas categorical dtype for XGBoost
    df_encoded[col] = df_encoded[col].astype('category')
print(df_encoded.dtypes)  # To verify categories
df=df_encoded


Process Stage                                 category
Technology                                    category
Time Period                                   category
Location                                      category
Functional Unit                               category
Raw Material Type                             category
Raw Material Quantity (kg or unit)               int64
Energy Input Type                             category
Energy Input Quantity (MJ)                     float64
Processing Method                             category
Transport Mode                                category
Transport Distance (km)                        float64
Fuel Type                                     category
Metal Quality Grade                           category
Material Scarcity Level                       category
Material Cost (USD)                            float64
Processing Cost (USD)                          float64
Emissions to Air CO2 (kg)                      float64
Emissions 

In [12]:
target_cols = ['Recycled Content (%)', 'Resource Efficiency (%)', 'Extended Product Life (years)', 'Recovery Rate (%)', 'Reuse Potential (%)']

# Prepare features and targets
X = df.drop(columns=target_cols)
y = df[target_cols]

print("Input Variables: ",X.columns,"\n Target variables: ",y.columns)

Input Variables:  Index(['Process Stage', 'Technology', 'Time Period', 'Location',
       'Functional Unit', 'Raw Material Type',
       'Raw Material Quantity (kg or unit)', 'Energy Input Type',
       'Energy Input Quantity (MJ)', 'Processing Method', 'Transport Mode',
       'Transport Distance (km)', 'Fuel Type', 'Metal Quality Grade',
       'Material Scarcity Level', 'Material Cost (USD)',
       'Processing Cost (USD)', 'Emissions to Air CO2 (kg)',
       'Emissions to Air SOx (kg)', 'Emissions to Air NOx (kg)',
       'Emissions to Air Particulate Matter (kg)',
       'Emissions to Water Acid Mine Drainage (kg)',
       'Emissions to Water Heavy Metals (kg)', 'Emissions to Water BOD (kg)',
       'Greenhouse Gas Emissions (kg CO2-eq)', 'Scope 1 Emissions (kg CO2-eq)',
       'Scope 2 Emissions (kg CO2-eq)', 'Scope 3 Emissions (kg CO2-eq)',
       'End-of-Life Treatment', 'Environmental Impact Score',
       'Metal Recyclability Factor', 'Energy_per_Material',
       'Total_Air_

In [15]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Define target columns
target_cols = ['Recycled Content (%)', 'Resource Efficiency (%)', 'Extended Product Life (years)', 'Recovery Rate (%)', 'Reuse Potential (%)']

# Prepare features and targets
X = df.drop(columns=target_cols)
y = df[target_cols]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, shuffle=True
)

# Define individual XGBRegressor instances for each target
model_recycled_content = XGBRegressor(
    n_estimators=200, 
    max_depth=4, 
    learning_rate=0.04, 
    reg_alpha=0.1,
    reg_lambda=1, 
    random_state=42,
    enable_categorical=True, 
    objective='reg:squarederror', 
    n_jobs=-1
)

model_resource_efficiency = XGBRegressor(
    n_estimators=200, 
    max_depth=4, 
    learning_rate=0.04, 
    reg_alpha=0.1,
    reg_lambda=1, 
    random_state=42,
    enable_categorical=True, 
    objective='reg:squarederror', 
    n_jobs=-1
)

model_extended_product_life = XGBRegressor(
    n_estimators=200, 
    max_depth=4, 
    learning_rate=0.04, 
    reg_alpha=0.1,
    reg_lambda=1, 
    random_state=42,
    enable_categorical=True, 
    objective='reg:squarederror', 
    n_jobs=-1
)

model_recovery_rate = XGBRegressor(
    n_estimators=200, 
    max_depth=4, 
    learning_rate=0.03,
    reg_alpha=0.1,
    reg_lambda=1, 
    random_state=42,
    enable_categorical=True, 
    objective='reg:squarederror', 
    n_jobs=-1
)

model_reuse_potential = XGBRegressor(
    n_estimators=300, 
    max_depth=6, 
    learning_rate=0.1,
    reg_alpha=0.1,
    reg_lambda=1, 
    random_state=42,
    enable_categorical=True, 
    objective='reg:squarederror', 
    n_jobs=-1
)

def evaluate_model(y_true, y_pred, dataset_name, target):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mbe = np.mean(y_pred - y_true)
    
    print(f"\n{dataset_name} Performance for {target}:")
    print(f"  RMSE: {rmse}")
    print(f"  MAE:  {mae}")
    print(f"  R²:   {r2}")
    print(f"Mean Bias Error: {mbe}")

# Train, predict and evaluate each model separately
model_recycled_content.fit(X_train, y_train['Recycled Content (%)'])
y_train_pred = model_recycled_content.predict(X_train)
y_test_pred = model_recycled_content.predict(X_test)
# evaluate_model(y_train['Recycled Content (%)'], y_train_pred, "Training", 'Recycled Content (%)')
evaluate_model(y_test['Recycled Content (%)'], y_test_pred, "Test", 'Recycled Content (%)')


model_resource_efficiency.fit(X_train, y_train['Resource Efficiency (%)'])
y_train_pred = model_resource_efficiency.predict(X_train)
y_test_pred = model_resource_efficiency.predict(X_test)
# evaluate_model(y_train['Resource Efficiency (%)'], y_train_pred, "Training", 'Resource Efficiency (%)')
evaluate_model(y_test['Resource Efficiency (%)'], y_test_pred, "Test", 'Resource Efficiency (%)')

model_extended_product_life.fit(X_train, y_train['Extended Product Life (years)'])
y_train_pred = model_extended_product_life.predict(X_train)
y_test_pred = model_extended_product_life.predict(X_test)
# evaluate_model(y_train['Extended Product Life (years)'], y_train_pred, "Training", 'Extended Product Life (years)')
evaluate_model(y_test['Extended Product Life (years)'], y_test_pred, "Test", 'Extended Product Life (years)')

model_recovery_rate.fit(X_train, y_train['Recovery Rate (%)'])
y_train_pred = model_recovery_rate.predict(X_train)
y_test_pred = model_recovery_rate.predict(X_test)
# evaluate_model(y_train['Recovery Rate (%)'], y_train_pred, "Training", 'Recovery Rate (%)')
evaluate_model(y_test['Recovery Rate (%)'], y_test_pred, "Test", 'Recovery Rate (%)')

model_reuse_potential.fit(X_train, y_train['Reuse Potential (%)'])
y_train_pred = model_reuse_potential.predict(X_train)
y_test_pred = model_reuse_potential.predict(X_test)
# evaluate_model(y_train['Reuse Potential (%)'], y_train_pred, "Training", 'Reuse Potential (%)')
evaluate_model(y_test['Reuse Potential (%)'], y_test_pred, "Test", 'Reuse Potential (%)')


Test Performance for Recycled Content (%):
  RMSE: 0.20083584451086972
  MAE:  0.1265355859985352
  R²:   0.9999717242038311
Mean Bias Error: 0.004999655334472716

Test Performance for Resource Efficiency (%):
  RMSE: 2.892891976865099
  MAE:  2.5004336715087887
  R²:   0.9941637223078603
Mean Bias Error: -0.1012041108520508

Test Performance for Extended Product Life (years):
  RMSE: 2.5848666167822105
  MAE:  1.7336566956787112
  R²:   0.9894191250813024
Mean Bias Error: -0.009630124389648508

Test Performance for Recovery Rate (%):
  RMSE: 4.196637725905383
  MAE:  3.4697622606872556
  R²:   0.9854020646555292
Mean Bias Error: -0.036908019122314464

Test Performance for Reuse Potential (%):
  RMSE: 4.8865585169669945
  MAE:  3.374364822450147
  R²:   0.9786077338005973
Mean Bias Error: 0.08242620783702359


In [16]:
model_dir = os.path.join("..", "model")

model_recycled_content.save_model(os.path.join(model_dir, "model_recycled_content.json"))
model_resource_efficiency.save_model(os.path.join(model_dir, "model_resource_efficiency.json"))
model_extended_product_life.save_model(os.path.join(model_dir, "model_extended_product_life.json"))
model_recovery_rate.save_model(os.path.join(model_dir, "model_recovery_rate.json"))
model_reuse_potential.save_model(os.path.join(model_dir, "model_reuse_potential.json"))

print(f"✅ All models saved in: {os.path.abspath(model_dir)}")


✅ All models saved in: C:\Users\ommah\Python_om_eng\Git\ml_Alloyance-2\model
