# AI-Powered Life Cycle Assessment (LCA) Tool for Metals

This notebook implements a comprehensive life cycle assessment (LCA) tool for aluminium and copper production scenarios.

## Table of Contents
1. Generate Synthetic Dataset
2. Data Preprocessing
3. Machine Learning Models
4. Explainability Analysis
5. Results & Reporting

## Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Explainability
import shap

# Model persistence
import pickle

# Seed NumPy's global random state so repeated runs draw the same numbers
np.random.seed(42)

# Plot defaults. The figure size is applied after the style is selected so
# that it is not lost when the style settings are loaded.
plt.style.use('default')
plt.rcParams.update({'figure.figsize': (10, 6)})
sns.set_palette("husl")

print("All libraries imported successfully!")

## 1. Generate Synthetic Dataset

In [None]:
def generate_synthetic_lca_data(n_samples=2000):
    """
    Generate synthetic LCA dataset for aluminium and copper production scenarios.

    All random draws come from NumPy's global random state (``np.random``),
    so seed it with ``np.random.seed`` before calling for reproducible output.
    The draws are made in a fixed order (categoricals, alloy grades, mass,
    electricity, remaining numeric features, then target noise).

    Feature columns:
        metal, route, transport_mode, alloy_grade (categorical);
        mass_kg, electricity_kWh, grid_co2_g_per_kWh, transport_km,
        yield_frac, recycled_input_frac, end_of_life_recovery_frac (numeric).

    Target columns:
        GWP_kgCO2e: electricity_kWh * grid_co2_g_per_kWh / 1000
            + 0.1 * transport_km - 100 * recycled_input_frac, plus Gaussian
            noise with a standard deviation of 10% of the magnitude of that
            base value; floored at 0.
        energy_MJ: 3.6 * electricity_kWh + 0.5 * transport_km, plus Gaussian
            noise with a standard deviation of 5% of the base value;
            floored at 0.
        circularity_index: 100 * (0.6 * recycled_input_frac
            + 0.4 * end_of_life_recovery_frac), plus Gaussian noise with a
            standard deviation of 5; clipped to [0, 100].

    Note that mass_kg, yield_frac, transport_mode and alloy_grade do not enter
    any target formula, and metal/route affect the targets only through
    electricity_kWh.

    Args:
        n_samples (int): Number of samples to generate

    Returns:
        pd.DataFrame: Synthetic dataset with features and targets
            (one row per sample, 14 columns)
    """
    print(f"Generating {n_samples} synthetic LCA samples...")

    # Alloy grades available for each metal
    alloys_by_metal = {
        'aluminium': ['6061', '1100', '2024', '5052'],
        'copper': ['Cu-ETP', 'Cu-DHP', 'Cu-OF', 'Cu-PHC'],
    }

    # Centre of the electricity range (kWh) for each (metal, route) pair;
    # the actual value is drawn uniformly within +/- 300 of it
    base_electricity = {
        ('aluminium', 'raw'): 1500,
        ('aluminium', 'recycled'): 800,
        ('copper', 'raw'): 1200,
        ('copper', 'recycled'): 600,
    }

    # Initialize data dictionary
    data = {}

    # Categorical features
    data['metal'] = np.random.choice(['aluminium', 'copper'], n_samples)
    data['route'] = np.random.choice(['raw', 'recycled'], n_samples)
    data['transport_mode'] = np.random.choice(['truck', 'rail', 'ship'], n_samples)

    # Alloy grades based on metal type (one draw per sample, in sample order)
    data['alloy_grade'] = [
        np.random.choice(alloys_by_metal[metal]) for metal in data['metal']
    ]

    # Numeric features
    data['mass_kg'] = np.random.uniform(500, 5000, n_samples)

    # Electricity consumption varies by metal and route
    electricity_values = []
    for metal, route in zip(data['metal'], data['route']):
        base = base_electricity[(metal, route)]
        electricity_values.append(np.random.uniform(base - 300, base + 300))
    data['electricity_kWh'] = np.array(electricity_values)

    data['grid_co2_g_per_kWh'] = np.random.uniform(100, 1000, n_samples)
    data['transport_km'] = np.random.uniform(10, 2000, n_samples)
    data['yield_frac'] = np.random.uniform(0.5, 1.0, n_samples)
    data['recycled_input_frac'] = np.random.uniform(0, 1, n_samples)
    data['end_of_life_recovery_frac'] = np.random.uniform(0, 1, n_samples)

    # Generate target variables with realistic formulas and noise
    print("Calculating target variables...")

    # GWP (Global Warming Potential) in kg CO2 equivalent
    gwp_base = (data['electricity_kWh'] * data['grid_co2_g_per_kWh'] / 1000 +
                data['transport_km'] * 0.1 -
                data['recycled_input_frac'] * 100)
    # The recycling credit can push gwp_base below zero, and np.random.normal
    # rejects a negative scale with ValueError, so the noise scale is taken
    # from the magnitude of the base value.
    gwp_noise_scale = np.abs(gwp_base) * 0.1
    data['GWP_kgCO2e'] = gwp_base + np.random.normal(0, gwp_noise_scale, n_samples)

    # Energy consumption in MJ (energy_base is always positive here, since
    # both terms are built from strictly positive draws)
    energy_base = (data['electricity_kWh'] * 3.6 + data['transport_km'] * 0.5)
    data['energy_MJ'] = energy_base + np.random.normal(0, energy_base * 0.05, n_samples)

    # Circularity index (0-100 scale)
    circularity_base = (data['recycled_input_frac'] * 0.6 +
                        data['end_of_life_recovery_frac'] * 0.4) * 100
    data['circularity_index'] = circularity_base + np.random.normal(0, 5, n_samples)

    # Ensure non-negative values for targets and keep the index on its scale
    data['GWP_kgCO2e'] = np.maximum(data['GWP_kgCO2e'], 0)
    data['energy_MJ'] = np.maximum(data['energy_MJ'], 0)
    data['circularity_index'] = np.clip(data['circularity_index'], 0, 100)

    # Create DataFrame
    df = pd.DataFrame(data)

    print(f"Dataset generated with shape: {df.shape}")
    return df

# Generate the dataset
df_synthetic = generate_synthetic_lca_data(2000)

# Display basic information
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df_synthetic.info()
print("\nFirst 5 rows:")
print(df_synthetic.head())

# Save to CSV (index=False keeps the row index out of the file)
df_synthetic.to_csv('synthetic_LCA.csv', index=False)
print("\nDataset saved as 'synthetic_LCA.csv'")

In [None]:
# Descriptive statistics for the numeric columns
print("Summary Statistics:", df_synthetic.describe(), sep="\n")

# Value counts for each categorical feature
print("\nCategorical Feature Distributions:")
categorical_cols = ['metal', 'route', 'transport_mode', 'alloy_grade']
for col in categorical_cols:
    print(f"\n{col}:", df_synthetic[col].value_counts(), sep="\n")