# 🏆 AI-Driven LCA Tool - Hackathon Project
**Goal**: Predict the sustainability score (`circularity_index`) of metallurgical processes using machine learning

In [None]:
# Install and import libraries
!pip install xgboost shap -q

import io
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb
from google.colab import files
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

# Seed NumPy's global RNG and set the default figure size used by the plots
# in this notebook, then confirm the setup cell finished.
np.random.seed(42)
plt.rcParams.update({'figure.figsize': (12, 6)})
print("✅ Setup complete!")

## 📂 Dataset Upload

In [None]:
# Upload and load dataset.
# files.upload() opens Colab's file picker; per the Colab docs it returns a
# dict mapping each uploaded filename to that file's raw bytes.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file uploaded - please choose the LCA dataset CSV.")

# Prefer the expected filename, but fall back to whatever was uploaded so a
# differently named file does not break the load. Parsing the in-memory bytes
# avoids depending on the name the file was saved under on disk.
dataset_name = 'synthetic_LCA.csv' if 'synthetic_LCA.csv' in uploaded else next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[dataset_name]))

print(f"Loaded file: {dataset_name}")
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
display(df.head())
display(df.describe())

# Check for missing values (total NaN count across all columns) and list the
# object-dtype columns, which are the candidates for categorical encoding.
print(f"\nMissing values: {df.isnull().sum().sum()}")
print(f"Categorical features: {df.select_dtypes(include='object').columns.tolist()}")

## 📊 EDA and Visualization

In [None]:
# Target variable analysis
target = 'circularity_index'  # column treated as the sustainability score
target_low, target_high = df[target].min(), df[target].max()
print(f"Target: {target} (Range: {target_low:.1f}-{target_high:.1f})")

# 2x2 overview grid: target histogram, two scatter plots against the target,
# and a per-metal boxplot.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_hist, ax_recycled), (ax_electricity, ax_metal) = axes

# Top-left: distribution of the target
ax_hist.hist(df[target], bins=30, alpha=0.7, color='green')
ax_hist.set_title('Sustainability Score Distribution')

# Top-right: recycled input fraction vs target
ax_recycled.scatter(df['recycled_input_frac'], df[target], alpha=0.6)
ax_recycled.set_title('Recycled Input vs Sustainability')

# Bottom-left: electricity use vs target
ax_electricity.scatter(df['electricity_kWh'], df[target], alpha=0.6, color='red')
ax_electricity.set_title('Electricity vs Sustainability')

# Bottom-right: target grouped by metal
sns.boxplot(data=df, x='metal', y=target, ax=ax_metal)
ax_metal.set_title('Sustainability by Metal Type')

plt.tight_layout()
plt.show()

# Correlation analysis over the numeric columns only
numeric_columns = df.select_dtypes(include=[np.number])
corr_matrix = numeric_columns.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r', center=0, fmt='.2f')
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Rank columns by absolute correlation with the target. The target itself
# can appear among the first six entries, so it is skipped when printing.
target_corr = corr_matrix[target].abs().sort_values(ascending=False)
print("\nTop features correlated with sustainability:")
for feature, corr in target_corr.head(6).items():
    if feature == target:
        continue
    print(f"• {feature}: {corr:.3f}")

## ⚙️ Feature Engineering

In [None]:
# Create meaningful derived features on a copy so the raw frame `df` is left untouched
df_features = df.copy()

# Efficiency ratios
df_features['energy_intensity'] = df['electricity_kWh'] / df['mass_kg']
df_features['circular_potential'] = (df['recycled_input_frac'] + df['end_of_life_recovery_frac']) / 2
# NOTE(review): gwp_per_kg is computed from GWP_kgCO2e, which is dropped from X
# below while mass_kg stays in; if GWP_kgCO2e is excluded to avoid leakage,
# this ratio may reintroduce it - confirm the intent.
df_features['gwp_per_kg'] = df['GWP_kgCO2e'] / df['mass_kg']
df_features['is_recycled'] = (df['route'] == 'recycled').astype(int)

# Preprocessing
# A zero mass_kg turns the ratios above into +/-inf, which fillna() does not
# touch and StandardScaler refuses; convert them to NaN so they get imputed.
df_features = df_features.replace([np.inf, -np.inf], np.nan)
# Impute numeric gaps with each column's median
df_features = df_features.fillna(df_features.median(numeric_only=True))
# One-hot encode the categorical columns
df_encoded = pd.get_dummies(df_features, columns=['metal', 'route', 'transport_mode', 'alloy_grade'])

# Prepare data: remove the target and the two excluded impact columns from the features
X = df_encoded.drop(columns=[target, 'GWP_kgCO2e', 'energy_MJ'])
y = df_encoded[target]

# Scale features; keep X's index so X_scaled stays label-aligned with y
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

print(f"Final dataset: {X_scaled.shape[1]} features, {len(y)} samples")

## 🤖 Model Training

In [None]:
# Split data before scaling: hold out 20% of the rows for testing.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, so test-set statistics never
# influence training. Rebinding `scaler` also means the scaler saved in the
# export step is the one these models were actually trained with.
scaler = StandardScaler().fit(X_train_unscaled)
X_train = pd.DataFrame(
    scaler.transform(X_train_unscaled), columns=X.columns, index=X_train_unscaled.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_unscaled), columns=X.columns, index=X_test_unscaled.index
)

# Train multiple models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42)
}

results = {}          # model name -> dict of test-set metrics
trained_models = {}   # model name -> fitted estimator

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results[name] = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R²': r2_score(y_test, y_pred)
    }
    trained_models[name] = model

# Results comparison: one row per model, metrics rounded to 4 decimals
results_df = pd.DataFrame(results).T.round(4)
print("🏆 Model Performance:")
display(results_df.sort_values('R²', ascending=False))

# The model with the highest test R² is carried forward as the best model
best_model_name = results_df['R²'].idxmax()
best_model = trained_models[best_model_name]
print(f"\n🥇 Best Model: {best_model_name} (R² = {results_df.loc[best_model_name, 'R²']:.4f})")

## 📊 Model Evaluation

In [None]:
# Performance visualization: R² ranking of all models (left panel) and
# predicted-vs-actual scatter for the best model (right panel).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
ax_ranking, ax_fit = axes

# Model comparison, sorted ascending so the highest R² ends up as the top bar
r2_scores = results_df['R²'].sort_values()
bar_positions = range(len(r2_scores))
ax_ranking.barh(bar_positions, r2_scores.values)
ax_ranking.set_yticks(bar_positions)
ax_ranking.set_yticklabels(r2_scores.index)
ax_ranking.set_title('Model Performance (R² Score)')
ax_ranking.set_xlabel('R² Score')

# Predicted vs Actual; the dashed red diagonal marks perfect predictions
best_pred = best_model.predict(X_test)
diagonal = [y_test.min(), y_test.max()]
ax_fit.scatter(y_test, best_pred, alpha=0.6)
ax_fit.plot(diagonal, diagonal, 'r--')
ax_fit.set_xlabel('Actual')
ax_fit.set_ylabel('Predicted')
ax_fit.set_title(f'Predicted vs Actual ({best_model_name})')

plt.tight_layout()
plt.show()

## 🔍 Feature Importance & SHAP

In [None]:
# Feature importance (only for estimators that expose feature_importances_)
if hasattr(best_model, 'feature_importances_'):
    importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False).head(10)

    plt.figure(figsize=(10, 6))
    plt.barh(range(len(importance_df)), importance_df['importance'])
    plt.yticks(range(len(importance_df)), importance_df['feature'])
    plt.title(f'Top 10 Feature Importance ({best_model_name})')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()

# SHAP analysis (only run when the best model is one of the two tree ensembles)
if best_model_name in ['Random Forest', 'XGBoost']:
    # Draw the explanation sample once and cap it at the test-set size:
    # DataFrame.sample raises ValueError when asked for more rows than exist.
    shap_sample = X_test.sample(n=min(100, len(X_test)), random_state=42)

    explainer = shap.TreeExplainer(best_model)
    shap_values = explainer.shap_values(shap_sample)

    plt.figure(figsize=(10, 6))
    shap.summary_plot(shap_values, shap_sample,
                      plot_type="bar", max_display=10, show=False)
    plt.title('SHAP Feature Importance')
    plt.tight_layout()
    plt.show()

    print("Top SHAP features:")
    # Mean absolute SHAP value per feature, averaged over the sampled rows
    mean_shap = np.abs(shap_values).mean(axis=0)
    shap_importance = pd.DataFrame({
        'feature': shap_sample.columns,
        'shap_value': mean_shap
    }).sort_values('shap_value', ascending=False)

    for i, (_, row) in enumerate(shap_importance.head(5).iterrows(), 1):
        print(f"{i}. {row['feature']}: {row['shap_value']:.4f}")

## 💡 Insights & Recommendations

In [None]:
# Key insights
print("🔍 KEY INSIGHTS:")
print("=" * 40)

# Route impact: mean target value per production route
route_impact = df.groupby('route')[target].mean()
recycled_avg = route_impact['recycled']
print(f"Recycled route: {recycled_avg:.1f} avg sustainability")
raw_avg = route_impact['raw']
print(f"Raw route: {raw_avg:.1f} avg sustainability")
print(f"Improvement potential: {recycled_avg - raw_avg:.1f} points")

# Metal comparison: mean target value per metal
metal_impact = df.groupby('metal')[target].mean()
print("\nMetal comparison:")
for metal_name, mean_score in metal_impact.items():
    print(f"• {metal_name.title()}: {mean_score:.1f} sustainability score")

# Fixed list of recommendations (not computed from the data)
print("\n🎯 RECOMMENDATIONS:")
recommendations = (
    "1. Prioritize recycled content to boost sustainability",
    "2. Optimize energy efficiency (reduce electricity/kg)",
    "3. Improve end-of-life recovery systems",
    "4. Consider renewable energy sources",
    "5. Minimize transport distances",
)
for recommendation in recommendations:
    print(recommendation)

# Headline metrics of the best model, read from the results table
print("\n🤖 MODEL SUMMARY:")
best_metrics = results_df.loc[best_model_name]
print(f"Best model explains {best_metrics['R²']:.1%} of sustainability variance")
print(f"Prediction accuracy: ±{best_metrics['MAE']:.1f} sustainability points")

## 💾 Export Results

In [None]:
# Save model and results: the best estimator and the scaler are serialized
# with joblib, the metrics table is written as CSV.
model_path = 'best_lca_model.joblib'
scaler_path = 'feature_scaler.joblib'
metrics_path = 'model_results.csv'

joblib.dump(best_model, model_path)
joblib.dump(scaler, scaler_path)
results_df.to_csv(metrics_path)

print("✅ Files saved:")
print("• best_lca_model.joblib (trained model)")
print("• feature_scaler.joblib (preprocessing)")
print("• model_results.csv (performance metrics)")

# Download files through Colab, in the same order they were written
for artifact_path in (model_path, scaler_path, metrics_path):
    files.download(artifact_path)

print("\n🏆 HACKATHON PROJECT COMPLETE!")
print("Successfully built AI model for LCA sustainability prediction")