# In [None]:
import pandas as pd
import json
import numpy as np
import os
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# --- Load your real dataset ---
file_path = os.path.join("..", "data", "lca_dataset.csv")

if not os.path.exists(file_path):
    raise FileNotFoundError(f"Dataset not found at {file_path}")

df_training = pd.read_csv(file_path)

# 1. Separate categorical and numeric columns
categorical_cols = df_training.select_dtypes(include=['object']).columns.tolist()
numeric_cols = df_training.select_dtypes(include=[np.number]).columns.tolist()

# 2. Encode all categorical columns
# Only observed values are label-encoded. Missing cells stay NaN so the
# imputer treats them as missing; casting the whole column with astype(str)
# would turn NaN into the literal category "nan", which the imputer could
# then learn and predict as if it were a real label.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    observed = df_training[col].notna()
    labels = df_training.loc[observed, col].astype(str)
    codes = pd.Series(np.nan, index=df_training.index, dtype="float64")
    codes.loc[observed] = le.fit_transform(labels)
    df_training[col] = codes
    label_encoders[col] = le

# 3. Fit Iterative Imputer
# Every column (encoded categoricals included) is modelled from the others
# with a random forest; the fitted imputer is reused for user records.
imputer = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=50, random_state=42),
    max_iter=10,
    random_state=0
)
imputer.fit(df_training)

def _encode_label(value, code_by_label):
    """Return the integer code for a category label, or NaN if missing/unknown."""
    # JSON arrays/objects can never be a category label; checking them first
    # also keeps pd.isna() from returning an array instead of a bool.
    if isinstance(value, (list, dict)) or pd.isna(value):
        return np.nan
    # Training encoded str(value), so look up the same string form here.
    return code_by_label.get(str(value), np.nan)


def autofill_lca_data(json_input):
    """
    Autofill the missing fields of one user-supplied LCA record.

    The record is aligned to the training columns, its categorical values are
    label-encoded with the encoders fitted at module level, the gaps are
    filled by the module-level ``imputer``, and the categorical columns are
    decoded back to their labels.

    Relies on the module-level ``df_training``, ``categorical_cols``,
    ``label_encoders`` and ``imputer``.

    Parameters
    ----------
    json_input : str
        JSON text holding a single object that maps column names to values.
        Columns that are absent, ``null``, or hold a category not seen in
        training are treated as missing and imputed. Keys that are not
        training columns are dropped.

    Returns
    -------
    pandas.DataFrame
        One row with the same columns as ``df_training``; categorical
        columns contain labels (or "Unknown" when the imputed code does not
        correspond to any known class).

    Raises
    ------
    ValueError
        If ``json_input`` is not valid JSON (``json.JSONDecodeError`` is a
        ``ValueError``) or does not decode to a JSON object.
    """
    # 1. Load the JSON input into a single-row DataFrame
    user_data = json.loads(json_input)
    if not isinstance(user_data, dict):
        # A list or scalar would build a frame with none of the expected
        # columns, and every returned value would silently be invented.
        raise ValueError(
            "json_input must decode to a JSON object mapping column names "
            f"to values, got {type(user_data).__name__}"
        )
    df_user = pd.DataFrame([user_data])

    # --- Rule-based default example (optional, can add more rules) ---
    if "Process Stage" in df_user.columns and df_user["Process Stage"].iloc[0] == "End-of-Life":
        if "End-of-Life Treatment" not in df_user or pd.isna(df_user["End-of-Life Treatment"]).any():
            df_user["End-of-Life Treatment"] = "Recycling"

    # 2. Align columns with training set (extra keys dropped, absent ones NaN)
    df_user = df_user.reindex(columns=df_training.columns, fill_value=np.nan)

    # 3. Encode categorical columns
    # After the reindex every training column is present, so no membership
    # check is needed. The lookup table is built once per column rather than
    # calling LabelEncoder.transform for every value.
    for col in categorical_cols:
        code_by_label = {
            str(label): code
            for code, label in enumerate(label_encoders[col].classes_)
        }
        df_user[col] = [
            _encode_label(value, code_by_label) for value in df_user[col]
        ]

    # 4. Apply imputation
    imputed_array = imputer.transform(df_user)
    df_imputed = pd.DataFrame(imputed_array, columns=df_user.columns)

    # 5. Decode categorical columns
    for col in categorical_cols:
        valid_classes = label_encoders[col].classes_
        codes = df_imputed[col].round().astype(int)
        # Bound the code on both sides: a negative index would otherwise wrap
        # around and silently return a label from the end of the class list.
        df_imputed[col] = [
            valid_classes[code] if 0 <= code < len(valid_classes) else "Unknown"
            for code in codes
        ]

    return df_imputed

# Example usage: a partially specified record. The fields set to null here
# are passed to autofill_lca_data as missing values.
user_json = '''
{
  "Process Stage": "Use",
  "Technology": "Advanced",
  "Location": "Europe",
  "Raw Material Quantity (kg or unit)": null,
  "Energy Input Quantity (MJ)": null,
  "Transport Distance (km)": 500,
  "Emissions to Air CO2 (kg)": null
}
'''

final_df = autofill_lca_data(user_json)
# One call, header and frame separated by a newline.
print("Final Autofilled DataFrame:", final_df, sep="\n")
