# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from scipy import stats


# Read the LCA dataset from its fixed local path and preview the first rows.
df = pd.read_csv(
    'C:\\Users\\ommah\\Downloads\\detailed_dummy_lca_dataset_with_patterns.csv'
)
print(df.head())
print(df.shape)

# Bare expression: it only displays something when run as an interactive cell.
df.columns

# Current shape followed by the number of missing values in every column.
print(df.shape)
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Shape before and after removing exact duplicate rows.
print(df.shape)
df = df.drop_duplicates()
print(df.shape)

# Column dtypes, then the describe() summary statistics.
column_types = df.dtypes
print(column_types)
summary_stats = df.describe()
print(summary_stats)

# Feature engineering: derive ratio, aggregate and time features on a copy of
# df, report the shapes, then rebind df to the enriched frame.
df_enhanced = df.copy()

# Shared denominator for the per-material ratios. The +1 offset is presumably
# there to avoid dividing by zero when the quantity is 0.
material_qty_plus_one = df_enhanced['Raw Material Quantity (kg or unit)'] + 1

df_enhanced['Energy_per_Material'] = df_enhanced['Energy Input Quantity (MJ)'] / material_qty_plus_one

# Row-wise totals over every column whose name mentions air / water emissions.
emission_cols = [col for col in df_enhanced.columns if 'Emissions to Air' in col]
df_enhanced['Total_Air_Emissions'] = df_enhanced[emission_cols].sum(axis=1)
water_emission_cols = [col for col in df_enhanced.columns if 'Emissions to Water' in col]
df_enhanced['Total_Water_Emissions'] = df_enhanced[water_emission_cols].sum(axis=1)

# Mean of whichever circularity percentage columns are present (needs >= 2).
# NOTE(review): these are the same three columns this file later uses as
# prediction targets, so Circularity_Score must not be fed to a model as an
# input feature.
circularity_cols = ['Recycled Content (%)', 'Reuse Potential (%)', 'Recovery Rate (%)']
available_circularity_cols = [col for col in circularity_cols if col in df_enhanced.columns]
if len(available_circularity_cols) >= 2:
    df_enhanced['Circularity_Score'] = df_enhanced[available_circularity_cols].mean(axis=1)

df_enhanced['Transport_Intensity'] = df_enhanced['Transport Distance (km)'] / material_qty_plus_one
df_enhanced['GHG_per_Material'] = df_enhanced['Greenhouse Gas Emissions (kg CO2-eq)'] / material_qty_plus_one

# First 4-digit run in 'Time Period' as a number; rows without one become NaN.
# The pattern is a raw string so that \d is not treated as an (invalid) string
# escape sequence.
df_enhanced['Time_Period_Numeric'] = pd.to_numeric(
    df_enhanced['Time Period'].str.extract(r'(\d{4})')[0], errors='coerce'
)

print(df.shape)
print(df_enhanced.shape)
df = df_enhanced

# Encode the categorical columns on a copy of df, keeping one fitted
# LabelEncoder per column in label_encoders.
df_encoded = df.copy()
label_encoders = {}
categorical_cols = [
    'Process Stage',
    'Technology',
    'Location',
    'Raw Material Type',
    'Energy Input Type',
    'Transport Mode',
    'Fuel Type',
    'Time Period',
    'Functional Unit',
    'End-of-Life Treatment',
]
for col in categorical_cols:
    # Label encoding is optional for XGBoost's native categorical support, but
    # the fitted encoders are kept in case they are needed elsewhere.
    le = LabelEncoder()
    codes = le.fit_transform(df_encoded[col])
    label_encoders[col] = le
    # Store the integer codes with pandas 'category' dtype for XGBoost.
    df_encoded[col] = pd.Categorical(codes)
print(df_encoded.dtypes)  # To verify categories
df = df_encoded


# Outlier handling is deferred: it will be added only if model performance is
# not up to the mark. (The original note names the method "ICR" - presumably
# IQR-based filtering; TODO confirm.)
for frame_view in (df.columns, df.dtypes):
    print(frame_view)


# target_cols = ['Recycled Content (%)', 'Reuse Potential (%)', 'Recovery Rate (%)']

# X = df.drop(columns=target_cols)
# y = df[target_cols]

# X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.25, random_state=42, shuffle=True
# )
# for target in target_cols:
#     print(f"\nTraining model for target: {target}")
    
#     xgb_model = XGBRegressor(
#         n_estimators=300,  
#         max_depth=7,
#         learning_rate=0.03,  
#         subsample=0.7,
#         colsample_bytree=0.7,
#         reg_alpha=0.1,
#         reg_lambda=1,
#         random_state=42,
#         min_child_weight=50,
#         enable_categorical=True,
#         objective='reg:squarederror',
#         n_jobs=-1
#     )
    
#     # Fit model on categorical feature aware DataFrame
#     xgb_model.fit(X_train, y_train[target])
    
#     # Predictions
#     y_pred_train = xgb_model.predict(X_train)
#     y_pred_test = xgb_model.predict(X_test)
    
#     # Evaluation function
#     def evaluate_model(y_true, y_pred, dataset_name):
#         mse = mean_squared_error(y_true, y_pred)
#         rmse = np.sqrt(mse)
#         mae = mean_absolute_error(y_true, y_pred)
#         r2 = r2_score(y_true, y_pred)
#         mbe = np.mean(y_pred - y_true)
        
#         print(f"\n{dataset_name} Performance for {target}:")
#         print(f"  RMSE: {rmse}")
#         print(f"  MAE:  {mae}")
#         print(f"  R²:   {r2}")
#         print(f"Mean Bias Error: {mbe}")
    
#     # Evaluate
#     evaluate_model(y_train[target], y_pred_train, "Training")
#     evaluate_model(y_test[target], y_pred_test, "Test")


from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Define target columns
target_cols = ['Recycled Content (%)', 'Reuse Potential (%)', 'Recovery Rate (%)']

# Engineered columns that are computed from the targets themselves.
# 'Circularity_Score' is the row-wise mean of the three target columns, so
# leaving it in X would leak the answer into the features and inflate the
# test metrics. It is dropped from the features only; df keeps the column.
target_derived_cols = [col for col in ['Circularity_Score'] if col in df.columns]

# Prepare features and targets
X = df.drop(columns=target_cols + target_derived_cols)
y = df[target_cols]

# Train-test split (75/25, shuffled, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, shuffle=True
)

# One XGBRegressor per target. The settings below are identical for all three
# models; only tree count, depth and learning rate differ per target.
shared_xgb_params = dict(
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42,
    enable_categorical=True,
    objective='reg:squarederror',
    n_jobs=-1,
)

model_recycled_content = XGBRegressor(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.04,
    **shared_xgb_params,
)

model_reuse_potential = XGBRegressor(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.01,
    **shared_xgb_params,
)

model_recovery_rate = XGBRegressor(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.03,
    **shared_xgb_params,
)


def evaluate_model(y_true, y_pred, dataset_name, target):
    """Print RMSE, MAE, R² and mean bias error for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, compared element-wise against ``y_true``.
        dataset_name: Label used in the report header (e.g. "Test").
        target: Name of the target column, used in the report header.

    Returns:
        None. The metrics are only printed.
    """
    # RMSE is the square root of scikit-learn's mean squared error.
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    abs_error = mean_absolute_error(y_true, y_pred)
    r_squared = r2_score(y_true, y_pred)
    # Positive bias means the predictions are, on average, above the truth.
    bias = np.mean(y_pred - y_true)

    report_lines = (
        f"\n{dataset_name} Performance for {target}:",
        f"  RMSE: {root_mse}",
        f"  MAE:  {abs_error}",
        f"  R²:   {r_squared}",
        f"Mean Bias Error: {bias}",
    )
    print(*report_lines, sep="\n")

# Fit each per-target model on the training split, predict on both splits and
# report the test metrics. Models are processed in the same order as before:
# recycled content, reuse potential, recovery rate.
for target_model, target_name in (
    (model_recycled_content, 'Recycled Content (%)'),
    (model_reuse_potential, 'Reuse Potential (%)'),
    (model_recovery_rate, 'Recovery Rate (%)'),
):
    target_model.fit(X_train, y_train[target_name])
    y_train_pred = target_model.predict(X_train)
    y_test_pred = target_model.predict(X_test)
    # Training-set report is currently disabled:
    # evaluate_model(y_train[target_name], y_train_pred, "Training", target_name)
    evaluate_model(y_test[target_name], y_test_pred, "Test", target_name)


# Persist every trained model as a JSON file in the current working directory.
for trained_model, output_path in (
    (model_recycled_content, 'model_recycled_content.json'),
    (model_reuse_potential, 'model_reuse_potential.json'),
    (model_recovery_rate, 'model_recovery_rate.json'),
):
    trained_model.save_model(output_path)

print("Models saved successfully.")

# To load the models later:
# from xgboost import XGBRegressor

# loaded_model_recycled_content = XGBRegressor()
# loaded_model_recycled_content.load_model('model_recycled_content.json')

# loaded_model_reuse_potential = XGBRegressor()
# loaded_model_reuse_potential.load_model('model_reuse_potential.json')

# loaded_model_recovery_rate = XGBRegressor()
# loaded_model_recovery_rate.load_model('model_recovery_rate.json')

# print("Models loaded successfully.")


