# Model training 

## 1. Introduction

This project aims to predict **weld quality** using different Machine Learning (ML) models based on process and material parameters.  

The notebook includes: 
1. Introduction    
2. Setup  
3. Model definitions  
4. Model evaluation (training, metrics and cross-validation)  
5. Model comparison   
6. Model interpretation  
7. Final conclusions and recommendations for improving weld quality  


## 2. Setup

In [None]:
# Imports

import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from utils_pre_processing import full_preprocessing


In [None]:
# Load and preprocess data.
# full_preprocessing comes from the project-local utils_pre_processing module;
# its five return values are unpacked here as the train/test features, the
# train/test targets and the feature names (presumably one name per feature
# column - confirm against utils_pre_processing).
X_train, X_test, y_train, y_test, feature_names = full_preprocessing("data/welding_data.csv")

## 3. Model definitions

In [None]:
def get_models():
    """Build the candidate regressors compared in this notebook.

    Every call creates fresh, unfitted estimators, so the returned objects
    can be fitted without affecting earlier results.

    Returns:
        dict: display name -> estimator, in the order XGBoost, LightGBM,
        Random Forest, Gradient Boosting.
    """
    seed = 42

    # Settings shared by the three boosting-style models.
    boosting_kwargs = dict(
        n_estimators=500,
        learning_rate=0.05,
        subsample=0.8,
        random_state=seed,
    )
    # Leaf/split constraints shared by the two scikit-learn ensembles.
    leaf_kwargs = dict(min_samples_split=5, min_samples_leaf=2)

    xgb_model = XGBRegressor(
        max_depth=6,
        colsample_bytree=0.8,
        n_jobs=-1,
        verbosity=0,
        **boosting_kwargs,
    )

    # NOTE(review): LightGBM appears to apply `subsample` only when
    # `subsample_freq` > 0 - confirm against the LightGBM parameter docs.
    lgbm_model = LGBMRegressor(
        max_depth=6,
        num_leaves=31,
        colsample_bytree=0.8,
        n_jobs=-1,
        verbose=-1,
        **boosting_kwargs,
    )

    forest_model = RandomForestRegressor(
        n_estimators=500,
        max_depth=None,
        max_features='sqrt',
        random_state=seed,
        n_jobs=-1,
        bootstrap=True,
        **leaf_kwargs,
    )

    gbr_model = GradientBoostingRegressor(
        max_depth=5,
        **boosting_kwargs,
        **leaf_kwargs,
    )

    return {
        'XGBoost': xgb_model,
        'LightGBM': lgbm_model,
        'Random Forest': forest_model,
        'Gradient Boosting': gbr_model,
    }

## 4. Model Evaluation

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, name):
    """
    Train and evaluate a single model.

    The model is fitted on the training data, used to predict both the
    training and the test set, scored on several regression metrics, and
    finally cross-validated (5 folds, R²) on the training data.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; fitted in place.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.
        name: Display name stored under the ``'Model'`` key.

    Returns:
        dict: model performance metrics and timings, with keys
        ``Model``, ``Train_R2``, ``Test_R2``, ``Test_RMSE``, ``Test_MAE``,
        ``Test_MAPE`` (percent), ``CV_R2_mean``, ``CV_R2_std``,
        ``Overfitting`` (train R² minus test R²), ``Train_time_sec``,
        ``Predict_time_sec`` and ``predictions_test``.
    """
    # Training (perf_counter is monotonic, unlike the wall clock)
    start = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start

    # Predictions - the timing covers BOTH the train and the test set
    start = time.perf_counter()
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    pred_time = time.perf_counter() - start

    # Metrics
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    test_mae = mean_absolute_error(y_test, y_pred_test)

    # MAPE is undefined where the true value is 0 (division by zero would
    # yield inf/NaN), so it is averaged over non-zero targets only.
    # Working on flat float arrays also avoids pandas index alignment.
    y_true = np.asarray(y_test, dtype=float).ravel()
    y_pred = np.asarray(y_pred_test, dtype=float).ravel()
    nonzero = y_true != 0
    if nonzero.any():
        relative_errors = np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])
        test_mape = np.mean(relative_errors) * 100
    else:
        # No usable target: report "not available" instead of inf.
        test_mape = np.nan

    # Cross-validation on the training data only (test set stays untouched)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2', n_jobs=-1)

    return {
        'Model': name,
        'Train_R2': train_r2,
        'Test_R2': test_r2,
        'Test_RMSE': test_rmse,
        'Test_MAE': test_mae,
        'Test_MAPE': test_mape,
        'CV_R2_mean': cv_scores.mean(),
        'CV_R2_std': cv_scores.std(),
        'Overfitting': train_r2 - test_r2,
        'Train_time_sec': train_time,
        'Predict_time_sec': pred_time,
        'predictions_test': y_pred_test
    }


## 5. Model Comparison

In [None]:
def _plot_test_r2(results_df, target_name):
    """Show a bar chart of the test R² of every model."""
    plt.figure(figsize=(10, 5))
    plt.bar(results_df['Model'], results_df['Test_R2'], color='steelblue')
    plt.title(f'Model Comparison - {target_name}')
    plt.ylabel('Test R²')
    plt.xticks(rotation=30)
    plt.tight_layout()
    plt.show()


def _plot_train_vs_test_r2(results_df):
    """Show train R² against test R²; points below the diagonal overfit."""
    plt.figure(figsize=(8, 6))
    plt.scatter(results_df['Train_R2'], results_df['Test_R2'], s=100)
    plt.plot([0, 1], [0, 1], 'r--', label='Ideal (Train=Test)')
    for _, row in results_df.iterrows():
        plt.annotate(row['Model'], (row['Train_R2'], row['Test_R2']))
    plt.xlabel('Train R²')
    plt.ylabel('Test R²')
    plt.legend()
    plt.title('Overfitting Analysis')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


def compare_models(X_train, X_test, y_train, y_test, target_name='Target'):
    """
    Train and compare multiple regression models.

    Every model returned by ``get_models()`` is passed to
    ``evaluate_model``. A model that raises is reported and skipped so the
    remaining models are still compared. On success the metrics are
    printed, plotted and written to ``model_results_<target_name>.csv``.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.
        target_name: Label used in the printed header, the plot title and
            the CSV file name.

    Returns:
        results_df: DataFrame with all metrics, sorted by descending test
            R² (empty if no model could be evaluated)
        predictions_dict: test predictions for each model
    """
    print(f"\n{'='*80}\nModel Comparison - {target_name}\n{'='*80}")

    models = get_models()
    results, predictions = [], {}

    for name, model in models.items():
        print(f"\n Training {name}...", end=" ")
        try:
            res = evaluate_model(model, X_train, X_test, y_train, y_test, name)
        except Exception as e:
            # Best-effort: one failing model must not abort the comparison.
            print(f"Error: {e}")
        else:
            results.append(res)
            predictions[name] = res['predictions_test']
            print(f"R² = {res['Test_R2']:.4f} | RMSE = {res['Test_RMSE']:.2f}")

    if not results:
        # Without this guard, sorting an empty frame by 'Test_R2' raises an
        # opaque KeyError that hides the per-model errors printed above.
        print("\nNo model could be evaluated; nothing to compare.")
        return pd.DataFrame(), predictions

    results_df = pd.DataFrame(results).sort_values('Test_R2', ascending=False).reset_index(drop=True)

    # Display summary
    print("\nResults Summary (sorted by Test R²):")
    print(results_df[['Model', 'Test_R2', 'Test_RMSE', 'Test_MAE', 'CV_R2_mean', 'Overfitting']])

    # Visualization 1: Test R² comparison
    _plot_test_r2(results_df, target_name)

    # Visualization 2: Overfitting (Train vs Test R²)
    _plot_train_vs_test_r2(results_df)

    results_df.to_csv(f'model_results_{target_name}.csv', index=False)
    print(f"\nResults saved to: model_results_{target_name}.csv")

    return results_df, predictions

In [None]:
# Run the comparison for the yield-strength target. compare_models trains
# every model from get_models(), prints and plots the metrics, and saves the
# metrics table as model_results_yield_strength_mpa.csv.
results_df, predictions = compare_models(
    X_train, X_test, y_train, y_test, target_name='yield_strength_mpa'
)

## 6. Interpretation

In [None]:
def interpret_results(results_df, target_name='Target'):
    """
    Interpret the best model's performance using key metrics.

    The first row of ``results_df`` is treated as the best model, so the
    frame is expected to be sorted best-first (as ``compare_models``
    returns it). The metrics of that row are printed, followed by a short
    verdict: a train/test R² gap above 0.1 is flagged as possible
    overfitting; otherwise a MAPE below 10 % is reported as very good.

    Args:
        results_df: DataFrame with the columns ``Model``, ``Test_R2``,
            ``Test_RMSE``, ``Test_MAE``, ``Test_MAPE``, ``CV_R2_mean``,
            ``CV_R2_std`` and ``Overfitting``.
        target_name: Accepted for interface compatibility; it is not used
            in the printed output.

    Returns:
        None. Everything is printed to standard output.
    """
    if results_df.empty:
        # iloc[0] on an empty frame would raise IndexError.
        print("No model results available to interpret.")
        return

    best = results_df.iloc[0]
    print(f"\n{'='*80}\nResults Interpretation - {best['Model']}\n{'='*80}")
    print(f"R² (Test): {best['Test_R2']:.4f}")
    print(f"RMSE: {best['Test_RMSE']:.2f}")
    print(f"MAE: {best['Test_MAE']:.2f}")
    print(f"MAPE: {best['Test_MAPE']:.2f}%")
    print(f"Cross-Validation R²: {best['CV_R2_mean']:.4f} ± {best['CV_R2_std']:.4f}")
    print(f"Overfitting (Train-Test): {best['Overfitting']:.4f}")

    # The overfitting check takes precedence over the accuracy verdict.
    if best['Overfitting'] > 0.1:
        print(" The model may be overfitting. Try regularization or more data.")
    elif best['Test_MAPE'] < 10:
        print(" Very good prediction accuracy.")
    else:
        print(" Model performance acceptable but could be improved.")

In [None]:
interpret_results(results_df, 'yield_strength_mpa')

In [None]:
# Feature importance
# Refit a fresh instance of the best-ranked model (first row of results_df)
# so that its feature_importances_ attribute is available in this cell.
best_model_name = results_df.iloc[0]['Model']
model = get_models()[best_model_name]
model.fit(X_train, y_train)

# Label the importances with the feature names returned by
# full_preprocessing; no variable named `X` exists in this notebook, so the
# former `X.columns` raised a NameError.
importances = pd.Series(model.feature_importances_, index=feature_names)
importances.nlargest(10).plot(kind='barh', figsize=(8,5))
plt.title(f"Top Features - {best_model_name}")
plt.show()

## 7. Conclusion

### Most appropriate model
Among all tested models, **[insert best_model_name]** achieved the best generalization performance
(highest test R², with low RMSE and MAE).

### Insights
The most influential process parameters include **[variable1]**, **[variable2]**, **[variable3]**.

