# Labour Cost Index (LCI) Forecasting - Simplified

## Overview
This notebook provides a streamlined pipeline for forecasting Labour Cost Index (LCI) changes using time series analysis and machine learning. It generates predictions for multiple horizons with clear visualizations and confidence intervals.

**Key Features:**
- Automated data preprocessing and feature engineering
- Multiple prediction horizons (1, 3, 6, 9 quarters)
- Interactive visualizations with confidence intervals
- Error handling and robust trial management
- Cross-country economic indicators analysis

## Quick Start
1. Set up your `.env` file with evoML credentials
2. Run all cells in sequence
3. View the final prediction summary and visualizations

## Setup
### Dependencies
- `turintech-evoml-client`
- `pandas`, `numpy`, `matplotlib`, `plotly`
- `python-dotenv`

### Environment Setup
Create a `.env` file in the project root:
```
EVOML_USERNAME=your_username_here
EVOML_PASSWORD=your_password_here
```


In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import evoml_client as ec
from evoml_client.trial_conf_models import BudgetMode, SplitMethodOptions
import os
from dotenv import load_dotenv
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# Load environment variables
load_dotenv()

# Configuration
API_URL = "https://evoml.ai"
EVOML_USERNAME = os.getenv("EVOML_USERNAME", "")
EVOML_PASSWORD = os.getenv("EVOML_PASSWORD", "")

# Initialize evoML client
try:
    ec.init(base_url=API_URL, username=EVOML_USERNAME, password=EVOML_PASSWORD)
    print("✅ Successfully connected to evoML platform")
except Exception as e:
    print(f"❌ Failed to connect to evoML: {e}")
    print("Please check your credentials in the .env file")


  from .autonotebook import tqdm as notebook_tqdm


✅ Successfully connected to evoML platform


In [3]:
load_dotenv()

API_URL = "https://evoml.ai"
EVOML_USERNAME = os.getenv("EVOML_USERNAME")
EVOML_PASSWORD = os.getenv("EVOML_PASSWORD")

ec.init(base_url=API_URL, username=EVOML_USERNAME, password=EVOML_PASSWORD)

True

In [4]:
# Load dataset
df = pd.read_csv("../data/processed/economic_indicators_quarterly_yoy.csv")
df["time"] = pd.to_datetime(df["time"])

df

Unnamed: 0,country,time,gdp_pct_change_yoy,u_pct_change,hicp_pct_change,LCI_pct_change
0,Austria,2002-03-31,1.512650,19.014085,1.703131,4.8
1,Austria,2002-06-30,1.839953,2.564103,1.494283,4.4
2,Austria,2002-09-30,1.684049,8.333333,1.611436,5.7
3,Austria,2002-12-31,0.874297,1.785714,1.682412,3.9
4,Austria,2003-03-31,0.081322,5.325444,1.842071,3.5
...,...,...,...,...,...,...
2780,United Kingdom,2019-09-30,1.360822,-6.563422,1.782364,4.1
2781,United Kingdom,2019-12-31,1.209509,-0.932401,1.307190,2.6
2782,United Kingdom,2020-03-31,-2.384954,5.440000,1.495327,5.8
2783,United Kingdom,2020-06-30,-20.799547,6.729264,0.648749,22.3


In [5]:
target = "LCI_pct_change"
lead_num = 3  # Define how many periods ahead to forecast (1 = next quarter, 2 = two quarters ahead, etc.)


def generate_lags(df, max_lags):
    """
    Generate multiple lag features for numeric columns in the dataframe

    Args:
        df: pandas DataFrame
        max_lags: maximum number of lags to generate

    Returns:
        DataFrame with lag features added
    """
    df_copy = df.copy()

    for col in df.columns:
        # Skip lag generation for country, time, and target column
        if col not in ["country", "time", target]:
            for lag in range(1, max_lags + 1):
                # Group by country to ensure lags are calculated within each country
                df_copy[f"{col}_lag_{lag}"] = df_copy.groupby("country")[col].shift(lag)

    return df_copy


# Generate lags up to 4 periods
df = generate_lags(df, max_lags=4)

print(df.head())

   country       time  gdp_pct_change_yoy  u_pct_change  hicp_pct_change  \
0  Austria 2002-03-31            1.512650     19.014085         1.703131   
1  Austria 2002-06-30            1.839953      2.564103         1.494283   
2  Austria 2002-09-30            1.684049      8.333333         1.611436   
3  Austria 2002-12-31            0.874297      1.785714         1.682412   
4  Austria 2003-03-31            0.081322      5.325444         1.842071   

   LCI_pct_change  gdp_pct_change_yoy_lag_1  gdp_pct_change_yoy_lag_2  \
0             4.8                       NaN                       NaN   
1             4.4                  1.512650                       NaN   
2             5.7                  1.839953                  1.512650   
3             3.9                  1.684049                  1.839953   
4             3.5                  0.874297                  1.684049   

   gdp_pct_change_yoy_lag_3  gdp_pct_change_yoy_lag_4  u_pct_change_lag_1  \
0                       NaN

In [6]:
def generate_lead_target(df, lead_periods):
    """
    Generate a lead target column for the dataframe

    Args:
        df: pandas DataFrame
        lead_periods: number of periods ahead to forecast

    Returns:
        DataFrame with lead target column added
    """
    df_copy = df.copy()
    # Group by country to ensure lead is calculated within each country
    df_copy[f"{target}_lead_{lead_periods}"] = df_copy.groupby("country")[target].shift(
        -lead_periods
    )
    return df_copy


df = generate_lead_target(df, lead_periods=lead_num)

lead_target = f"{target}_lead_{lead_num}"

# drop original target column to prevent data leakage.
df = df.drop(columns=[target])

# --- Check if lags exist within the target variable ---
lead_lag_cols = [col for col in df.columns if col.startswith(f"{lead_target}_lag")]
if lead_lag_cols:
    df = df.drop(columns=lead_lag_cols)
    print(f"Dropped lead lag columns: {lead_lag_cols}")


# Sort by time in order to ensure continuous time series and a representations of countries.
df = df.sort_values("time").reset_index(drop=True)
print(f"\nData sorted by time. Date range: {df['time'].min()} to {df['time'].max()}")
print(f"First few dates: {df['time'].head(10).tolist()}")
print(f"Last few dates: {df['time'].tail(10).tolist()}")

# remove rows with NaN values
df = df.dropna()


Data sorted by time. Date range: 2002-03-31 00:00:00 to 2025-03-31 00:00:00
First few dates: [Timestamp('2002-03-31 00:00:00'), Timestamp('2002-03-31 00:00:00'), Timestamp('2002-03-31 00:00:00'), Timestamp('2002-03-31 00:00:00'), Timestamp('2002-03-31 00:00:00'), Timestamp('2002-03-31 00:00:00'), Timestamp('2002-03-31 00:00:00'), Timestamp('2002-03-31 00:00:00'), Timestamp('2002-03-31 00:00:00'), Timestamp('2002-03-31 00:00:00')]
Last few dates: [Timestamp('2025-03-31 00:00:00'), Timestamp('2025-03-31 00:00:00'), Timestamp('2025-03-31 00:00:00'), Timestamp('2025-03-31 00:00:00'), Timestamp('2025-03-31 00:00:00'), Timestamp('2025-03-31 00:00:00'), Timestamp('2025-03-31 00:00:00'), Timestamp('2025-03-31 00:00:00'), Timestamp('2025-03-31 00:00:00'), Timestamp('2025-03-31 00:00:00')]


In [7]:
# Upload dataset into evoml:
dataset = ec.Dataset.from_pandas(df, name="Economic Indicators")
dataset.put()
dataset.wait()

print(f"Dataset URL: {API_URL}/platform/datasets/view/{dataset.dataset_id}")


Dataset URL: https://evoml.ai/platform/datasets/view/68ca9e2ae5732dd64210a63b


In [8]:
config = ec.TrialConfig.with_models(
    models=[
        "ridge_regressor",
        # "bayesian_ridge_regressor",
        # "linear_regressor",
        # "lasso_regressor",
    ],
    task=ec.MlTask.regression,
    budget_mode=BudgetMode.fast,
    loss_funcs=["Root Mean Squared Error"],
    dataset_id=dataset.dataset_id,
)

config.options.splittingMethodOptions = SplitMethodOptions(
    method="percentage", 
    trainPercentage=0.8
)
config.options.enableBudgetTuning = False
# config.options.validationMethodOptions = ValidationMethodOptions(
#     method=ValidationMethod.holdout,
#     holdoutOptions=HoldoutOptions(size=0.2, keepOrder=True),
# )

trial, _ = ec.Trial.from_dataset_id(
    dataset.dataset_id,
    target_col=lead_target,  # Use the lead target for forecasting
    trial_name=f"Labour_cost_forecast_{lead_num}_period_ahead",
    config=config,
)

trial.run(timeout=900)

100%|██████████| 346/346 [00:00<00:00, 11780.22kb/s]


Couldnt match any status: ,status ispending
Couldnt match any status: ,status ispending


True

In [9]:
best_model = trial.get_best()


[32m2025-09-17 12:42:28.078[0m | [1mINFO    [0m | [36mevoml_client.pipeline[0m:[36mget_pipeline_report_when_ready[0m:[36m59[0m - [1mWaiting for pipeline report with id af80e3b0-56d6-40b8-8421-0e68ea1fd0b0 to be ready.[0m


BaseGenericException: Expecting value: line 1 column 1 (char 0)

In [None]:
# Build the model
best_model.build_model()

In [None]:
# Extract and display results properly
try:
    # Get metrics dataframe
    metrics_df = trial.get_metrics_dataframe()
    print("📊 Trial Metrics:")
    print(metrics_df)
    
    # Get best model info
    best_model = trial.get_best()
    best_model.build_model()
    
    # Extract model representation
    model_rep = best_model.model_rep
    model_rep_dict = model_rep.__dict__
    
    print(f"\n🏆 Best Model: {model_rep_dict.get('name', 'Unknown')}")
    
    # Extract metrics from model representation
    metrics = model_rep_dict.get('metrics', {})
    if 'regression-mse' in metrics:
        mse_metrics = metrics['regression-mse']
        if 'test' in mse_metrics:
            test_mse = mse_metrics['test'].get('average')
            if test_mse is not None:
                rmse = np.sqrt(test_mse)
                print(f"📈 RMSE: {rmse:.4f}")
                print(f"📈 MSE: {test_mse:.4f}")
            else:
                print("⚠️  Test MSE not available")
        else:
            print("⚠️  Test metrics not available")
    else:
        print("⚠️  Regression metrics not available")
    
    # Display all available metrics
    print(f"\n📋 All available metrics:")
    for metric_name, metric_data in metrics.items():
        print(f"  {metric_name}: {metric_data}")
        
except Exception as e:
    print(f"❌ Error extracting results: {e}")
    print("This might be due to the trial still running or an issue with result extraction")
    
    # Try to get basic trial info
    try:
        print(f"\n🔍 Trial state: {trial.state}")
        print(f"🔍 Trial ID: {trial.trial_id}")
    except:
        print("Could not get trial state information")


In [None]:
# Extract and display results properly
try:
    # Get metrics dataframe
    metrics_df = trial.get_metrics_dataframe()
    print("📊 Trial Metrics:")
    print(metrics_df)
    
    # Get best model info
    best_model = trial.get_best()
    best_model.build_model()
    
    # Extract model representation
    model_rep = best_model.model_rep
    model_rep_dict = model_rep.__dict__
    
    print(f"\n🏆 Best Model: {model_rep_dict.get('name', 'Unknown')}")
    
    # Extract metrics from model representation
    metrics = model_rep_dict.get('metrics', {})
    if 'regression-mse' in metrics:
        mse_metrics = metrics['regression-mse']
        if 'test' in mse_metrics:
            test_mse = mse_metrics['test'].get('average')
            if test_mse is not None:
                rmse = np.sqrt(test_mse)
                print(f"📈 RMSE: {rmse:.4f}")
                print(f"📈 MSE: {test_mse:.4f}")
            else:
                print("⚠️  Test MSE not available")
        else:
            print("⚠️  Test metrics not available")
    else:
        print("⚠️  Regression metrics not available")
    
    # Display all available metrics
    print(f"\n📋 All available metrics:")
    for metric_name, metric_data in metrics.items():
        print(f"  {metric_name}: {metric_data}")
        
except Exception as e:
    print(f"❌ Error extracting results: {e}")
    print("This might be due to the trial still running or an issue with result extraction")
    
    # Try to get basic trial info
    try:
        print(f"\n🔍 Trial state: {trial.state}")
        print(f"🔍 Trial ID: {trial.trial_id}")
    except:
        print("Could not get trial state information")
