In [None]:
import os
import pandas as pd
import numpy as np

import plotly.graph_objects as go
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Random seed constant.
# NOTE(review): not referenced by any visible cell (the Lasso cells hard-code
# random_state=42) — confirm whether it should be wired in or removed.
seed = 42

In [None]:
# Project folders, expressed relative to the notebook's working directory.
utils_folder = os.path.join("..", "..", "utils")

# Data tree: cleaned data, metadata and plot output live under the data folder.
data_folder = os.path.join("..", "..", "data")
clean_data_folder = os.path.join(data_folder, "Clean Data")
metadata_folder = os.path.join(data_folder, "Metadata")
plot_folder = os.path.join(data_folder, "Plots", "Feltre")

sensor_folder = os.path.join(clean_data_folder, "sensors")

# NOTE(review): only clean_data_folder is used by the visible cells; the other
# paths (including this folder name) may be leftovers — confirm before removing.
feltre_sqlites_folder = 'feltre_sqlites_second'

In [None]:
# Load the Feltre "second part" dataset from the clean-data folder into a DataFrame.
second_part_df = pd.read_excel(os.path.join(clean_data_folder, 'Feltre', 'second_part.xlsx'))

# Autocorrelation Analysis

In [None]:
second_part_df

In [None]:
# Target (output) columns of second_part_df; the commented-out entries are excluded.
# NOTE(review): later cells hard-code ['ICC [1/mL]', 'HNAC [1/mL]'] instead of reading this list.
target_variables = [
    'ICC [1/mL]',
    'HNAC [1/mL]',
    # 'LNAC [1/mL]',
    # 'HNAP [%]',
]

In [None]:
# Input (covariate) columns of second_part_df used by every analysis below.
input_variables = [
    'Pressione [atm]',
    'TOCeq [mg/l]',
    'DOCeq [mg/l]',
    'Turbidity [FTU]',
    'Conductivity [uS/cm]',
    'Temperature [°C]',
    'pH',
    'Free Chlorine [mg/l]',
    'Nitrate [mg/l]',
    'UV254 [1/m]',
]

## Inputs

In [None]:
# Plot the autocorrelation function of every input variable over all available lags.
for variable in input_variables:
    fig, ax = plt.subplots(figsize=(20, 12))
    plot_acf(
        second_part_df[variable],
        lags=second_part_df.shape[0]-1,
        title=variable,
        use_vlines=True,
        ax=ax
    )

    # '/' cannot appear in a file name, so keep a filesystem-safe label for saving.
    variable_ = variable.replace('/', '_')

    # plt.savefig(f'autocorrelation_{variable_}.png')

    # Render each figure as soon as it is drawn (same as the target-variable loop)
    # instead of keeping every large figure open until the cell finishes.
    plt.show()
    

## Output

In [None]:
# Autocorrelation of each target variable, using every lag the series allows.
for variable in target_variables:
    series = second_part_df[variable]
    fig, ax = plt.subplots(figsize=(20, 12))
    plot_acf(series, ax=ax, lags=len(second_part_df) - 1, title=variable, use_vlines=True)
    plt.show()
    

# Partial Autocorrelation Analysis

## Inputs

In [None]:
# Plot the partial autocorrelation function of every input variable (default lag count).
for variable in input_variables:
    fig, ax = plt.subplots(figsize=(15, 8))
    plot_pacf(
        second_part_df[variable],
        title=variable,
        use_vlines=True,
        ax=ax
    )

    # '/' cannot appear in a file name, so keep a filesystem-safe label for saving.
    variable_ = variable.replace('/', '_')

    # plt.savefig(f'p_autocorrelation_{variable_}.png')

    # Render each figure as soon as it is drawn (same as the target-variable loop)
    # instead of keeping every figure open until the cell finishes.
    plt.show()
    

## Output

In [None]:
# Partial autocorrelation of each target variable (default lag count).
for variable in target_variables:
    series = second_part_df[variable]
    fig, ax = plt.subplots(figsize=(15, 8))
    plot_pacf(series, ax=ax, title=variable, use_vlines=True)
    plt.show()
    

# ARIMA Orders Estimation

In [None]:
# Also compute the partial ACF, which is needed to evaluate the partial autocorrelation

In [None]:
import pmdarima as pm

In [None]:
# One row per series (inputs first, then the two targets); filled by the d=0 search.
fixed_d_results = pd.DataFrame(
    columns=['AR_order', 'I_order (fixed)', 'MA_order', 'AIC'],
    index=[*input_variables, 'ICC [1/mL]', 'HNAC [1/mL]'],
)

In [None]:
# One row per series (inputs first, then the two targets); filled by the d=0, q=0 search.
fixed_d_ma_results = pd.DataFrame(
    columns=['AR_order', 'I_order (fixed)', 'MA_order (fixed)', 'AIC'],
    index=[*input_variables, 'ICC [1/mL]', 'HNAC [1/mL]'],
)

In [None]:
input_variables

In [None]:
# One row per series (inputs first, then the two targets); filled by the unconstrained search.
no_fixed_results = pd.DataFrame(
    columns=['AR_order', 'I_order', 'MA_order', 'AIC'],
    index=[*input_variables, 'ICC [1/mL]', 'HNAC [1/mL]'],
)


In [None]:
# Non-seasonal stepwise ARIMA search per series with the differencing order pinned to 0;
# the chosen (p, d, q) and the model AIC are written to fixed_d_results.
for variable in input_variables + ['ICC [1/mL]', 'HNAC [1/mL]']:
    search_options = dict(
        seasonal=False,
        trace=False,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True,
        d=0,
        D=0,
        max_d=0,
        max_D=0,
    )
    model = pm.auto_arima(second_part_df[variable], **search_options)

    results = model.get_params()
    ar_order, i_order, ma_order = results['order']
    fixed_d_results.loc[variable, 'AR_order'] = ar_order
    fixed_d_results.loc[variable, 'I_order (fixed)'] = i_order
    fixed_d_results.loc[variable, 'MA_order'] = ma_order
    fixed_d_results.loc[variable, 'AIC'] = model.aic()

In [None]:
# Non-seasonal stepwise ARIMA search per series with both the differencing order and
# the MA order pinned to 0 (pure AR); results are written to fixed_d_ma_results.
for variable in input_variables + ['ICC [1/mL]', 'HNAC [1/mL]']:
    search_options = dict(
        seasonal=False,
        trace=False,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True,
        d=0,
        D=0,
        max_d=0,
        max_D=0,
        start_q=0,
        start_Q=0,
        max_q=0,
        max_Q=0,
    )
    model = pm.auto_arima(second_part_df[variable], **search_options)

    results = model.get_params()
    ar_order, i_order, ma_order = results['order']
    fixed_d_ma_results.loc[variable, 'AR_order'] = ar_order
    fixed_d_ma_results.loc[variable, 'I_order (fixed)'] = i_order
    fixed_d_ma_results.loc[variable, 'MA_order (fixed)'] = ma_order
    fixed_d_ma_results.loc[variable, 'AIC'] = model.aic()

In [None]:
# Non-seasonal stepwise ARIMA search per series with no order constraints;
# results are written to no_fixed_results.
for variable in input_variables + ['ICC [1/mL]', 'HNAC [1/mL]']:
    search_options = dict(
        seasonal=False,
        trace=False,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True,
    )
    model = pm.auto_arima(second_part_df[variable], **search_options)

    results = model.get_params()
    ar_order, i_order, ma_order = results['order']
    no_fixed_results.loc[variable, 'AR_order'] = ar_order
    no_fixed_results.loc[variable, 'I_order'] = i_order
    no_fixed_results.loc[variable, 'MA_order'] = ma_order
    no_fixed_results.loc[variable, 'AIC'] = model.aic()

## Results

In [None]:
fixed_d_results

In [None]:
fixed_d_ma_results

In [None]:
no_fixed_results

In [None]:
# For every series keep the ARIMA configuration with the lowest AIC among the three searches.
best_results_df = pd.DataFrame(
    columns=['AR_order', 'I_order', 'MA_order', 'AIC'],
    index=[*input_variables, 'ICC [1/mL]', 'HNAC [1/mL]'],
)

for variable in input_variables + ['ICC [1/mL]', 'HNAC [1/mL]']:
    candidate_rows = (
        no_fixed_results.loc[variable],
        fixed_d_results.loc[variable],
        fixed_d_ma_results.loc[variable],
    )
    best_result = min(candidate_rows, key=lambda row: row['AIC'])

    # The three frames label their columns differently, so copy the values by position.
    for position, column in enumerate(['AR_order', 'I_order', 'MA_order', 'AIC']):
        best_results_df.loc[variable, column] = best_result.iloc[position]

In [None]:
best_results_df

# Lag

In [None]:
def extend_features(df: pd.DataFrame, feature_name: str,  lags: int, rolling_window: int):
    """
    Append lagged and rolling-mean columns for one feature, then back-fill NaNs.

    The frame is modified in place and the same object is returned.

    Parameters:
    - df: DataFrame to extend
    - feature_name: column the new columns are derived from; when it is not a
      column of df no column is added
    - lags: adds "<feature_name>_lag1" ... "<feature_name>_lag<lags>" (skipped when <= 0)
    - rolling_window: adds "<feature_name>_rolling<rolling_window>" holding the
      rolling mean over that many rows (skipped when <= 0)

    Returns:
    - df: the same DataFrame, with every column back-filled
    """
    feature_present = feature_name in df.columns

    # one shifted copy of the feature per lag step
    if feature_present and lags > 0:
        for shift_by in range(1, lags + 1):
            lag_column = f"{feature_name}_lag{shift_by}"
            df[lag_column] = df[feature_name].shift(shift_by)

    # a single rolling-mean column for the requested window
    if feature_present and rolling_window > 0:
        rolling_column = f"{feature_name}_rolling{rolling_window}"
        df[rolling_column] = df[feature_name].rolling(rolling_window).mean()

    # the back-fill covers the whole frame, whether or not columns were added
    df.bfill(inplace=True)

    return df

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scale all the variables (MinMaxScaler's default range is [0, 1]).
# Each group gets its own fitted scaler: refitting one shared instance would keep
# only the last fit, making the earlier transformations impossible to invert.
# `scaler` stays bound to the input-variable scaler because later cells read that name.
scaler = MinMaxScaler()
second_part_df[input_variables] = scaler.fit_transform(second_part_df[input_variables])

# Per-target scalers, keyed by column name, so predictions can be mapped back to original units.
target_scalers = {}
for target_variable in ['ICC [1/mL]', 'HNAC [1/mL]']:
    target_scalers[target_variable] = MinMaxScaler()
    # fit_transform returns an (n, 1) array; flatten it before assigning to a single column.
    second_part_df[target_variable] = (
        target_scalers[target_variable].fit_transform(second_part_df[[target_variable]]).ravel()
    )

In [None]:
# Covariates get p_i + q_i lagged copies, with the orders taken from the d=0 ARIMA search.
copy_df = second_part_df.copy()

for variable in input_variables:
    ar_order_i = fixed_d_results.loc[variable, 'AR_order']
    ma_order_i = fixed_d_results.loc[variable, 'MA_order']
    n_lags = ar_order_i + ma_order_i
    copy_df = extend_features(df=copy_df, feature_name=variable, lags=n_lags, rolling_window=0)

In [None]:
# Targets only get their autoregressive lags (AR order of the d=0 ARIMA search).
for variable in ['ICC [1/mL]', 'HNAC [1/mL]']:
    n_lags = fixed_d_results.loc[variable, 'AR_order']
    copy_df = extend_features(df=copy_df, feature_name=variable, lags=n_lags, rolling_window=0)

## ARX Models

In [None]:
from statsmodels.regression.linear_model import OLS

In [None]:
import statsmodels.api as sm

In [None]:
# ARX models fitted by OLS: one regression per (input variable, target) pair.
# Each cell of results_summaries_df stores the statsmodels summary of that fit.
results_summaries_df = pd.DataFrame(
    index=input_variables,
    columns=['ICC [1/mL]', 'HNAC [1/mL]']
)

for input_variable in input_variables:
    for target_variable in ['ICC [1/mL]', 'HNAC [1/mL]']:

        # Exogenous part: the covariate itself plus its lagged copies (matched by substring).
        exog_columns = [col for col in copy_df.columns if input_variable in col]

        # Autoregressive part: lagged copies of the target, without the target itself.
        # Selecting by name avoids an in-place drop on a slice of copy_df, which
        # raises pandas' SettingWithCopyWarning.
        ar_columns = [
            col for col in copy_df.columns
            if target_variable in col and col != target_variable
        ]

        # add constant
        exog_df = sm.add_constant(copy_df[exog_columns + ar_columns])

        model = OLS(endog=copy_df[target_variable], exog=exog_df)
        results = model.fit()

        results_summaries_df.loc[input_variable, target_variable] = results.summary()

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [None]:
# SARIMAX counterpart of the OLS fits: AR(2) on the target with one covariate
# (and its lagged copies) as exogenous regressors; the summary is stored per pair.
sarimax_results_df = pd.DataFrame(
    columns=['ICC [1/mL]', 'HNAC [1/mL]'],
    index=input_variables,
)

for input_variable in input_variables:

    # Columns belonging to this covariate are matched by substring.
    exog_columns = [col for col in copy_df.columns if input_variable in col]
    exog_df = copy_df[exog_columns]

    for target_variable in ['ICC [1/mL]', 'HNAC [1/mL]']:

        model = SARIMAX(endog=copy_df[target_variable], exog=exog_df, order=(2, 0, 0))
        results = model.fit()

        sarimax_results_df.loc[input_variable, target_variable] = results.summary()

In [None]:
# Show the SARIMAX and OLS summaries side by side for every (input, target) pair
banner = "="*100

for input_variable in input_variables:
    for target_variable in ['ICC [1/mL]', 'HNAC [1/mL]']:
        sarimax_summary = sarimax_results_df.loc[input_variable, target_variable]
        ols_summary = results_summaries_df.loc[input_variable, target_variable]

        print(banner)
        print(f"Results for {input_variable} and {target_variable}:")
        print(banner)
        print("===SARIMAX===")
        print(sarimax_summary)
        print("===OLS===")
        print(ols_summary)
    

## Granger Causality

In [None]:
from statsmodels.tsa.stattools import grangercausalitytests

In [None]:
# Granger causality test for each (input variable, target variable) pair.
# Each cell of granger_results_df holds the list of lags whose SSR F-test
# p-value is below 0.05; pairs that could not be tested stay NaN.

granger_results_df = pd.DataFrame(
    index=input_variables,
    columns=['ICC [1/mL]', 'HNAC [1/mL]']
)

for input_variable in input_variables:

    # Test up to p + q lags; cast to a plain int because the orders are stored
    # in an object-dtype frame.
    maxlag = int(
        fixed_d_results.loc[input_variable, 'AR_order']
        + fixed_d_results.loc[input_variable, 'MA_order']
    )

    # grangercausalitytests requires at least one lag; an ARIMA(0, 0, 0) series
    # would otherwise abort the whole loop.
    if maxlag < 1:
        print(f"Skipping Granger test for {input_variable}: p + q = {maxlag}")
        continue

    for target_variable in ['ICC [1/mL]', 'HNAC [1/mL]']:
        # Column order matters: the test checks whether the second column
        # (the input) Granger-causes the first one (the target).
        test_results = grangercausalitytests(x=copy_df[[target_variable, input_variable]], maxlag=maxlag)

        significant_lags = [
            lag for lag, outcome in test_results.items()
            if outcome[0]['ssr_ftest'][1] < 0.05
        ]

        # .at writes the list as a single cell value; .loc would treat it as a
        # list-like to broadcast over the selection.
        granger_results_df.at[input_variable, target_variable] = significant_lags

In [None]:
granger_results_df

In [None]:
# just the conductivity has no granger causality with the targets
# so we can remove it from the input variables
# The label must match the entry of input_variables exactly: the column is in
# 'uS/cm', so filtering on 'mS/cm' removed nothing.
input_variables = [var for var in input_variables if var != 'Conductivity [uS/cm]']

# Covariates Filtering

## Backward Feature Selection with AIC

In [None]:
def backward_feature_selection_aic(df, target_variable, input_variables_list, ar_order=2):
    """
    Perform backward feature selection based on AIC for a given target variable.

    Starting from every candidate regressor, each iteration fits an OLS model
    (with constant) and drops the single feature whose removal lowers the AIC
    the most. The search stops when no removal improves the AIC, when only one
    feature is left, or after more than 50 iterations.

    Parameters:
    - df: DataFrame containing all features
    - target_variable: String, name of the target variable
    - input_variables_list: List of input variable names (without lags)
    - ar_order: Integer, AR order for the target variable. Currently unused: the
      autoregressive terms are whatever derived target columns already exist in
      df. Kept so existing calls keep working.

    Returns:
    - selected_features: List of selected feature names
    - aic_scores: Dictionary with AIC scores at each step ("iteration_<k>" keys,
      plus "final" holding the last model's AIC and summary when it could be fitted)
    """

    # Candidate regressors: for each input variable, every column whose name
    # contains it (its lagged copies) followed by the variable itself.
    candidate_features = []
    for var in input_variables_list:
        candidate_features.extend(col for col in df.columns if var in col and col != var)
        candidate_features.append(var)

    # Autoregressive terms: columns derived from the target, never the target itself.
    candidate_features.extend(
        col for col in df.columns if target_variable in col and col != target_variable
    )

    # De-duplicate while keeping first-seen order. A set would reorder the
    # features differently on every kernel start (string hashing is randomized),
    # making the reported selection and tie-breaking non-reproducible.
    current_features = list(dict.fromkeys(candidate_features))

    # Initialize results
    aic_scores = {}
    iteration = 0

    print(f"\n=== Backward Feature Selection for {target_variable} ===")
    print(f"Starting with {len(current_features)} features")

    while len(current_features) > 1:
        # Fit model with current features
        exog_df = sm.add_constant(df[current_features].copy())

        try:
            results = OLS(endog=df[target_variable], exog=exog_df).fit()
            current_aic = results.aic

            print(f"\nIteration {iteration}: AIC = {current_aic:.4f} with {len(current_features)} features")
            aic_scores[f"iteration_{iteration}"] = {
                'features': current_features.copy(),
                'aic': current_aic,
                'n_features': len(current_features)
            }

            # Try removing each feature in turn and keep the removal that
            # lowers the AIC the most (strict improvement only).
            best_aic = current_aic
            feature_to_remove = None

            for feature in current_features:
                # At least two features remain here, so the reduced set is never empty.
                temp_features = [f for f in current_features if f != feature]
                temp_exog = sm.add_constant(df[temp_features].copy())

                try:
                    temp_aic = OLS(endog=df[target_variable], exog=temp_exog).fit().aic
                except Exception as e:
                    # A failing candidate fit only disqualifies that candidate.
                    print(f"Warning: Could not fit model without {feature}: {e}")
                    continue

                if temp_aic < best_aic:
                    best_aic = temp_aic
                    feature_to_remove = feature

            if feature_to_remove is None:
                print("No feature removal improves AIC. Stopping.")
                break

            current_features.remove(feature_to_remove)
            print(f"Removed {feature_to_remove}, new AIC: {best_aic:.4f}")

            iteration += 1

            # Safety check to avoid infinite loops
            if iteration > 50:
                print("Reached maximum iterations (50). Stopping.")
                break

        except Exception as e:
            print(f"Error in iteration {iteration}: {e}")
            break

    # Final model, refitted on the surviving features
    if len(current_features) > 0:
        final_exog = sm.add_constant(df[current_features].copy())

        try:
            final_results = OLS(endog=df[target_variable], exog=final_exog).fit()
            final_aic = final_results.aic

            aic_scores["final"] = {
                'features': current_features.copy(),
                'aic': final_aic,
                'n_features': len(current_features),
                'model_summary': final_results.summary()
            }

            print(f"\nFinal model: AIC = {final_aic:.4f} with {len(current_features)} features")
            print(f"Selected features: {current_features}")

        except Exception as e:
            print(f"Error fitting final model: {e}")

    return current_features, aic_scores


In [None]:
# Run the backward feature selection once per target and collect the outcomes
bfs_results = {}

# --- ICC [1/mL] ---
print("Starting backward feature selection for ICC [1/mL]...")
icc_ar_order = 2  # default value; not extracted from the ARIMA results
selected_features_icc, aic_scores_icc = backward_feature_selection_aic(
    copy_df,
    'ICC [1/mL]',
    input_variables,
    ar_order=icc_ar_order,
)
bfs_results['ICC [1/mL]'] = dict(
    selected_features=selected_features_icc,
    aic_scores=aic_scores_icc,
)

# --- HNAC [1/mL] ---
print("\n" + "="*80)
print("Starting backward feature selection for HNAC [1/mL]...")
hnac_ar_order = 2  # default value; not extracted from the ARIMA results
selected_features_hnac, aic_scores_hnac = backward_feature_selection_aic(
    copy_df,
    'HNAC [1/mL]',
    input_variables,
    ar_order=hnac_ar_order,
)
bfs_results['HNAC [1/mL]'] = dict(
    selected_features=selected_features_hnac,
    aic_scores=aic_scores_hnac,
)


In [None]:
# Display results summary: selected features, final model and AIC progression per target
print("\n" + "="*100)
print("BACKWARD FEATURE SELECTION RESULTS SUMMARY")
print("="*100)

for target in ['ICC [1/mL]', 'HNAC [1/mL]']:
    print(f"\n{target}:")
    print("-" * 50)

    selected_features = bfs_results[target]['selected_features']
    aic_scores = bfs_results[target]['aic_scores']

    print(f"Number of selected features: {len(selected_features)}")
    print(f"Selected features: {selected_features}")

    # The 'final' entry is only present when the final model could be fitted.
    if 'final' in aic_scores:
        final_aic = aic_scores['final']['aic']
        print(f"Final AIC: {final_aic:.4f}")

        # Display the final model summary
        print(f"\nFinal Model Summary for {target}:")
        print(aic_scores['final']['model_summary'])

    print("\nAIC progression:")
    for step_key, step_data in aic_scores.items():
        if step_key != 'final':
            print(f"  {step_key}: AIC = {step_data['aic']:.4f}, Features = {step_data['n_features']}")

    # A real newline: the escaped "\\n" used before printed a literal backslash-n.
    print("\n" + "="*50)


In [None]:
# Tabulate the backward-selection outcome: one row per target
bfs_targets = ['ICC [1/mL]', 'HNAC [1/mL]']

bfs_summary = pd.DataFrame({
    'Target_Variable': bfs_targets,
    'Selected_Features_Count': [
        len(bfs_results[name]['selected_features']) for name in bfs_targets
    ],
    # None when the final model could not be fitted for that target
    'Final_AIC': [
        bfs_results[name]['aic_scores']['final']['aic']
        if 'final' in bfs_results[name]['aic_scores'] else None
        for name in bfs_targets
    ],
    'Selected_Features': [
        ', '.join(bfs_results[name]['selected_features']) for name in bfs_targets
    ],
})

print("\nBackward Feature Selection Summary:")
print(bfs_summary.to_string(index=False, max_colwidth=80))


## Lasso

In [None]:
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')


In [None]:
def lasso_feature_selection(df, target_variable, input_variables_list, alpha_values=None, cv_folds=5):
    """
    Perform Lasso feature selection for a given target variable.

    The regularization strength is chosen by cross-validation (LassoCV), a Lasso
    model is then fitted with that alpha on the same rows, and the features with
    a non-zero coefficient are reported as selected. Performance metrics are
    computed in-sample. No scaling is applied inside this function.

    Parameters:
    - df: DataFrame containing all features
    - target_variable: String, name of the target variable
    - input_variables_list: List of input variable names (without lags)
    - alpha_values: List of alpha values for cross-validation. If None, uses
      50 log-spaced values between 1e-4 and 1e2
    - cv_folds: Number of cross-validation folds

    Returns:
    - selected_features: List of selected feature names
    - lasso_results: Dictionary with Lasso results and performance metrics.
      'alpha_values' and 'cv_scores' are aligned element by element.
    """

    # Candidate regressors: for each input variable, every column whose name
    # contains it (its lagged copies) followed by the variable itself.
    candidate_features = []
    for var in input_variables_list:
        candidate_features.extend(col for col in df.columns if var in col and col != var)
        candidate_features.append(var)

    # Autoregressive terms: columns derived from the target, never the target itself.
    candidate_features.extend(
        col for col in df.columns if target_variable in col and col != target_variable
    )

    # De-duplicate while keeping first-seen order. A set would reorder the
    # columns differently on every kernel start (string hashing is randomized).
    all_features = list(dict.fromkeys(candidate_features))

    # Prepare data
    X = df[all_features].copy()
    y = df[target_variable].copy()

    # Remove any rows with NaN values
    mask = ~(X.isnull().any(axis=1) | y.isnull())
    X = X[mask]
    y = y[mask]

    print(f"\n=== Lasso Feature Selection for {target_variable} ===")
    print(f"Dataset shape: {X.shape}")
    print(f"Starting with {X.shape[1]} features")

    # Set alpha values for cross-validation if not provided
    if alpha_values is None:
        alpha_values = np.logspace(-4, 2, 50)

    # Perform cross-validation to find optimal alpha
    print("Performing cross-validation to find optimal alpha...")
    lasso_cv = LassoCV(alphas=alpha_values, cv=cv_folds, random_state=42, max_iter=10000)
    lasso_cv.fit(X, y)

    optimal_alpha = lasso_cv.alpha_
    print(f"Optimal alpha: {optimal_alpha:.6f}")

    # Fit Lasso with optimal alpha
    lasso = Lasso(alpha=optimal_alpha, random_state=42, max_iter=10000)
    lasso.fit(X, y)

    # Get selected features (non-zero coefficients)
    selected_mask = lasso.coef_ != 0
    selected_features = [name for name, keep in zip(all_features, selected_mask) if keep]
    selected_coefficients = lasso.coef_[selected_mask]

    print(f"Number of selected features: {len(selected_features)}")
    print(f"Selected features: {selected_features}")

    # Calculate performance metrics (in-sample: same rows used for fitting)
    y_pred = lasso.predict(X)
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    rmse = np.sqrt(mse)

    print(f"\nModel Performance:")
    print(f"R² Score: {r2:.6f}")
    print(f"MSE: {mse:.6f}")
    print(f"RMSE: {rmse:.6f}")

    # Create results dictionary
    lasso_results = {
        'optimal_alpha': optimal_alpha,
        'selected_features': selected_features,
        'selected_coefficients': selected_coefficients,
        'feature_names': all_features,
        'all_coefficients': lasso.coef_,
        'r2_score': r2,
        'mse': mse,
        'rmse': rmse,
        'cv_scores': lasso_cv.mse_path_.mean(axis=1),
        # mse_path_ rows follow LassoCV's own grid (alphas_, sorted in decreasing
        # order), not the order of the input grid; returning alphas_ keeps the
        # two arrays aligned so the CV curve is not plotted mirrored.
        'alpha_values': lasso_cv.alphas_,
        # No scaler is fitted here; the key is kept for existing consumers and
        # carries the notebook-level `scaler` when one exists (None otherwise)
        # instead of raising NameError.
        'scaler': globals().get('scaler'),
        'model': lasso
    }

    return selected_features, lasso_results


In [None]:
# Run the Lasso feature selection once per target and collect the outcomes
lasso_results = {}

# --- ICC [1/mL] ---
print("Starting Lasso feature selection for ICC [1/mL]...")
selected_features_icc_lasso, lasso_scores_icc = lasso_feature_selection(
    copy_df,
    'ICC [1/mL]',
    input_variables,
    cv_folds=5,
)
lasso_results['ICC [1/mL]'] = dict(
    selected_features=selected_features_icc_lasso,
    lasso_scores=lasso_scores_icc,
)

# --- HNAC [1/mL] ---
print("\n" + "="*80)
print("Starting Lasso feature selection for HNAC [1/mL]...")
selected_features_hnac_lasso, lasso_scores_hnac = lasso_feature_selection(
    copy_df,
    'HNAC [1/mL]',
    input_variables,
    cv_folds=5,
)
lasso_results['HNAC [1/mL]'] = dict(
    selected_features=selected_features_hnac_lasso,
    lasso_scores=lasso_scores_hnac,
)


In [None]:
# Print, per target, the Lasso selection, its metrics and the surviving coefficients
print("\n" + "="*100)
print("LASSO FEATURE SELECTION RESULTS SUMMARY")
print("="*100)

for target in ['ICC [1/mL]', 'HNAC [1/mL]']:
    selected_features = lasso_results[target]['selected_features']
    lasso_scores = lasso_results[target]['lasso_scores']

    print(f"\n{target}:")
    print("-" * 50)

    print(f"Number of selected features: {len(selected_features)}")
    print(f"Selected features: {selected_features}")
    print(f"Optimal alpha: {lasso_scores['optimal_alpha']:.6f}")
    print(f"R² Score: {lasso_scores['r2_score']:.6f}")
    print(f"MSE: {lasso_scores['mse']:.6f}")
    print(f"RMSE: {lasso_scores['rmse']:.6f}")

    # Numbered list of the selected features with their coefficients
    print(f"\nSelected Features and Coefficients:")
    feature_coef_pairs = zip(selected_features, lasso_scores['selected_coefficients'])
    for rank, (feature, coef) in enumerate(feature_coef_pairs, start=1):
        print(f"  {rank:2d}. {feature:30s} = {coef:10.6f}")

    print("\n" + "="*50)


In [None]:
# One panel per target: mean cross-validation MSE against alpha, with the chosen alpha marked
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

for i, target in enumerate(['ICC [1/mL]', 'HNAC [1/mL]']):
    lasso_scores = lasso_results[target]['lasso_scores']
    panel = axes[i]
    best_alpha = lasso_scores['optimal_alpha']

    # Log-scaled x axis because the alpha grid spans several orders of magnitude
    panel.semilogx(lasso_scores['alpha_values'], lasso_scores['cv_scores'])
    panel.axvline(x=best_alpha, color='red', linestyle='--',
                  label=f'Optimal α = {best_alpha:.6f}')
    panel.set_xlabel('Alpha (Regularization Parameter)')
    panel.set_ylabel('Cross-Validation MSE')
    panel.set_title(f'Lasso CV Results - {target}')
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Closing notes, printed line by line
final_summary_lines = [
    "\n" + "="*100,
    "FINAL SUMMARY",
    "="*100,
    "The Lasso feature selection automatically determines the optimal regularization",
    "parameter (alpha) through cross-validation and selects features by driving",
    "less important coefficients to zero.",
    "\nKey advantages of Lasso vs Backward Feature Selection:",
    "- Automatic regularization parameter tuning",
    "- Handles multicollinearity better",
    "- Computationally more efficient for large feature sets",
    "- Provides coefficient magnitudes for feature importance ranking",
    "\nBoth methods can be used complementary to validate feature importance.",
]
for summary_line in final_summary_lines:
    print(summary_line)

In [None]:
# Create comparison DataFrame between BFS and Lasso results (one row per target)
comparison_targets = ['ICC [1/mL]', 'HNAC [1/mL]']

comparison_results = pd.DataFrame({
    'Target_Variable': comparison_targets,
    'BFS_Features_Count': [
        len(bfs_results[name]['selected_features']) for name in comparison_targets
    ],
    # The 'final' entry is missing when the last BFS model could not be fitted;
    # report None in that case (as the BFS summary table does) instead of raising KeyError.
    'BFS_Final_AIC': [
        bfs_results[name]['aic_scores'].get('final', {}).get('aic')
        for name in comparison_targets
    ],
    'Lasso_Features_Count': [
        len(lasso_results[name]['selected_features']) for name in comparison_targets
    ],
    'Lasso_R2_Score': [
        lasso_results[name]['lasso_scores']['r2_score'] for name in comparison_targets
    ],
    'Lasso_RMSE': [
        lasso_results[name]['lasso_scores']['rmse'] for name in comparison_targets
    ],
    'Lasso_Alpha': [
        lasso_results[name]['lasso_scores']['optimal_alpha'] for name in comparison_targets
    ],
})

print("\n" + "="*100)
print("COMPARISON: BACKWARD FEATURE SELECTION vs LASSO")
print("="*100)
print(comparison_results.to_string(index=False))


In [None]:
# Compare, per target, which features each method kept and how much they agree
print("\n" + "="*100)
print("FEATURE OVERLAP ANALYSIS")
print("="*100)

for target in ['ICC [1/mL]', 'HNAC [1/mL]']:
    bfs_features = set(bfs_results[target]['selected_features'])
    lasso_features = set(lasso_results[target]['selected_features'])

    # Set algebra: shared features and those exclusive to one method
    overlap = bfs_features & lasso_features
    bfs_only = bfs_features - lasso_features
    lasso_only = lasso_features - bfs_features

    print(f"\n{target}:")
    print("-" * 50)

    print(f"BFS selected features ({len(bfs_features)}): {sorted(bfs_features)}")
    print(f"Lasso selected features ({len(lasso_features)}): {sorted(lasso_features)}")
    print(f"\nOverlapping features ({len(overlap)}): {sorted(overlap)}")
    print(f"BFS only ({len(bfs_only)}): {sorted(bfs_only)}")
    print(f"Lasso only ({len(lasso_only)}): {sorted(lasso_only)}")

    # Percentages are only defined when both methods selected something
    if bfs_features and lasso_features:
        overlap_pct_bfs = len(overlap) / len(bfs_features) * 100
        overlap_pct_lasso = len(overlap) / len(lasso_features) * 100
        print(f"\nOverlap percentage (relative to BFS): {overlap_pct_bfs:.1f}%")
        print(f"Overlap percentage (relative to Lasso): {overlap_pct_lasso:.1f}%")

    print("\n" + "="*50)


In [None]:
# Build one summary row per target combining the BFS outcome, the Lasso outcome and their overlap
print("\n" + "="*100)
print("CREATING COMPREHENSIVE SUMMARY DATAFRAME")
print("="*100)

summary_data = []

for target in ['ICC [1/mL]', 'HNAC [1/mL]']:
    # Backward selection side
    bfs_features = bfs_results[target]['selected_features']
    bfs_aic = bfs_results[target]['aic_scores']['final']['aic']
    bfs_n_features = len(bfs_features)

    # Lasso side
    lasso_features = lasso_results[target]['selected_features']
    lasso_scores = lasso_results[target]['lasso_scores']
    lasso_n_features = len(lasso_features)

    # Agreement between the two selections
    bfs_set = set(bfs_features)
    lasso_set = set(lasso_features)
    overlap = bfs_set & lasso_set
    bfs_only = bfs_set - lasso_set
    lasso_only = lasso_set - bfs_set
    overlap_count = len(overlap)

    # Percentages fall back to 0 when a method selected nothing
    overlap_pct_bfs = 0
    if bfs_n_features > 0:
        overlap_pct_bfs = overlap_count / bfs_n_features * 100
    overlap_pct_lasso = 0
    if lasso_n_features > 0:
        overlap_pct_lasso = overlap_count / lasso_n_features * 100

    summary_row = {
        'Target_Variable': target,
        'BFS_Features_Count': bfs_n_features,
        'BFS_Final_AIC': round(bfs_aic, 4),
        'BFS_Selected_Features': '; '.join(sorted(bfs_features)),
        'Lasso_Features_Count': lasso_n_features,
        'Lasso_Optimal_Alpha': round(lasso_scores['optimal_alpha'], 6),
        'Lasso_R2_Score': round(lasso_scores['r2_score'], 6),
        'Lasso_RMSE': round(lasso_scores['rmse'], 4),
        'Lasso_Selected_Features': '; '.join(sorted(lasso_features)),
        'Overlap_Count': overlap_count,
        'Overlap_Pct_vs_BFS': round(overlap_pct_bfs, 1),
        'Overlap_Pct_vs_Lasso': round(overlap_pct_lasso, 1),
        'Common_Features': '; '.join(sorted(overlap)) if overlap else 'None',
        'BFS_Only_Features': '; '.join(sorted(bfs_only)) if bfs_only else 'None',
        'Lasso_Only_Features': '; '.join(sorted(lasso_only)) if lasso_only else 'None',
    }
    summary_data.append(summary_row)

# Assemble the rows into a DataFrame
comprehensive_summary = pd.DataFrame(summary_data)

print("Comprehensive summary DataFrame created successfully!")
print(f"Shape: {comprehensive_summary.shape}")
print("\nColumns included:")
for position, column_name in enumerate(comprehensive_summary.columns, 1):
    print(f"{position:2d}. {column_name}")

comprehensive_summary


In [None]:
# Per-feature view: which method selected it for which target, and with what Lasso coefficient
print("\n" + "="*80)
print("CREATING DETAILED FEATURE-LEVEL ANALYSIS")
print("="*80)

# Union of every feature picked by either method for either target
all_features_set = set()
for target in ['ICC [1/mL]', 'HNAC [1/mL]']:
    for method_results in (bfs_results, lasso_results):
        all_features_set.update(method_results[target]['selected_features'])

all_features_list = sorted(all_features_set)

feature_summary_data = []

for feature in all_features_list:
    # Lagged columns are named "<base>_lag<k>"; anything else counts as an original column
    if '_lag' in feature:
        name_parts = feature.split('_lag')
        base_var = name_parts[0]
        lag_num = name_parts[1]
        feature_type = f"Lagged ({lag_num})"
    else:
        base_var = feature
        feature_type = "Original"

    feature_info = {
        'Feature_Name': feature,
        'Base_Variable': base_var,
        'Feature_Type': feature_type,
    }

    # Count how many (method, target) combinations selected this feature
    total_selections = 0

    for target in ['ICC [1/mL]', 'HNAC [1/mL]']:
        target_short = target.replace(' [1/mL]', '').replace(' ', '_')

        bfs_selected = feature in bfs_results[target]['selected_features']
        feature_info[f'{target_short}_BFS_Selected'] = 'Yes' if bfs_selected else 'No'

        lasso_selected = feature in lasso_results[target]['selected_features']
        feature_info[f'{target_short}_Lasso_Selected'] = 'Yes' if lasso_selected else 'No'

        # Coefficient is 0.0 unless Lasso kept the feature for this target
        coefficient = 0.0
        if lasso_selected:
            lasso_features = lasso_results[target]['lasso_scores']['selected_features']
            lasso_coefs = lasso_results[target]['lasso_scores']['selected_coefficients']
            if feature in lasso_features:
                coef_idx = lasso_features.index(feature)
                coefficient = round(lasso_coefs[coef_idx], 6)
        feature_info[f'{target_short}_Lasso_Coefficient'] = coefficient

        total_selections += int(bfs_selected) + int(lasso_selected)

    # Consistency label: 3-4 selections High, exactly 2 Medium, otherwise Low
    if total_selections >= 3:
        consistency_label = 'High'
    elif total_selections == 2:
        consistency_label = 'Medium'
    else:
        consistency_label = 'Low'

    feature_info['Total_Selections'] = total_selections
    feature_info['Selection_Consistency'] = consistency_label

    feature_summary_data.append(feature_info)

# Assemble the per-feature rows into a DataFrame
detailed_feature_summary = pd.DataFrame(feature_summary_data)

print(f"Detailed feature summary created with {len(detailed_feature_summary)} features")
print("\nFeature selection consistency summary:")
consistency_counts = detailed_feature_summary['Selection_Consistency'].value_counts()
for consistency, count in consistency_counts.items():
    print(f"  {consistency} consistency: {count} features")

detailed_feature_summary


In [None]:
%%script false --no-raise-error
# NOTE: the cell magic on the first line hands this cell to the external `false`
# command instead of the Python kernel, so none of the code below is executed
# when the notebook runs. Remove that line to re-enable the export.
#
# Save all summary data to Excel files
# Writes four sheets into one timestamped workbook in the current working directory:
#   Summary_by_Target, Detailed_Feature_Analysis, BFS_Results, Lasso_Results.
# Relies on `comprehensive_summary`, `detailed_feature_summary`, `bfs_results`
# and `lasso_results` having been created by earlier cells.
print("\n" + "="*80)
print("SAVING RESULTS TO EXCEL FILES")
print("="*80)

# Create output filename with timestamp
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"feature_selection_results_{timestamp}.xlsx"

# Best-effort export: any failure is reported and the notebook keeps going.
# NOTE(review): because the frames are built inside the `with` block, a failure
# half-way through can leave a workbook containing only the sheets written so far.
try:
    with pd.ExcelWriter(output_filename, engine='openpyxl') as writer:
        # Save comprehensive summary
        comprehensive_summary.to_excel(writer, sheet_name='Summary_by_Target', index=False)
        
        # Save detailed feature analysis
        detailed_feature_summary.to_excel(writer, sheet_name='Detailed_Feature_Analysis', index=False)
        
        # Save BFS specific results
        # One row per target; 'Iterations' counts the 'iteration_*' keys recorded in aic_scores.
        bfs_only_summary = pd.DataFrame([
            {
                'Target': target,
                'Selected_Features': '; '.join(bfs_results[target]['selected_features']),
                'Feature_Count': len(bfs_results[target]['selected_features']),
                'Final_AIC': bfs_results[target]['aic_scores']['final']['aic'],
                'Iterations': len([k for k in bfs_results[target]['aic_scores'].keys() if k.startswith('iteration')])
            }
            for target in ['ICC [1/mL]', 'HNAC [1/mL]']
        ])
        bfs_only_summary.to_excel(writer, sheet_name='BFS_Results', index=False)
        
        # Save Lasso specific results
        # One row per target with the chosen alpha and the stored fit metrics.
        lasso_only_summary = pd.DataFrame([
            {
                'Target': target,
                'Selected_Features': '; '.join(lasso_results[target]['selected_features']),
                'Feature_Count': len(lasso_results[target]['selected_features']),
                'Optimal_Alpha': lasso_results[target]['lasso_scores']['optimal_alpha'],
                'R2_Score': lasso_results[target]['lasso_scores']['r2_score'],
                'RMSE': lasso_results[target]['lasso_scores']['rmse']
            }
            for target in ['ICC [1/mL]', 'HNAC [1/mL]']
        ])
        lasso_only_summary.to_excel(writer, sheet_name='Lasso_Results', index=False)
        
    print(f"✓ Results saved to: {output_filename}")
    
except Exception as e:
    # Deliberately broad: a failed export must not stop the analysis.
    print(f"✗ Error saving to Excel: {e}")
    print("Continuing with in-memory analysis...")


# Train-Test Split Analysis

Now we perform the same feature selection analysis with a proper train-test split (60-40) to evaluate model generalization performance. The process follows these steps:

1. **Data Splitting**: 60% training, 40% testing
2. **Scaling**: Fit on training set, apply to test set
3. **Granger Causality**: Performed on training set only
4. **Backward Feature Selection**: Features are selected and the final model is fitted on the training set only; the test set is used solely to evaluate the selected model
5. **Lasso**: Cross-validation on training set, evaluation on test set

## Data Splitting 

In [None]:
# Split the dataset into train and test sets (60-40), keeping chronological order
from sklearn.model_selection import train_test_split  # NOTE(review): unused here (the split below is a manual chronological slice); kept in case other cells rely on the name

print("="*80)
print("TRAIN-TEST SPLIT")
print("="*80)

# Calculate split index for time series (chronological split)
n_samples = len(copy_df)
train_size = int(0.6 * n_samples)

# Chronological split (important for time series data): no shuffling,
# the first 60% of the rows train the models and the remaining 40% test them.
train_df = copy_df.iloc[:train_size].copy()
test_df = copy_df.iloc[train_size:].copy()

# Fail early with a clear message instead of an IndexError / ZeroDivisionError below
if train_df.empty or test_df.empty:
    raise ValueError(
        f"Cannot build a 60-40 train-test split from {n_samples} samples "
        f"(train={len(train_df)}, test={len(test_df)})"
    )

print(f"Original dataset size: {n_samples}")
print(f"Training set size: {len(train_df)} ({len(train_df)/n_samples*100:.1f}%)")
print(f"Test set size: {len(test_df)} ({len(test_df)/n_samples*100:.1f}%)")

# Report the time span covered by each split. Dates are taken from a
# DatetimeIndex when there is one, otherwise from the first column whose name
# contains "date"; only when neither exists are plain index labels shown.
date_columns = [col for col in copy_df.columns if 'date' in str(col).lower()]
if isinstance(copy_df.index, pd.DatetimeIndex):
    print(f"Training period: {train_df.index[0]} to {train_df.index[-1]}")
    print(f"Test period: {test_df.index[0]} to {test_df.index[-1]}")
elif date_columns:
    date_col = date_columns[0]
    print(f"Training period: {train_df[date_col].iloc[0]} to {train_df[date_col].iloc[-1]}")
    print(f"Test period: {test_df[date_col].iloc[0]} to {test_df[date_col].iloc[-1]}")
else:
    print(f"Training indices: {train_df.index[0]} to {train_df.index[-1]}")
    print(f"Test indices: {test_df.index[0]} to {test_df.index[-1]}")

# Compare the target distributions of the two splits: a large gap in mean or
# spread signals a distribution shift between the training and test periods.
print("\nTarget variables statistics:")
for target in target_variables:
    train_mean = train_df[target].mean()
    test_mean = test_df[target].mean()
    train_std = train_df[target].std()
    test_std = test_df[target].std()

    print(f"  {target}:")
    print(f"    Train: μ={train_mean:.4f}, σ={train_std:.4f}")
    print(f"    Test:  μ={test_mean:.4f}, σ={test_std:.4f}")
    print(f"    Difference in means: {abs(train_mean - test_mean):.4f}")

print("="*80)


## Granger Causality on Training Set

In [None]:
# Perform Granger causality test on training set only
# Each cell of the result table holds the list of lags (1..maxlag) at which the
# input variable Granger-causes the target according to the SSR F-test.
granger_targets = ['ICC [1/mL]', 'HNAC [1/mL]']
granger_results_train_df = pd.DataFrame(
    index=input_variables,
    columns=granger_targets,
    dtype=object  # cells store Python lists
)

# Use a safe default maxlag value
maxlag = 5  # Default reasonable lag for Granger causality test
significance_level = 0.05  # p-value threshold of the SSR F-test

print("="*80)
print("GRANGER CAUSALITY TEST - TRAINING SET")
print("="*80)

for input_variable in input_variables:
    for target_variable in granger_targets:
        print(f"\nTesting {input_variable} -> {target_variable} (max lag: {maxlag})")

        try:
            # Column order matters: the test asks whether the SECOND column
            # helps to predict the FIRST one.
            test_results = grangercausalitytests(
                x=train_df[[target_variable, input_variable]],
                maxlag=maxlag,
                verbose=False
            )

            # p-value of the SSR F-test for every tested lag (computed once)
            p_value_by_lag = {
                lag: lag_result[0]['ssr_ftest'][1]
                for lag, lag_result in test_results.items()
            }

            # Get significant lags (p < 0.05)
            significant_lags = [
                lag for lag, p_value in p_value_by_lag.items()
                if p_value < significance_level
            ]

            # .at is the scalar accessor: it stores the list as ONE cell value
            # instead of trying to align it with the frame like .loc may do.
            granger_results_train_df.at[input_variable, target_variable] = significant_lags

            if significant_lags:
                p_values = [p_value_by_lag[lag] for lag in significant_lags]
                print(f"  ✓ Significant lags: {significant_lags}")
                print(f"  ✓ P-values: {[f'{p:.4f}' for p in p_values]}")
            else:
                print("  ✗ No significant Granger causality")

        except Exception as e:
            # Best effort: a failed test is reported and recorded as "no significant lag"
            print(f"  ⚠ Error testing {input_variable} -> {target_variable}: {e}")
            granger_results_train_df.at[input_variable, target_variable] = []

print("\n" + "="*80)
print("GRANGER CAUSALITY RESULTS - TRAINING SET")
print("="*80)


In [None]:
# Show the table of significant lags found on the training set
print(granger_results_train_df)

# Keep an input variable as soon as it has at least one significant lag
# for at least one of the two targets.
input_variables_train = [
    var
    for var in input_variables
    if any(
        len(granger_results_train_df.loc[var, target]) > 0
        for target in ['ICC [1/mL]', 'HNAC [1/mL]']
    )
]

print(f"\nVariables with Granger causality (training set): {len(input_variables_train)}")
print(f"Selected variables: {input_variables_train}")

# Everything that was not kept shows no Granger causality towards any target
kept_variables = set(input_variables_train)
removed_vars = [var for var in input_variables if var not in kept_variables]
if not removed_vars:
    print("All variables show Granger causality relationship")
else:
    print(f"Removed variables (no Granger causality): {removed_vars}")


## Backward Feature Selection with Train-Test Split

In [None]:
def backward_feature_selection_train_test(train_df, test_df, target_variable, input_variables_list, ar_order=2, max_iterations=50):
    """
    Backward feature elimination driven by training-set AIC, evaluated on the test set.

    Starting from every candidate column, the function repeatedly drops the single
    feature whose removal lowers the OLS AIC the most. It stops when no removal
    lowers the AIC, when only one feature is left, or after more than
    ``max_iterations`` removals. The final OLS model is fitted on ``train_df``
    and scored on ``test_df``.

    Candidate columns are found by substring matching on ``train_df.columns``:
    each input variable itself, every column whose name contains an input
    variable name, and every column whose name contains ``target_variable``
    (its lagged versions). The target column itself is never used as a feature.
    NOTE(review): substring matching can pick up unrelated columns when one
    variable name is contained in another — confirm against the lag naming scheme.

    Parameters:
    - train_df: Training DataFrame (used for selection and fitting)
    - test_df: Test DataFrame (used only for the final evaluation)
    - target_variable: String, name of the target variable
    - input_variables_list: List of input variable names (without lags)
    - ar_order: Integer, AR order for the target variable. Currently unused:
      every target-lag column present in ``train_df`` is a candidate.
    - max_iterations: Integer, safety cap on the number of removal iterations

    Returns:
    - selected_features: List of selected feature names
    - results: Dictionary. On success it has the keys 'selected_features',
      'train_aic', 'test_mse', 'test_r2', 'test_rmse', 'aic_progression' and
      'train_model'. When there is nothing to fit, or the final fit fails, it
      only has 'selected_features' and 'error'.

    Raises:
    - KeyError: if a candidate feature is missing from ``train_df`` or ``test_df``
    """

    def _design_matrix(frame, features):
        # has_constant='add' always inserts the intercept column. The default
        # ('skip') silently omits it when some column of `frame` is constant,
        # which would give train and test design matrices different shapes.
        return sm.add_constant(frame[features].copy(), has_constant='add')

    def _fit_on_train(features):
        # OLS of the target on the given features, training rows only
        return OLS(endog=train_df[target_variable], exog=_design_matrix(train_df, features)).fit()

    # Input variables and their lagged versions
    candidate_features = []
    for var in input_variables_list:
        candidate_features.extend(col for col in train_df.columns if var in col and col != var)
        candidate_features.append(var)  # Include the original variable too

    # Lagged versions of the target (autoregressive terms)
    candidate_features.extend(
        col for col in train_df.columns if target_variable in col and col != target_variable
    )

    # dict.fromkeys removes duplicates while keeping a deterministic order
    # (a set would reorder the features from run to run); dropping the target
    # itself guards against leaking it into the regressors.
    current_features = [f for f in dict.fromkeys(candidate_features) if f != target_variable]

    missing_features = [
        f for f in current_features
        if f not in train_df.columns or f not in test_df.columns
    ]
    if missing_features:
        raise KeyError(f"Features missing from train_df or test_df: {missing_features}")

    # Initialize results
    aic_scores = {}
    iteration = 0

    print(f"\n=== Backward Feature Selection for {target_variable} (Train-Only) ===")
    print(f"Training set: {len(train_df)} samples")
    print(f"Test set: {len(test_df)} samples")
    print(f"Starting with {len(current_features)} features")
    print("Feature selection performed on training set only")

    while len(current_features) > 1:
        try:
            # Reference model with all features still in play
            current_aic = _fit_on_train(current_features).aic

            print(f"\nIteration {iteration}: AIC = {current_aic:.4f} with {len(current_features)} features")
            aic_scores[f"iteration_{iteration}"] = {
                'features': current_features.copy(),
                'aic': current_aic,
                'n_features': len(current_features)
            }

            # Try removing each feature in turn; remember the removal that
            # lowers the AIC the most (strictly below the reference AIC).
            best_aic = current_aic
            feature_to_remove = None

            for feature in current_features:
                remaining = [f for f in current_features if f != feature]
                try:
                    candidate_aic = _fit_on_train(remaining).aic
                except Exception as e:
                    print(f"Warning: Could not fit model without {feature}: {e}")
                    continue

                if candidate_aic < best_aic:
                    best_aic = candidate_aic
                    feature_to_remove = feature

            if feature_to_remove is None:
                print("No feature removal improves AIC. Stopping.")
                break

            current_features.remove(feature_to_remove)
            print(f"Removed {feature_to_remove}, new AIC: {best_aic:.4f}")

            iteration += 1

            # Safety check to avoid infinite loops
            if iteration > max_iterations:
                print(f"Reached maximum iterations ({max_iterations}). Stopping.")
                break

        except Exception as e:
            print(f"Error in iteration {iteration}: {e}")
            break

    # Nothing to fit: return an explicit error record instead of failing on an
    # unbound result variable.
    if not current_features:
        message = "No candidate features available for backward feature selection"
        print(message)
        return current_features, {'selected_features': [], 'error': message}

    # Imported here so the function does not depend on a later notebook cell
    # having been run first (NOTE(review): redundant if an earlier cell already
    # imports these names).
    from sklearn.metrics import mean_squared_error, r2_score

    try:
        # Fit the final model on the training rows only
        train_results = _fit_on_train(current_features)
        train_aic = train_results.aic

        # Score the train-fitted model on the held-out rows
        test_pred = train_results.predict(_design_matrix(test_df, current_features))
        test_mse = mean_squared_error(test_df[target_variable], test_pred)
        test_r2 = r2_score(test_df[target_variable], test_pred)

        results_dict = {
            'selected_features': current_features.copy(),
            'train_aic': train_aic,
            'test_mse': test_mse,
            'test_r2': test_r2,
            'test_rmse': np.sqrt(test_mse),
            'aic_progression': aic_scores,
            'train_model': train_results
        }

        print("\nFinal model performance:")
        print(f"  Train AIC: {train_aic:.4f}")
        print(f"  Test R²: {test_r2:.6f}")
        print(f"  Test RMSE: {np.sqrt(test_mse):.6f}")
        print(f"  Selected features ({len(current_features)}): {current_features}")

    except Exception as e:
        print(f"Error fitting final model: {e}")
        results_dict = {
            'selected_features': current_features.copy(),
            'error': str(e)
        }

    return current_features, results_dict


In [None]:
# Run the train/test backward feature selection once per target
bfs_train_test_results = {}

# Use original input_variables list for consistency
# (the Granger-filtered list is deliberately not applied here)
input_vars_to_use = input_variables  # Use all original variables for now

for run_number, bfs_target in enumerate(['ICC [1/mL]', 'HNAC [1/mL]']):
    if run_number > 0:
        # Visual separator between the logs of consecutive targets
        print("\n" + "="*80)
    print(f"Starting backward feature selection for {bfs_target} (Train-Test)...")

    bfs_selected, bfs_scores = backward_feature_selection_train_test(
        train_df=train_df,
        test_df=test_df,
        target_variable=bfs_target,
        input_variables_list=input_vars_to_use,
        ar_order=2
    )
    bfs_train_test_results[bfs_target] = {
        'selected_features': bfs_selected,
        'results': bfs_scores
    }

# Per-target names, kept so that other cells can keep referring to them
selected_features_icc_tt = bfs_train_test_results['ICC [1/mL]']['selected_features']
bfs_scores_icc_tt = bfs_train_test_results['ICC [1/mL]']['results']
selected_features_hnac_tt = bfs_train_test_results['HNAC [1/mL]']['selected_features']
bfs_scores_hnac_tt = bfs_train_test_results['HNAC [1/mL]']['results']


## Lasso with Train-Test Split

In [None]:
# Import required for this function
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

def lasso_feature_selection_train_test(train_df, test_df, target_variable, input_variables_list, alpha_values=None, cv_folds=5):
    """
    Lasso feature selection fitted on the training set and scored on the test set.

    The regularisation strength is chosen by cross-validation on the training
    rows only; a Lasso with that strength is then fitted on the (standardised)
    training data and evaluated on the test data, which is standardised with
    the statistics of the training data. Features with a non-zero coefficient
    are reported as selected.

    Candidate columns are found by substring matching on ``train_df.columns``:
    each input variable itself, every column whose name contains an input
    variable name, and every column whose name contains ``target_variable``
    (its lagged versions). The target column itself is never used as a feature.
    Rows with a missing value in any candidate column or in the target are dropped.

    Parameters:
    - train_df: Training DataFrame
    - test_df: Test DataFrame
    - target_variable: String, name of the target variable
    - input_variables_list: List of input variable names (without lags)
    - alpha_values: Alpha grid for cross-validation. If None, 50 values
      log-spaced between 1e-4 and 1e2 are used
    - cv_folds: Passed to ``LassoCV(cv=...)``.
      NOTE(review): an integer means plain K-fold splitting, which ignores
      temporal order; a ``TimeSeriesSplit`` instance can be passed instead —
      confirm which one is intended for this time series.

    Returns:
    - selected_features: List of selected feature names
    - lasso_results: Dictionary with the optimal alpha, selected features and
      coefficients, train/test R², MSE and RMSE, the fitted scaler and model,
      the predictions, and the CV curve. 'cv_scores' holds the mean CV MSE per
      alpha and 'alpha_values' the matching alphas in the same (decreasing) order.

    Raises:
    - ValueError: if no feature, no training row or no test row is left
      after removing missing values
    """

    # Input variables and their lagged versions
    candidate_features = []
    for var in input_variables_list:
        candidate_features.extend(col for col in train_df.columns if var in col and col != var)
        candidate_features.append(var)  # Include the original variable too

    # Lagged versions of the target (autoregressive terms)
    candidate_features.extend(
        col for col in train_df.columns if target_variable in col and col != target_variable
    )

    # dict.fromkeys removes duplicates while keeping a deterministic column
    # order (a set would reorder the features from run to run); dropping the
    # target itself guards against leaking it into the regressors.
    all_features = [f for f in dict.fromkeys(candidate_features) if f != target_variable]

    # Prepare training data
    X_train = train_df[all_features].copy()
    y_train = train_df[target_variable].copy()

    # Prepare test data
    X_test = test_df[all_features].copy()
    y_test = test_df[target_variable].copy()

    # Remove any rows with NaN values
    train_mask = ~(X_train.isnull().any(axis=1) | y_train.isnull())
    X_train = X_train[train_mask]
    y_train = y_train[train_mask]

    test_mask = ~(X_test.isnull().any(axis=1) | y_test.isnull())
    X_test = X_test[test_mask]
    y_test = y_test[test_mask]

    print(f"\n=== Lasso Feature Selection for {target_variable} (Train-Test) ===")
    print(f"Training set shape: {X_train.shape}")
    print(f"Test set shape: {X_test.shape}")
    print(f"Starting with {X_train.shape[1]} features")

    # DataFrame.empty is True when either axis has length zero
    if X_train.empty or X_test.empty:
        raise ValueError(
            f"Nothing to fit for {target_variable}: training data has shape {X_train.shape} "
            f"and test data has shape {X_test.shape} after removing missing values"
        )

    # Fit scaler on training data only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)  # Apply same scaling to test set

    # Set alpha values for cross-validation if not provided
    if alpha_values is None:
        alpha_values = np.logspace(-4, 2, 50)

    # Perform cross-validation on training set only
    print("Performing cross-validation on training set to find optimal alpha...")
    lasso_cv = LassoCV(alphas=alpha_values, cv=cv_folds, random_state=42, max_iter=10000)
    lasso_cv.fit(X_train_scaled, y_train)

    optimal_alpha = lasso_cv.alpha_
    print(f"Optimal alpha: {optimal_alpha:.6f}")

    # Fit Lasso with optimal alpha on training data
    lasso = Lasso(alpha=optimal_alpha, random_state=42, max_iter=10000)
    lasso.fit(X_train_scaled, y_train)

    # Get selected features (non-zero coefficients)
    selected_mask = lasso.coef_ != 0
    selected_features = [name for name, keep in zip(all_features, selected_mask) if keep]
    selected_coefficients = lasso.coef_[selected_mask]

    print(f"Number of selected features: {len(selected_features)}")
    print(f"Selected features: {selected_features}")

    # Training performance
    y_train_pred = lasso.predict(X_train_scaled)
    train_mse = mean_squared_error(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)

    # Test performance
    y_test_pred = lasso.predict(X_test_scaled)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print("\nModel Performance:")
    print(f"  Training R²: {train_r2:.6f}")
    print(f"  Training RMSE: {np.sqrt(train_mse):.6f}")
    print(f"  Test R²: {test_r2:.6f}")
    print(f"  Test RMSE: {np.sqrt(test_mse):.6f}")

    # Create results dictionary
    lasso_results = {
        'optimal_alpha': optimal_alpha,
        'selected_features': selected_features,
        'selected_coefficients': selected_coefficients,
        'feature_names': all_features,
        'all_coefficients': lasso.coef_,
        'train_r2': train_r2,
        'train_mse': train_mse,
        'train_rmse': np.sqrt(train_mse),
        'test_r2': test_r2,
        'test_mse': test_mse,
        'test_rmse': np.sqrt(test_mse),
        'cv_scores': lasso_cv.mse_path_.mean(axis=1),
        # LassoCV evaluates the grid in decreasing order; mse_path_ follows
        # alphas_, so the grid is stored in that order to keep it aligned
        # with 'cv_scores' (the caller-supplied grid may be ascending).
        'alpha_values': lasso_cv.alphas_,
        'scaler': scaler,
        'model': lasso,
        'train_predictions': y_train_pred,
        'test_predictions': y_test_pred
    }

    return selected_features, lasso_results


In [None]:
# Run the train/test Lasso feature selection once per target
lasso_train_test_results = {}

for run_number, lasso_target in enumerate(['ICC [1/mL]', 'HNAC [1/mL]']):
    if run_number > 0:
        # Visual separator between the logs of consecutive targets
        print("\n" + "="*80)
    print(f"Starting Lasso feature selection for {lasso_target} (Train-Test)...")

    lasso_selected, lasso_target_scores = lasso_feature_selection_train_test(
        train_df=train_df,
        test_df=test_df,
        target_variable=lasso_target,
        input_variables_list=input_vars_to_use,
        cv_folds=5
    )
    lasso_train_test_results[lasso_target] = {
        'selected_features': lasso_selected,
        'lasso_scores': lasso_target_scores
    }

# Per-target names, kept so that other cells can keep referring to them
selected_features_icc_lasso_tt = lasso_train_test_results['ICC [1/mL]']['selected_features']
lasso_scores_icc_tt = lasso_train_test_results['ICC [1/mL]']['lasso_scores']
selected_features_hnac_lasso_tt = lasso_train_test_results['HNAC [1/mL]']['selected_features']
lasso_scores_hnac_tt = lasso_train_test_results['HNAC [1/mL]']['lasso_scores']


In [None]:
# Text report of the train/test Lasso results, one section per target
print("\n" + "="*100)
print("LASSO FEATURE SELECTION RESULTS SUMMARY (TRAIN-TEST SPLIT)")
print("="*100)

# (label, key) pairs of the scalar metrics, in the order they are printed
reported_metrics = [
    ("Optimal alpha", 'optimal_alpha'),
    ("Train R² Score", 'train_r2'),
    ("Test R² Score", 'test_r2'),
    ("Train RMSE", 'train_rmse'),
    ("Test RMSE", 'test_rmse'),
]

for target in ['ICC [1/mL]', 'HNAC [1/mL]']:
    print(f"\n{target}:")
    print("-" * 50)

    target_entry = lasso_train_test_results[target]
    selected_features = target_entry['selected_features']
    lasso_scores = target_entry['lasso_scores']

    print(f"Number of selected features: {len(selected_features)}")
    print(f"Selected features: {selected_features}")
    for label, key in reported_metrics:
        print(f"{label}: {lasso_scores[key]:.6f}")

    # Selected features next to their (standardised-scale) coefficients
    print("\nSelected Features and Coefficients:")
    feature_coef_pairs = zip(selected_features, lasso_scores['selected_coefficients'])
    for rank, (feature, coef) in enumerate(feature_coef_pairs, start=1):
        print(f"  {rank:2d}. {feature:30s} = {coef:10.6f}")

    print("\n" + "="*50)


In [None]:
# Plot cross-validation results for alpha selection (one panel per target)
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

for ax, target in zip(axes, ['ICC [1/mL]', 'HNAC [1/mL]']):
    lasso_scores = lasso_train_test_results[target]['lasso_scores']

    # LassoCV evaluates its alpha grid in DECREASING order and the stored CV
    # scores follow that order. Sorting the grid the same way keeps every
    # score paired with the alpha it was computed for, whichever order the
    # grid was stored in; otherwise the curve is drawn mirrored.
    alphas_descending = np.sort(np.asarray(lasso_scores['alpha_values'], dtype=float))[::-1]

    # Plot CV scores vs alpha
    ax.semilogx(alphas_descending, lasso_scores['cv_scores'])
    ax.axvline(x=lasso_scores['optimal_alpha'], color='red', linestyle='--',
               label=f'Optimal α = {lasso_scores["optimal_alpha"]:.6f}')
    ax.set_xlabel('Alpha (Regularization Parameter)')
    ax.set_ylabel('Cross-Validation MSE')
    ax.set_title(f'Lasso CV Results (Train-Test) - {target}')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print final summary
print("\n" + "="*100)
print("FINAL SUMMARY (TRAIN-TEST SPLIT)")
print("="*100)
print("The Lasso feature selection with train-test split provides a more robust")
print("evaluation of model performance by testing on unseen data.")
print("\nKey advantages of Lasso with train-test split:")
print("- Better assessment of model generalization")
print("- More reliable feature selection through cross-validation")
print("- Clear separation between training and test performance")
print("- Helps identify potential overfitting")
print("\nThe train-test approach provides more confidence in the selected features")
print("and their predictive power on new data.")

## Train-Test Results Comparison

In [None]:
# Create comprehensive comparison between full dataset and train-test split approaches
print("\n" + "="*100)
print("COMPREHENSIVE COMPARISON: FULL DATASET vs TRAIN-TEST SPLIT")
print("="*100)

comparison_data = []

for target in ['ICC [1/mL]', 'HNAC [1/mL]']:
    target_short = target.replace(' [1/mL]', '')

    # Full dataset results
    bfs_full_features = len(bfs_results[target]['selected_features'])
    bfs_full_aic = bfs_results[target]['aic_scores']['final']['aic']

    lasso_full_features = len(lasso_results[target]['selected_features'])
    lasso_full_r2 = lasso_results[target]['lasso_scores']['r2_score']

    # Train-test results
    # The train/test BFS returns only 'selected_features' and 'error' when its
    # final fit failed, so missing metrics are reported as NaN instead of
    # aborting the whole comparison with a KeyError.
    bfs_tt_metrics = bfs_train_test_results[target]['results']
    bfs_tt_features = len(bfs_train_test_results[target]['selected_features'])
    bfs_tt_test_r2 = bfs_tt_metrics.get('test_r2', float('nan'))
    bfs_tt_test_rmse = bfs_tt_metrics.get('test_rmse', float('nan'))

    lasso_tt_metrics = lasso_train_test_results[target]['lasso_scores']
    lasso_tt_features = len(lasso_train_test_results[target]['selected_features'])
    lasso_tt_train_r2 = lasso_tt_metrics['train_r2']
    lasso_tt_test_r2 = lasso_tt_metrics['test_r2']
    lasso_tt_test_rmse = lasso_tt_metrics['test_rmse']

    comparison_data.append({
        'Target': target_short,
        'BFS_Full_Features': bfs_full_features,
        'BFS_Full_AIC': round(bfs_full_aic, 4),
        'BFS_TT_Features': bfs_tt_features,
        'BFS_TT_Test_R2': round(bfs_tt_test_r2, 6),
        'BFS_TT_Test_RMSE': round(bfs_tt_test_rmse, 4),
        'Lasso_Full_Features': lasso_full_features,
        'Lasso_Full_R2': round(lasso_full_r2, 6),
        'Lasso_TT_Features': lasso_tt_features,
        'Lasso_TT_Train_R2': round(lasso_tt_train_r2, 6),
        'Lasso_TT_Test_R2': round(lasso_tt_test_r2, 6),
        'Lasso_TT_Test_RMSE': round(lasso_tt_test_rmse, 4)
    })

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df.to_string(index=False))

# Feature overlap analysis between methods and approaches
print("\n\n" + "="*100)
print("FEATURE OVERLAP ANALYSIS")
print("="*100)

for target in ['ICC [1/mL]', 'HNAC [1/mL]']:
    print(f"\n{target}:")
    print("-" * 50)

    # Get feature sets
    bfs_full = set(bfs_results[target]['selected_features'])
    bfs_tt = set(bfs_train_test_results[target]['selected_features'])
    lasso_full = set(lasso_results[target]['selected_features'])
    lasso_tt = set(lasso_train_test_results[target]['selected_features'])

    # Calculate overlaps
    bfs_overlap = bfs_full & bfs_tt
    lasso_overlap = lasso_full & lasso_tt
    all_methods_overlap = bfs_overlap & lasso_overlap

    # The overlap is expressed relative to the smaller of the two sets; a
    # method that selected nothing (e.g. a strongly regularised Lasso) gives
    # 0.0% instead of a division by zero.
    bfs_smaller = min(len(bfs_full), len(bfs_tt))
    bfs_overlap_pct = len(bfs_overlap) / bfs_smaller * 100 if bfs_smaller else 0.0
    lasso_smaller = min(len(lasso_full), len(lasso_tt))
    lasso_overlap_pct = len(lasso_overlap) / lasso_smaller * 100 if lasso_smaller else 0.0

    print(f"BFS Full Dataset ({len(bfs_full)}): {sorted(list(bfs_full))}")
    print(f"BFS Train-Test ({len(bfs_tt)}): {sorted(list(bfs_tt))}")
    print(f"BFS Overlap: {len(bfs_overlap)}/{bfs_smaller} ({bfs_overlap_pct:.1f}%)")

    print(f"\nLasso Full Dataset ({len(lasso_full)}): {sorted(list(lasso_full))}")
    print(f"Lasso Train-Test ({len(lasso_tt)}): {sorted(list(lasso_tt))}")
    print(f"Lasso Overlap: {len(lasso_overlap)}/{lasso_smaller} ({lasso_overlap_pct:.1f}%)")

    print(f"\nConsistent across ALL methods ({len(all_methods_overlap)}): {sorted(list(all_methods_overlap))}")

print("\n" + "="*100)


In [None]:
%%script false --no-raise-error
# avoid to run this cell


# Save train-test results and create final summary.
# Builds a per-target summary of the BFS and Lasso train-test runs, writes it
# together with the full-vs-train-test comparison to a timestamped Excel
# workbook, and prints closing recommendations.
from datetime import datetime

print("\n" + "="*100)
print("SAVING TRAIN-TEST RESULTS AND FINAL SUMMARY")
print("="*100)

# Targets reported in every sheet below
tt_targets = ['ICC [1/mL]', 'HNAC [1/mL]']

# Create train-test summary DataFrame (one row per target)
train_test_summary = pd.DataFrame([
    {
        'Target_Variable': target,
        'BFS_TT_Features_Count': len(bfs_train_test_results[target]['selected_features']),
        'BFS_TT_Test_R2': round(bfs_train_test_results[target]['results']['test_r2'], 6),
        'BFS_TT_Test_RMSE': round(bfs_train_test_results[target]['results']['test_rmse'], 4),
        'BFS_TT_Selected_Features': '; '.join(sorted(bfs_train_test_results[target]['selected_features'])),
        'Lasso_TT_Features_Count': len(lasso_train_test_results[target]['selected_features']),
        'Lasso_TT_Train_R2': round(lasso_train_test_results[target]['lasso_scores']['train_r2'], 6),
        'Lasso_TT_Test_R2': round(lasso_train_test_results[target]['lasso_scores']['test_r2'], 6),
        'Lasso_TT_Test_RMSE': round(lasso_train_test_results[target]['lasso_scores']['test_rmse'], 4),
        'Lasso_TT_Optimal_Alpha': round(lasso_train_test_results[target]['lasso_scores']['optimal_alpha'], 6),
        'Lasso_TT_Selected_Features': '; '.join(sorted(lasso_train_test_results[target]['selected_features']))
    }
    for target in tt_targets
])

# Save to Excel with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
train_test_filename = f"feature_selection_train_test_results_{timestamp}.xlsx"

# Set to True only once the workbook has been written, so the closing summary
# does not claim a file exists when the export failed.
train_test_saved = False

try:
    with pd.ExcelWriter(train_test_filename, engine='openpyxl') as writer:
        # Save train-test specific results
        train_test_summary.to_excel(writer, sheet_name='Train_Test_Summary', index=False)

        # Save comparison between full and train-test
        comparison_df.to_excel(writer, sheet_name='Full_vs_TrainTest_Comparison', index=False)

        # Save detailed BFS train-test results
        bfs_tt_detailed = pd.DataFrame([
            {
                'Target': target,
                'Selected_Features': '; '.join(bfs_train_test_results[target]['selected_features']),
                'Feature_Count': len(bfs_train_test_results[target]['selected_features']),

                'Train_AIC': bfs_train_test_results[target]['results']['train_aic'],
                'Test_R2': bfs_train_test_results[target]['results']['test_r2'],
                'Test_RMSE': bfs_train_test_results[target]['results']['test_rmse']
            }
            for target in tt_targets
        ])
        bfs_tt_detailed.to_excel(writer, sheet_name='BFS_TrainTest_Detailed', index=False)

        # Save detailed Lasso train-test results
        lasso_tt_detailed = pd.DataFrame([
            {
                'Target': target,
                'Selected_Features': '; '.join(lasso_train_test_results[target]['selected_features']),
                'Feature_Count': len(lasso_train_test_results[target]['selected_features']),
                'Optimal_Alpha': lasso_train_test_results[target]['lasso_scores']['optimal_alpha'],
                'Train_R2': lasso_train_test_results[target]['lasso_scores']['train_r2'],
                'Train_RMSE': lasso_train_test_results[target]['lasso_scores']['train_rmse'],
                'Test_R2': lasso_train_test_results[target]['lasso_scores']['test_r2'],
                'Test_RMSE': lasso_train_test_results[target]['lasso_scores']['test_rmse']
            }
            for target in tt_targets
        ])
        lasso_tt_detailed.to_excel(writer, sheet_name='Lasso_TrainTest_Detailed', index=False)

    train_test_saved = True
    print(f"✓ Train-test results saved to: {train_test_filename}")

except Exception as e:
    # Best-effort export: report the failure and keep going so the
    # recommendations below are still printed.
    print(f"✗ Error saving train-test results: {e}")

# Final recommendations
# ("\n" rather than "\\n" so blank lines are printed instead of a literal backslash-n)
print("\n\n📊 FINAL RECOMMENDATIONS")
print("="*50)
print("Based on the train-test split analysis:")
print("\n1. MODEL GENERALIZATION:")
print("   - Use test R² scores to assess true model performance")
print("   - Compare train vs test performance to detect overfitting")
print("\n2. FEATURE SELECTION RELIABILITY:")
print("   - Features selected consistently across full and train-test approaches are most reliable")
print("   - Consider features that appear in multiple methods for robustness")
print("\n3. METHOD COMPARISON:")
print("   - Lasso provides both train and test performance metrics")
print("   - BFS with train-test gives better generalization estimates")
print("\n4. NEXT STEPS:")
print("   - Use the most consistent features for final model development")
print("   - Consider ensemble approaches combining selected features")
print("   - Validate on additional external datasets if available")

print("\n📁 All results saved to:")
print("   - Full dataset analysis: feature_selection_results_[timestamp].xlsx")
if train_test_saved:
    print(f"   - Train-test analysis: {train_test_filename}")
else:
    print("   - Train-test analysis: not saved (see error above)")

print("\n" + "="*100)


In [None]:
# Final consolidated summary display.
# Reads `comprehensive_summary`, `detailed_feature_summary` and
# `output_filename`, which are expected to come from earlier cells
# (not defined in this cell).


def _print_feature_rows(features):
    """Print one bullet per row with the feature name, base variable and type."""
    for _, row in features.iterrows():
        print(f"  • {row['Feature_Name']} ({row['Base_Variable']} - {row['Feature_Type']})")


print("\n" + "="*100)
print("FINAL CONSOLIDATED SUMMARY")
print("="*100)

# Display key metrics side by side
print("\n1. OVERVIEW COMPARISON")
print("-" * 50)
overview_df = comprehensive_summary[['Target_Variable', 'BFS_Features_Count', 'BFS_Final_AIC',
                                   'Lasso_Features_Count', 'Lasso_R2_Score', 'Overlap_Count',
                                   'Overlap_Pct_vs_BFS']].copy()
# Shorter headers so the table fits on one line when printed
overview_df.columns = ['Target', 'BFS_Count', 'BFS_AIC', 'Lasso_Count', 'Lasso_R2', 'Overlap', 'Overlap_%']
print(overview_df.to_string(index=False))

# Most consistently selected features
print("\n\n2. MOST CONSISTENTLY SELECTED FEATURES")
print("-" * 50)
high_consistency = detailed_feature_summary[detailed_feature_summary['Selection_Consistency'] == 'High']
if not high_consistency.empty:
    print(f"Features selected by both methods for both targets ({len(high_consistency)} features):")
    _print_feature_rows(high_consistency)
else:
    print("No features were selected by both methods for both targets.")

# Medium consistency features
medium_consistency = detailed_feature_summary[detailed_feature_summary['Selection_Consistency'] == 'Medium']
if not medium_consistency.empty:
    print(f"\nMedium consistency features ({len(medium_consistency)} features):")
    _print_feature_rows(medium_consistency)

# Top features by absolute Lasso coefficients
print("\n\n3. TOP FEATURES BY LASSO COEFFICIENT MAGNITUDE")
print("-" * 50)
for target in ['ICC [1/mL]', 'HNAC [1/mL]']:
    # Column name is derived from the target, e.g. 'ICC [1/mL]' -> 'ICC_Lasso_Coefficient'
    target_short = target.replace(' [1/mL]', '').replace(' ', '_')
    coef_col = f'{target_short}_Lasso_Coefficient'

    # Keep only features whose coefficient for this target is non-zero
    target_features = detailed_feature_summary[detailed_feature_summary[coef_col] != 0].copy()
    if not target_features.empty:
        target_features['abs_coef'] = target_features[coef_col].abs()
        target_features = target_features.sort_values('abs_coef', ascending=False).head(5)

        print(f"\n{target} - Top 5 by coefficient magnitude:")
        for _, row in target_features.iterrows():
            coef_val = row[coef_col]
            print(f"  • {row['Feature_Name']:30s} = {coef_val:8.4f}")

# Feature type analysis
print("\n\n4. FEATURE TYPE ANALYSIS")
print("-" * 50)
# Selections and feature counts per (base variable, feature type) pair
feature_type_summary = detailed_feature_summary.groupby(['Base_Variable', 'Feature_Type']).agg({
    'Total_Selections': 'sum',
    'Feature_Name': 'count'
}).rename(columns={'Feature_Name': 'Count'}).reset_index()

# Roll the per-type rows up to one row per base variable, most selected first
base_var_summary = feature_type_summary.groupby('Base_Variable').agg({
    'Total_Selections': 'sum',
    'Count': 'sum'
}).sort_values('Total_Selections', ascending=False)

print("Variable importance by total selections across all methods and targets:")
for var, data in base_var_summary.head(10).iterrows():
    # iterrows() upcasts each row to a single dtype, so both values become
    # floats if either column is float; ':2d' would then raise ValueError.
    # ':2.0f' renders integers identically and also accepts floats.
    print(f"  • {var:25s}: {data['Total_Selections']:2.0f} selections ({data['Count']:2.0f} features)")

print("\n\n📊 SUMMARY STATISTICS:")
print(f"   Total unique features analyzed: {len(detailed_feature_summary)}")
print(f"   Features with high consistency: {len(high_consistency)}")
print(f"   Features with medium consistency: {len(medium_consistency)}")
print(f"   Average BFS features per target: {comprehensive_summary['BFS_Features_Count'].mean():.1f}")
print(f"   Average Lasso features per target: {comprehensive_summary['Lasso_Features_Count'].mean():.1f}")
print(f"   Average overlap percentage: {comprehensive_summary['Overlap_Pct_vs_BFS'].mean():.1f}%")

print(f"\n📁 Data saved to Excel file: {output_filename}")
print("   Contains 4 sheets: Summary_by_Target, Detailed_Feature_Analysis, BFS_Results, Lasso_Results")

print("\n" + "="*100)
