In [None]:
# Import libraries

import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import skew
from scipy.stats import kurtosis

import os
import zipfile
import shutil
import matplotlib.pyplot as plt
import pybigtools
from joblib import Parallel, delayed
import multiprocessing

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression, r_regression

from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern

In [None]:
# Unzip and organize data files
#
# Set `unzip = True` to (re)build the `data/` tree from the raw zip archives.
# Resulting layout:
#   data/<cell_line>/<DataType>_<cell_line>.{bw,bed}   signal tracks / peak files
#   data/CAGE-train/<path inside zip>                  CAGE tables (.tsv)

unzip = False

if unzip:

    # Define paths
    source_dir = 'ML4G_Project_1_Data'  # Path to directory with zip files
    target_dir = 'data'  # Target directory for extracted files

    # Create target directory if it doesn't exist
    os.makedirs(target_dir, exist_ok=True)

    # Create folders for each cell line
    cell_lines = ['X1', 'X2', 'X3']
    for cell_line in cell_lines:
        os.makedirs(os.path.join(target_dir, cell_line), exist_ok=True)

    # Canonical assay names. A zip is assigned the FIRST name whose lowercase
    # form occurs in the (lowercased) zip file name, so order matters.
    data_types = ['DNase', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3']

    # Get all zip files in the source directory (excluding sample.zip)
    zip_files = [f for f in os.listdir(source_dir) if f.endswith('.zip') and f != 'sample.zip']

    # Process each zip file separately
    for zip_file in zip_files:
        zip_path = os.path.join(source_dir, zip_file)
        zip_name = os.path.splitext(zip_file)[0]
        print(f"Processing {zip_file}...")

        # The assay type depends only on the zip name, so resolve it once per zip
        # (None means "unknown assay": signal files from this zip are skipped).
        zip_lower = zip_name.lower()
        data_type = next((name for name in data_types if name.lower() in zip_lower), None)

        # Extract to a unique temporary directory for this zip file
        temp_extract_dir = os.path.join(target_dir, f'temp_{zip_name}')
        os.makedirs(temp_extract_dir, exist_ok=True)

        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(temp_extract_dir)

            # Walk through all extracted files from this zip and organize them
            for root, dirs, files in os.walk(temp_extract_dir):
                # Skip macOS resource-fork folders (e.g. __MACOSX)
                if 'MACOSX' in root:
                    continue
                for file in files:
                    source_file = os.path.join(root, file)

                    # Handle CAGE-train files separately
                    if 'CAGE-train' in root or file.endswith('.tsv'):
                        # Keep CAGE-train files in their own folder
                        cage_train_target = os.path.join(target_dir, 'CAGE-train')
                        os.makedirs(cage_train_target, exist_ok=True)

                        # Preserve the relative path structure for CAGE files
                        rel_path = os.path.relpath(source_file, temp_extract_dir)
                        target_file = os.path.join(cage_train_target, rel_path)
                        os.makedirs(os.path.dirname(target_file), exist_ok=True)
                        shutil.copy2(source_file, target_file)
                        print(f"  Copied {file} to CAGE-train/")
                        continue

                    # Normalise the extension (both .bw and .bigwig become .bw);
                    # matching is case-insensitive so e.g. ".BigWig" is accepted too.
                    file_lower = file.lower()
                    if file_lower.endswith(('.bw', '.bigwig')):
                        ext = '.bw'
                    elif file_lower.endswith('.bed'):
                        ext = '.bed'
                    else:
                        continue

                    # Skip if we can't identify the data type
                    if data_type is None:
                        continue

                    # Determine which cell line this file belongs to (first match wins)
                    cell_line = next((c for c in cell_lines if c in file), None)
                    if cell_line is None:
                        continue

                    # Create new filename: {data_type}_{cell_line}{ext}
                    new_filename = f"{data_type}_{cell_line}{ext}"
                    target_file = os.path.join(target_dir, cell_line, new_filename)

                    # Copy the file to the new location
                    shutil.copy2(source_file, target_file)
                    print(f"  Copied {file} -> {cell_line}/{new_filename}")
        finally:
            # Clean up this zip's temporary directory even if extraction/copying failed
            shutil.rmtree(temp_extract_dir)
            print(f"  Cleaned up temp_{zip_name}")

    print(f"\nAll {len(zip_files)} zip files have been extracted and organized in {target_dir}")
    print(f"Structure: data/{'{X1,X2,X3}'}/{'{DataType}_{CellLine}.{bw,bed}'}")

In [None]:
# Data paths etc.
#
# data_paths_bw / data_paths_bed: {cell_line: {mark: path}} for the bigWig
# signal tracks and BED files, following the layout
# data/<cell_line>/<mark>_<cell_line>.<ext>.
data_paths_bw = {
    cell: {
        mark: f'data/{cell}/{mark}_{cell}.bw'
        for mark in ('DNase', 'H3K27ac', 'H3K27me3', 'H3K36me3',
                     'H3K4me1', 'H3K4me3', 'H3K9me3')
    }
    for cell in ('X1', 'X2', 'X3')
}

data_paths_bed = {
    cell: {
        mark: f'data/{cell}/{mark}_{cell}.bed'
        for mark in ('DNase', 'H3K27ac', 'H3K27me3', 'H3K36me3',
                     'H3K4me1', 'H3K4me3', 'H3K9me3')
    }
    for cell in ('X1', 'X2', 'X3')
}

# gene_paths: {cell_line: {split: {'info': path, 'target': path}}} for the
# cell lines that have labels (X1, X2). Files use the tag "val" for the
# 'validation' split and the suffix "_y" for the target table.
gene_paths = {
    cell: {
        split: {
            'info': f'data/CAGE-train/CAGE-train/{cell}_{tag}_info.tsv',
            'target': f'data/CAGE-train/CAGE-train/{cell}_{tag}_y.tsv',
        }
        for split, tag in (('train', 'train'), ('validation', 'val'))
    }
    for cell in ('X1', 'X2')
}
# X3 only has a test info table (a plain path, not a nested dict).
# NOTE(review): unlike the X1/X2 entries this path has no
# 'data/CAGE-train/CAGE-train/' prefix - confirm where the file actually lives.
gene_paths['X3'] = 'X3_test_info.tsv'

In [None]:
def calculate_features(vals, wide_values=None, weights=None, num_bins=10, bin_stats=['mean']):
    """
    Calculate all features for a given signal array.

    The returned feature layout is always the same for a given
    (num_bins, bin_stats) pair, regardless of the data:
    6 global statistics, 1 weighted sum, then one value per (bin, statistic).

    Parameters:
    - vals: 1-D signal values in the main window (numpy array or any sequence;
            None is treated as an empty signal)
    - wide_values: optional array for weighted sum calculation
    - weights: optional weights for wide_values (must broadcast against it)
    - num_bins: number of equal-width bins for spatial features; the last bin
                absorbs the remainder. Values <= 0 produce no bin features.
    - bin_stats: list of statistics to calculate per bin.
                 Options: 'mean', 'min', 'max', 'std', 'median', 'sum'
                 (the default list is never mutated)

    Returns:
    - list of feature values
    - list of feature names

    Raises:
    - KeyError: if a name in bin_stats is not one of the options above and
                there is enough data to actually compute bin statistics.
    """
    # Accept plain sequences / None as well as numpy arrays.
    vals = np.asarray([] if vals is None else vals)

    features = []
    feature_names = []

    # Basic statistics (global); an empty signal yields all zeros.
    basic_names = ["avg", "min", "max", "std", "med", "sum"]
    if vals.size == 0:
        basic_features = [0.0] * len(basic_names)
    else:
        basic_features = [
            np.mean(vals),
            np.min(vals),
            np.max(vals),
            np.std(vals),
            np.median(vals),
            np.sum(vals)
        ]

    features.extend(basic_features)
    feature_names.extend(basic_names)

    # Weighted sum (distance-weighted); 0.0 when either input is missing.
    if wide_values is not None and weights is not None:
        w_sum = np.sum(wide_values * weights)
    else:
        w_sum = 0.0
    features.append(w_sum)
    feature_names.append("w_sum")

    # Mapping of stat names to functions
    stat_functions = {
        'mean': np.mean,
        'min': np.min,
        'max': np.max,
        'std': np.std,
        'median': np.median,
        'sum': np.sum
    }

    # Binned features. bin_size == 0 means "fewer values than bins" (or an empty
    # signal); in that case every bin statistic is reported as 0.0.
    # The num_bins > 0 guard avoids a ZeroDivisionError for num_bins == 0.
    n_vals = len(vals)
    bin_size = n_vals // num_bins if num_bins > 0 else 0

    for b in range(num_bins):
        if bin_size > 0:
            bin_start = b * bin_size
            # The last bin runs to the end so no trailing values are dropped.
            bin_end = (b + 1) * bin_size if b < num_bins - 1 else n_vals
            bin_vals = vals[bin_start:bin_end]
            bin_features = [stat_functions[stat_name](bin_vals) for stat_name in bin_stats]
        else:
            bin_features = [0.0 for _ in bin_stats]

        features.extend(bin_features)
        feature_names.extend(f"bin{b}_{stat_name}" for stat_name in bin_stats)

    return features, feature_names


def _tss_distance_weights(wide_window, wide_bins, eps_dist, exclude_within=1000):
    """Inverse-distance weights for `wide_bins` positions spanning TSS +/- `wide_window`.

    Positions whose distance to the TSS is <= `exclude_within` get weight 0.
    """
    step = (wide_window * 2) // wide_bins
    relative_pos = np.arange(wide_window * 2)[::step]
    distance_to_tss = np.abs(relative_pos - wide_window)
    weights = 1.0 / (distance_to_tss + eps_dist)
    weights[distance_to_tss <= exclude_within] = 0.0
    return weights


def extract_all_features(gene_info_df, cell_line, window=10000, bw_paths=None, 
                         num_bins=5, bin_stats=['mean', 'max', 'std']):
    """
    Extract summary statistics from bigWig files for each gene in `gene_info_df` and
    each mark defined in `bw_paths` (or `data_paths_bw[cell_line]` by default).

    For every gene the main window is [TSS_start - window, TSS_end + window].
    In addition, a fixed +/- 10 kb window around TSS_start is summarised into
    2000 bins and combined into a single inverse-distance weighted sum
    (bins within 1 kb of the TSS get weight 0).

    Parameters:
    - gene_info_df: pd.DataFrame with columns 'TSS_start' and 'TSS_end', a
                    chromosome column 'chr' (if absent, the first column is used)
                    and optionally 'gene_name'.
    - cell_line: one of the keys in data_paths_bw (e.g. 'X1').
    - window: int, number of bases upstream/downstream of TSS to include (default 10000 bases).
    - bw_paths: optional dict of mark->path. If None, uses data_paths_bw[cell_line].
    - num_bins: int, number of bins for spatial features (default 5).
    - bin_stats: list of statistics to calculate per bin (default ['mean', 'max', 'std']).
                 Options: 'mean', 'min', 'max', 'std', 'median', 'sum'

    Returns:
    - pd.DataFrame: one row per gene, in input order. The index holds
                    'gene_name' if present, otherwise the stringified input index.
                    Columns look like '<MARK>_avg', '<MARK>_w_sum', '<MARK>_bin0_mean', ...
                    A mark whose file cannot be opened, or a gene whose extraction
                    fails, is filled with zeros (a warning is printed).
    """
    if bw_paths is None:
        bw_paths = data_paths_bw.get(cell_line, {})

    # Prepare index / gene identifiers
    if 'gene_name' in gene_info_df.columns:
        gene_names = gene_info_df['gene_name'].astype(str).tolist()
    else:
        gene_names = gene_info_df.index.astype(str).tolist()

    # Wide window for the weighted feature. None of this depends on the gene
    # or the mark, so it is computed once per call instead of once per gene.
    wide_window = 10000
    wide_bins = (wide_window * 2) // 10
    EPS_DIST = 10  # keeps the weight finite at distance 0
    weights = _tss_distance_weights(wide_window, wide_bins, EPS_DIST)

    has_chr_column = 'chr' in gene_info_df.columns
    feature_frames = []

    for mark, path in bw_paths.items():
        print(f"Extracting features for {mark} from {path}...")

        # Feature names do not depend on the data, so derive the column names
        # up front from an empty signal; this also defines the zero-fill width.
        _, base_names = calculate_features(
            np.array([]), num_bins=num_bins, bin_stats=bin_stats
        )
        cols = [mark + "_" + name for name in base_names]
        zero_row = [0.0] * len(cols)

        # Check file exists / can be opened
        try:
            bw = pybigtools.open(path)
            if bw is None:
                raise FileNotFoundError(f"Could not open bigWig: {path}")
        except Exception as e:
            print(f"  Warning: cannot open {path}: {e}. Filling zeros for this mark.")
            feature_frames.append(pd.DataFrame(0.0, index=gene_names, columns=cols))
            continue

        rows = []
        try:
            # Iterate genes in order and extract statistics
            for row in gene_info_df.itertuples(index=False):
                chrom = str(getattr(row, 'chr')) if has_chr_column else str(row[0])
                tss_start = int(getattr(row, 'TSS_start'))
                tss_end = int(getattr(row, 'TSS_end'))
                start = tss_start - window
                end = tss_end + window

                try:
                    # Extract main window values
                    values = bw.values(chrom, start, end, oob=0.0)

                    # Extract wide window values for weighted sum
                    wide_values = bw.values(
                        chrom,
                        tss_start - wide_window,
                        tss_start + wide_window,
                        oob=0.0,
                        bins=wide_bins
                    )

                    # Handle None values from bigWig
                    vals = np.array([], dtype=float) if values is None else values

                    # Calculate all features
                    gene_features, _ = calculate_features(
                        vals,
                        wide_values=wide_values,
                        weights=weights,
                        num_bins=num_bins,
                        bin_stats=bin_stats
                    )
                    rows.append(gene_features)

                except Exception as e:
                    # Keep alignment with genes even when extraction fails
                    print(f"  Warning: failed for {chrom}:{start}-{end} ({e}). Using zeros.")
                    rows.append(list(zero_row))
        finally:
            # Always release the bigWig handle, even if a row is malformed.
            # Closing stays best-effort, but a failure is reported, not hidden.
            try:
                bw.close()
            except Exception as e:
                print(f"  Warning: could not close {path}: {e}")

        # Create dataframe for this mark
        feature_frames.append(pd.DataFrame(rows, columns=cols, index=gene_names))

    # Concatenate horizontally and ensure index is gene names
    if feature_frames:
        all_features_df = pd.concat(feature_frames, axis=1)
        all_features_df.index.name = 'gene_name'
    else:
        all_features_df = pd.DataFrame(index=gene_names)

    return all_features_df

In [None]:
# Approach 1: Use bw files, compute average, min, max, std over a region that spans the transcription start site (TSS) +/- 10kb

# Idea: Function that given cell line and type (train/val) extracts features and targets
def get_dataset(cell_line, set_type, window=10000, num_bins=10):
    """
    Load one (cell line, split) pair as a feature matrix plus target vector.

    Parameters:
    - cell_line: key into gene_paths / data_paths_bw (e.g. 'X1').
    - set_type: split key in gene_paths[cell_line] ('train' or 'validation').
    - window: bases up/downstream of the TSS, forwarded to extract_all_features.
    - num_bins: number of spatial bins, forwarded to extract_all_features.

    Returns:
    - features: pd.DataFrame produced by extract_all_features.
    - targets: numpy array with log2(gex + 0.001) for each row of the target table.
    """
    split_paths = gene_paths[cell_line][set_type]

    info_table = pd.read_csv(split_paths['info'], sep='\t')
    expression_table = pd.read_csv(split_paths['target'], sep='\t')

    # Window (in bases) is passed through so it can be varied between runs.
    feature_matrix = extract_all_features(
        info_table, cell_line, window=window, num_bins=num_bins
    )

    # Targets are log-scaled; the 0.001 pseudo-count keeps zero expression finite.
    # (Alternatives tried by the author: raw 'gex' values, or np.arcsinh scaling.
    #  Features are left unscaled here.)
    raw_expression = expression_table['gex'].values
    log_targets = np.log2(raw_expression + 0.001)

    return feature_matrix, log_targets


In [None]:
# Faster: parallel loader

def get_data_dict_joblib(n_jobs=None, window=10000, num_bins=5):
    """
    Notebook-friendly parallel loader using joblib (loky backend).

    Loads the four (cell line, split) datasets for X1/X2 x train/validation,
    one joblib task per dataset.

    Parameters:
    - n_jobs: number of parallel jobs. None uses min(CPU count, number of tasks).
              Negative values follow the joblib convention (-1 = all CPUs,
              -2 = all but one, ...). The result is always clamped to
              [1, number of tasks].
    - window: forwarded to get_dataset.
    - num_bins: forwarded to get_dataset.

    Returns:
    - dict {cell_line: {split: (features, targets)}} as returned by get_dataset.
    """
    cell_lines = ['X1', 'X2']
    splits = ['train', 'validation']
    tasks = [(c, s) for c in cell_lines for s in splits]
    num_tasks = len(tasks)

    cpu_total = multiprocessing.cpu_count()
    if n_jobs is None:
        n_jobs = cpu_total
    elif n_jobs < 0:
        # joblib semantics: n_jobs=-1 -> all CPUs, -2 -> all but one, ...
        # (previously any negative value was silently clamped to a single worker)
        n_jobs = cpu_total + 1 + n_jobs
    # Never spawn more workers than tasks, and never fewer than one.
    n_jobs = max(1, min(n_jobs, num_tasks))

    # joblib will use loky by default in recent versions; force loky backend for robustness in notebooks
    results = Parallel(n_jobs=n_jobs, backend='loky')(
        delayed(get_dataset)(c, s, window, num_bins) for (c, s) in tasks
    )

    # Results come back in task order, so they can be zipped with the task list.
    data_dict = {c: {} for c in cell_lines}
    for (c, s), res in zip(tasks, results):
        data_dict[c][s] = res

    return data_dict

In [None]:
def build_large_data_dict(windows=[10000], num_bins_list=[5]):
    """
    Load the datasets for every (window, num_bins) combination.

    Parameters:
    - windows: iterable of window sizes (outer loop).
    - num_bins_list: iterable of bin counts (inner loop).

    Returns:
    - dict keyed by "win_<window>__bins_<num_bins>", each value being the
      dict returned by get_data_dict_joblib for that configuration.
    """
    configs = {}
    grid = ((win, n_bins) for win in windows for n_bins in num_bins_list)
    for win, n_bins in grid:
        label = f"win_{win}__bins_{n_bins}"
        print(f"Loading data for {label}...")
        configs[label] = get_data_dict_joblib(window=win, num_bins=n_bins)
    return configs

In [None]:
def evaluate_model(model, data_dict):
    """
    Fit `model` under three training scenarios and score each on both validation sets.

    Scenarios (in order): train on X1, train on X2, train on X1 and X2 combined.
    For every scenario a fresh StandardScaler and SelectKBest are fitted on the
    training features only and then re-used to transform the validation data.
    The same `model` instance is refitted for each scenario.

    Parameters:
    - model: sklearn-compatible model instance (must provide fit / predict)
    - data_dict: nested dict with structure {cell_line: {'train': (X, y), 'validation': (X, y)}}

    Returns:
    - dict keyed by (training scenario name, validation cell line) whose values
      are the metric dicts returned by _evaluate_on_validation
    """
    scenario_to_cells = {
        'X1': ['X1'],
        'X2': ['X2'],
        'X1+X2': ['X1', 'X2'],
    }
    validation_cells = ('X1', 'X2')

    results = {}
    for scenario_name, train_cells in scenario_to_cells.items():
        X_train, y_train = _combine_datasets(data_dict, train_cells, split='train')

        # Standardise features; pandas output keeps the column names.
        scaler = StandardScaler().set_output(transform="pandas")
        X_scaled = scaler.fit_transform(X_train)

        # Feature selection is a pass-through for now: k equals the number of columns.
        selector = SelectKBest(f_regression, k=X_scaled.shape[1])
        X_selected = selector.fit_transform(X_scaled, y_train)

        model.fit(X_selected, y_train)

        # Score this fit on every validation set before refitting for the next scenario.
        for val_cell in validation_cells:
            results[(scenario_name, val_cell)] = _evaluate_on_validation(
                model,
                data_dict[val_cell]['validation'],
                scaler,
                selector,
            )

    return results


def _combine_datasets(data_dict, cell_lines, split='train'):
    """
    Stack the (X, y) pairs of several cell lines into one dataset.

    Parameters:
    - data_dict: nested dict {cell_line: {split: (X, y)}}
    - cell_lines: list of cell line names to combine, in stacking order
    - split: 'train' or 'validation'

    Returns:
    - X: feature DataFrame (rows of all requested cell lines, concatenated)
    - y: target array (concatenated in the same order)

    With exactly one cell line, the stored (X, y) pair is returned as-is.
    """
    # Single cell line: nothing to merge, hand back the stored pair unchanged.
    if len(cell_lines) == 1:
        return data_dict[cell_lines[0]][split]

    feature_parts = []
    for cell in cell_lines:
        feature_parts.append(data_dict[cell][split][0])
    X_combined = pd.concat(feature_parts)

    target_parts = []
    for cell in cell_lines:
        target_parts.append(data_dict[cell][split][1])
    y_combined = np.concatenate(target_parts)

    return X_combined, y_combined


def _evaluate_on_validation(model, validation_data, scaler, selector):
    """
    Score an already-fitted model on one validation set.

    The validation features go through the same fitted scaler and selector
    that were used for training before being passed to model.predict.

    Parameters:
    - model: trained model
    - validation_data: tuple of (X_val, y_val)
    - scaler: fitted StandardScaler
    - selector: fitted feature selector

    Returns:
    - dict with keys 'MSE', 'R^2' and "Spearman's rho"
    """
    X_val, y_val = validation_data

    # Apply the training-time preprocessing, then predict.
    predictions = model.predict(selector.transform(scaler.transform(X_val)))

    metrics = {}
    metrics['MSE'] = mean_squared_error(y_val, predictions)
    metrics['R^2'] = r2_score(y_val, predictions)
    # spearmanr returns (correlation, p-value); only the correlation is kept.
    rank_corr, _p_value = spearmanr(y_val, predictions)
    metrics["Spearman's rho"] = rank_corr
    return metrics


def print_evaluation_results(results):
    """
    Print the metrics of every (training scenario, validation cell line) pair.

    Entries are printed in sorted key order, each metric with 4 decimals.

    Parameters:
    - results: dict returned by evaluate_model
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    print("\nModel Evaluation Results")
    print(heavy_rule)

    for key, metrics in sorted(results.items()):
        train_scenario, val_cell_line = key
        print(f"\nTrained on: {train_scenario} | Validated on: {val_cell_line}")
        print(light_rule)
        for metric_name in metrics:
            value = metrics[metric_name]
            print(f"  {metric_name:20s}: {value:.4f}")

In [None]:
def collect_results_to_dataframe(large_data_dict, models):
    """
    Run evaluate_model for every (config, model) pair and tabulate the metrics.

    Progress (config header, model name, R^2 and Spearman per scenario) is
    printed while running.

    Parameters:
    - large_data_dict: dict of {config_key: data_dict}
    - models: dict of {model_name: model_instance}

    Returns:
    - pd.DataFrame with one row per (config, model, training scenario,
      validation cell line) and columns:
      Config, Model, Train_On, Val_On, MSE, R^2, Spearman's rho
    """
    records = []

    for config_key, data_dict in large_data_dict.items():
        print(f"\nEvaluating models with config: {config_key}")
        print("=" * 70)

        for model_name, model in models.items():
            print(f"  Model: {model_name}")

            scenario_metrics = evaluate_model(model, data_dict)

            # One output row per (training scenario, validation cell line).
            for (train_scenario, val_cell_line), metrics in scenario_metrics.items():
                record = {
                    'Config': config_key,
                    'Model': model_name,
                    'Train_On': train_scenario,
                    'Val_On': val_cell_line,
                    'MSE': metrics['MSE'],
                    'R^2': metrics['R^2'],
                    "Spearman's rho": metrics["Spearman's rho"],
                }
                records.append(record)

                # Progress line
                r2_value = record['R^2']
                rho_value = record["Spearman's rho"]
                print(f"    {train_scenario} → {val_cell_line}: R²={r2_value:.4f}, Spearman={rho_value:.4f}")

    return pd.DataFrame(records)

In [None]:
# Feature-extraction grid: every window size is combined with every bin count.
# `window_sizes` are bases up/downstream of the TSS; `bins` are the number of
# spatial bins the main window is split into.
window_sizes = [100, 300, 500, 1000]
bins = [5, 10]

# Builds one dataset dict per combination, keyed "win_<window>__bins_<num_bins>".
# Each entry re-reads the bigWig files for all genes, so this cell is slow.
large_data_dict = build_large_data_dict(windows=window_sizes, num_bins_list=bins)

In [None]:
# Usage: compare a set of regressors on every feature configuration.
# Keys are the display names that end up in the 'Model' column of results_df.
# Each instance is refitted in place for every config / training scenario.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'HistGradientBoosting': HistGradientBoostingRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, n_jobs=-1, colsample_bytree=0.5, 
                                max_depth=2, random_state=42),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1, 
                                  colsample_bytree=0.8),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, 
                                          max_depth=10, max_features=0.1),
}

# Collect all results: one row per (config, model, training scenario, validation cell line)
results_df = collect_results_to_dataframe(large_data_dict, models)

# Quick look at the first rows of the results table
print("Results DataFrame:")
print(results_df.head())

In [None]:
# Train best model on all data and generate predictions for X3