# Distilling multiple models into a unified framework

Here we train a distilled model on the predictions (more precisely, the mean predictions) of an ensemble of models that were each trained on a different fold.
The following steps will be used:
1. Create Training data

    a. Make predictions for original sequences

    b. Generate new sequences from the genome to make predictions

    c. Generate new sequences with variants

2. Take the mean of the models' predictions for the created data
3. Retrain the model, using the artificial sequences as the validation set

## 1. Create Training data from 10 fold models

### a. Make predictions for original sequences with all 10 models

In [None]:
import numpy as np
import os
import sys

# Directory that holds the DRG command-line scripts (e.g. train_models/run_cnn_model.py);
# '~' is expanded to the current user's home directory.
drgclis = os.path.expanduser('~/Git/DRG/scripts/')

def generate_ensemble_predictions(modeldict, input_file, outpath, device='cpu', drgclis_path=None):
    """
    Generate predictions from multiple models and compute their mean.

    Every model in ``modeldict`` is run in its own process through
    ``train_models/run_cnn_model.py``. Afterwards the prediction files
    ``from<model basename>_predictions.npz`` are read from ``outpath`` and
    averaged element-wise.

    Parameters:
    -----------
    modeldict : dict
        Dictionary mapping fold names to model paths
    input_file : str
        Path to the input file for predictions
    outpath : str
        Output directory for saving predictions
    device : str, optional
        Device to use for computations ('cpu' or 'cuda'). Default is 'cpu'
    drgclis_path : str, optional
        Path to DRG scripts. If None, uses the global drgclis variable

    Returns:
    --------
    tuple
        (mean_values, columns, names) - averaged predictions and metadata.
        ``columns`` and ``names`` are taken from the first prediction file
        that was found.

    Raises:
    -------
    ValueError
        If no usable prediction file was found for any model.
    """
    # Local import so the function does not rely on a file-level import.
    import subprocess

    if drgclis_path is None:
        drgclis_path = drgclis

    # os.path.join works with or without a trailing separator on drgclis_path.
    script = os.path.join(drgclis_path, 'train_models', 'run_cnn_model.py')

    # Generate predictions for each model
    for model in modeldict.values():
        print(f'Processing {input_file} with model {model}')
        # Passing an argument list (no shell) keeps paths containing spaces or
        # shell metacharacters intact. sys.executable runs the script with the
        # interpreter (and environment) that is executing this code.
        command = [
            sys.executable, script, input_file, 'None',
            '--predictnew',
            '--cnn', f'{model}_model_params.dat',
            f'device={device}',
            '--save_predictions',
            '--outname', outpath,
        ]
        completed = subprocess.run(command, check=False)
        if completed.returncode != 0:
            # Best effort: report the failure and continue with the other
            # models; a missing prediction file is reported again below.
            print(f'Prediction run for model {model} exited with code {completed.returncode}.')

    # Read in predictions from individual models and create new training set out of mean
    values_list = []
    ref_columns = None
    ref_names = None

    for model in modeldict.values():
        pred_file = os.path.join(outpath, f'from{os.path.basename(model)}_predictions.npz')
        if not os.path.exists(pred_file):
            print(f'Prediction file {pred_file} not found.')
            continue

        with np.load(pred_file) as data:
            values = data['values']
            columns = data['columns']
            names = data['names']

        # The first file that is found defines the reference layout.
        if not values_list:
            ref_columns = columns
            ref_names = names

        # Only average predictions that share the reference columns and names.
        if np.array_equal(columns, ref_columns) and np.array_equal(names, ref_names):
            values_list.append(values)
        else:
            print(f'Incompatible columns or names in {pred_file}. Skipping.')

    if not values_list:
        raise ValueError("No valid prediction files found.")

    return np.mean(values_list, axis=0), ref_columns, ref_names

# Location and naming scheme of the 10 fold models
modelpath = os.path.abspath('./models/')
modelname = 'CTCFaH3K27acaH3K36me3aH33aH3K27me3aH3K4me1aATAConseq2krcomp_mh'
modelsuffix = '-cv10-1_Cormsek512l19TfEXPGELUmax10rcTvlCota_tc2dNoned1s1r1l7ma5nfc3s1024cbnoTfdo0.1tr1e-05SGD0.9bs64-F'

# One entry per fold: 'fold<i>' -> path prefix of the model trained on fold i
modeldict = {}
for f in range(10):
    modeldict[f'fold{f}'] = f'{modelpath}/{modelname}{f}{modelsuffix}'

# Input sequences, compute device and output directory
input_path = os.path.expanduser('./')
input_file = f'{input_path}seq2k.npz'
device = 'cpu'
outpath = os.path.expanduser('./output/')

# Run every fold model on the input and average the predictions
mean_values, columns, names = generate_ensemble_predictions(
    modeldict,
    input_file,
    outpath,
    device=device,
    drgclis_path=drgclis,
)

# Store the averaged predictions together with their column and row labels
np.savez_compressed(
    f'{outpath}/mean_predictions.npz',
    counts=mean_values,
    celltypes=columns,
    names=names,
)

### b. Generate new sequences from the genome and make predictions

In [None]:
# Location of the original BED file with the peak regions
data_path = '/home/sasse/UW/CutandRun/'
bed_file = data_path + 'ImmGen_ATACpeak.final.bed6'
# Read the BED file and collect the stretches between peaks that are wide
# enough (more than 250 bp by default) to place new regions outside the peaks.
def determine_regions_between_peaks(bed_file, min_gap=250):
    """
    Find the gaps between peaks on each chromosome of a BED file.

    Peaks are sorted per chromosome and the furthest end seen so far is
    tracked, so unsorted, overlapping or nested peaks never produce a gap
    that lies inside another peak.

    Parameters:
    -----------
    bed_file : str
        Path to a tab-separated BED file; only the first three columns
        (chromosome, start, end) are used. Blank lines and '#', 'track'
        and 'browser' header lines are skipped.
    min_gap : int, optional
        A gap is reported only if it is strictly longer than this many
        bases. Default is 250.

    Returns:
    --------
    dict
        Maps chromosome name to a list of (gap_start, gap_end) tuples, where
        gap_start is the end of the covered stretch before the gap and
        gap_end is the start of the next peak. Chromosomes without any
        qualifying gap are absent.
    """
    # Collect the peak intervals for each chromosome separately
    intervals = {}
    with open(bed_file, 'r') as f:
        # Iterate lazily instead of loading the whole file into memory.
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            if line.split(None, 1)[0] in ('track', 'browser'):
                continue
            fields = line.split('\t')
            intervals.setdefault(fields[0], []).append((int(fields[1]), int(fields[2])))

    # Find regions between peaks for each chromosome
    regions = {}
    for chrom, chrom_intervals in intervals.items():
        covered_end = None
        for start, end in sorted(chrom_intervals):
            if covered_end is None:
                covered_end = end
                continue
            if start - covered_end > min_gap:
                regions.setdefault(chrom, []).append((covered_end, start))
            # A nested peak must not pull the covered end backwards.
            covered_end = max(covered_end, end)
    return regions




### c. Generate new sequences with variants in original sequences