# Code for Gillespie simulations

## Steps

- Input
  - Read the input matrix
  - create reactions
  - the update matrices
- Generate the steady state distribution
  - Every 300 time steps, give birth to a cell
  - simulate the cells in parallel


## Input


In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import re
import numba
from numba import njit
import json
import os
from datetime import datetime
import uuid
from tqdm.auto import tqdm
from joblib import Parallel, delayed
import random

In [2]:
# Read the matrix
def read_input_matrix(path_to_matrix):
    """
    Reads the input matrix from the specified file path and counts number of genes.
    
    Parameters:
    path_to_matrix (str): The file path to the input matrix.
    
    Returns:
    np.ndarray: The input matrix as a NumPy array.
    """
    matrix = np.loadtxt(path_to_matrix, dtype='i', delimiter=',')
    if matrix.ndim == 0:
        matrix = np.array([[matrix]])

    # print(type(matrix))
    # print(matrix.shape)
    return matrix.shape[0], matrix

In [3]:
def generate_reaction_network_from_matrix(interaction_matrix):
    """
    Generate a reaction DataFrame directly from a signed interaction matrix.
    Assumes gene-specific parameters and interaction-specific regulation.

    Args:
        interaction_matrix (np.ndarray): shape (n_genes, n_genes)

    Returns:
        pd.DataFrame: reactions
        List[str]: gene names
    """
    n_genes = interaction_matrix.shape[0]
    gene_list = [f"gene_{i+1}" for i in range(n_genes)]

    prop = {
        "regulatory": "(({sign}*{p_add})*({activator}_protein**{n})/({k}**{n} + {activator}_protein**{n}))*{target}_I",
        "activation": "{p_on}*{target}_I",
        "inactivation": "{p_off}*{target}_A",
        "mRNA_prod": "{p_prod_mRNA}*{target}_A",
        "mRNA_deg": "{p_deg_mRNA}*{target}_mRNA",
        "protein_prod": "{p_prod_protein}*{target}_mRNA",
        "protein_deg": "{p_deg_protein}*{target}_protein"
    }

    reactions = []

    for j, target_gene in enumerate(gene_list):
        param = lambda p: f"{{{p}_{target_gene}}}"

        # Activation (gene_I → gene_A)
        expr = prop["activation"]
        expr = expr.replace("{p_on}", param("p_on")).replace("{target}", target_gene)
        reactions.append({
            "species1": f"{target_gene}_A", "change1": 1,
            "species2": f"{target_gene}_I", "change2": -1,
            "propensity": expr, "time": "-"
        })

        # Regulation by other genes (column j)
        regulators = np.where(interaction_matrix[:, j] != 0)[0]
        for i in regulators:
            source_gene = gene_list[i]
            sign = int(np.sign(interaction_matrix[i, j]))
            edge_tag = f"{source_gene}_to_{target_gene}"

            expr = prop["regulatory"]
            expr = expr.replace("{sign}", str(sign))
            expr = expr.replace("{p_add}", f"{{p_add_{edge_tag}}}")
            expr = expr.replace("{n}", f"{{n_{edge_tag}}}")
            expr = expr.replace("{k}", f"{{k_{edge_tag}}}")
            expr = expr.replace("{activator}", source_gene)
            expr = expr.replace("{target}", target_gene)

            reactions.append({
                "species1": f"{target_gene}_A", "change1": 1,
                "species2": f"{target_gene}_I", "change2": -1,
                "propensity": expr, "time": "-"
            })

        # Inactivation (gene_A → gene_I)
        expr = prop["inactivation"]
        expr = expr.replace("{p_off}", param("p_off")).replace("{target}", target_gene)
        reactions.append({
            "species1": f"{target_gene}_I", "change1": 1,
            "species2": f"{target_gene}_A", "change2": -1,
            "propensity": expr, "time": "-"
        })

        # Transcription & translation (uses gene-specific params)
        for label, suffix, change in [
            ("mRNA_prod", "mRNA", 1),
            ("mRNA_deg", "mRNA", -1),
            ("protein_prod", "protein", 1),
            ("protein_deg", "protein", -1)
        ]:
            expr = prop[label].replace("{target}", target_gene)
            for p in ["d", "p_prod_mRNA", "p_deg_mRNA", "p_prod_protein", "p_deg_protein"]:
                expr = expr.replace(f"{{{p}}}", param(p))
            reactions.append({
                "species1": f"{target_gene}_{suffix}", "change1": change,
                "species2": "-", "change2": "-",
                "propensity": expr, "time": "-"
            })

    # Consolidate reactions with same species1/species2/change values
    df = pd.DataFrame(reactions)
    df['propensity'] = df['propensity'].astype(str)
    reactions_df = (
        df.groupby(['species1', 'change1', 'species2', 'change2', 'time'])['propensity']
          .agg(lambda x: ' + '.join(x))
          .reset_index()
    )
    return reactions_df, gene_list

def generate_initial_state_from_genes(gene_list):
    """
    Generate an initial state where all genes are inactive and have zero mRNA/protein.

    Returns:
        pd.DataFrame with columns ['species', 'count']
    """
    states = []
    for gene in gene_list:
        states.extend([
            {"species": f"{gene}_A", "count": 0},
            {"species": f"{gene}_I", "count": 1},
            {"species": f"{gene}_mRNA", "count": 0},
            {"species": f"{gene}_protein", "count": 0}
        ])
    return pd.DataFrame(states)


In [4]:
def assign_parameters_to_genes(csv_path, n_genes, rows=None):
    """
    Assigns parameters from CSV to genes and returns a param_dict for expression substitution.
    
    Args:
        csv_path (str): Path to parameter CSV file
        rows (list of int, optional): Specific row indices to select. If None, selects randomly.
        n_random (int): Number of random rows to select if rows is None.
    
    Returns:
        tuple:
            param_dict (dict): {"{param_geneX}": value}
            param_matrix (pd.DataFrame): gene-wise parameter values
            row_mapping (dict): {"gene_X": row_index}
    """
    df = pd.read_csv(csv_path, index_col=0)

    # Select rows to assign to genes
    if rows is None:
        rows = np.random.choice(df.index, size=n_genes, replace=True)

    param_dict = {}
    param_matrix = {}
    row_mapping = {}

    for i, row in enumerate(rows):
        gene = f"gene_{i+1}"
        values = df.loc[row].copy()
        row_mapping[gene] = row

        # Derived params
        values["p_deg_mRNA"] = np.log(2) / values["mrna_half_life"]
        values["p_deg_protein"] = np.log(2) / values["protein_half_life"]

        # Remove unused columns
        values.drop(["mrna_half_life", "protein_half_life", "burst_size"], inplace=True, errors="ignore")

        # Add to param_matrix
        param_matrix[gene] = values

        # Flatten into param_dict with curly-brace keys
        for param, val in values.items():
            param_dict[f"{{{param}_{gene}}}"] = val

    param_matrix_df = pd.DataFrame(param_matrix).T
    return param_dict, param_matrix_df


In [5]:
# An useful utility function that is not used for simulation
# def calculate_unregulated_protein_levels(p_on, p_off, p_prod_mRNA, p_deg_mRNA, p_prod_protein, p_deg_protein, global_params=None):
#     """
#     Calculate protein levels without any regulation (for comparison/initialization).
    
#     Args:
#         param_matrix (pd.DataFrame): Gene parameters
#         global_params (dict, optional): Global constants
        
#     Returns:
#         np.ndarray: Unregulated steady-state protein levels
#     """    
#     # Calculate unregulated levels
#     burst_prob = p_on / (p_on + p_off)
#     mRNA = p_prod_mRNA * burst_prob / (p_deg_mRNA)
#     protein_levels = mRNA * p_prod_protein / p_deg_protein
    # return protein_levels

def generate_k_from_steady_state_calc(param_dict, interaction_matrix, gene_list,
                                        global_params=None, target_hill=0.5, scale_k=None, verbose=False):
    """
    Calculate steady-state protein levels using regulated k_on_eff for each gene,
    and update EC50 (k_*) values only for actual regulatory edges.

    Args:
        param_dict (dict): Parameter dictionary with gene-specific and edge-specific entries.
        interaction_matrix (np.ndarray): shape (n_genes, n_genes), effect of gene i on gene j.
        gene_list (list): List of gene names in order.
        global_params (dict): Optional global constants (unused).
        target_hill (float): Hill output to match when computing EC50.
        scale_k (np.ndarray or None): shape (n_genes, n_genes). scale_k[i, j] applies to k_{i→j}.
                                      Defaults to 1.0 for all entries.
        verbose (bool): Print debug info.

    Returns:
        tuple: (np.ndarray of steady-state protein levels, updated param_dict with k_* entries)
    """
    n_genes = len(gene_list)

    if scale_k is None:
        scale_k = np.ones((n_genes, n_genes))
    else:
        scale_k = np.asarray(scale_k)
        assert scale_k.shape == (n_genes, n_genes), "scale_k must be of shape (n_genes, n_genes)"

    protein_levels = np.zeros(n_genes)
    p_on_eff = np.zeros(n_genes)

    for i in range(n_genes):
        gene = gene_list[i]
        p_on = param_dict[f'{{p_on_{gene}}}']
        p_off = param_dict[f'{{p_off_{gene}}}']
        p_prod_mRNA = param_dict[f'{{p_prod_mRNA_{gene}}}']
        p_deg_mRNA = param_dict[f'{{p_deg_mRNA_{gene}}}']
        p_prod_protein = param_dict[f'{{p_prod_protein_{gene}}}']
        p_deg_protein = param_dict[f'{{p_deg_protein_{gene}}}']


        # Sum regulatory contributions
        regulatory_effect = 0.0
        regulators = np.where(interaction_matrix[:, i] != 0)[0]

        for reg in regulators:
            source = gene_list[reg]
            edge = f"{source}_to_{gene}"
            print(param_dict)
            p_add = param_dict.get(f"{{p_add_{edge}}}", 0)
            sign = interaction_matrix[reg, i]
            regulatory_effect += target_hill * p_add * sign
            print(source, edge, p_add, sign, regulatory_effect)

        p_on_eff[i] = p_on + regulatory_effect

        # # Compute protein level using k_on_eff
        burst_prob = p_on_eff[i] / (p_on_eff[i] + p_off)
        mRNA = p_prod_mRNA * burst_prob / p_deg_mRNA
        protein = mRNA * p_prod_protein / p_deg_protein

        protein_levels[i] = max(protein, 0.1)

        if verbose:
            print(f"Gene {gene}: k_on = {k_on:.3f} → k_on_eff = {k_on_eff[i]:.3f} "
                  f"(reg_effect: {regulatory_effect:.3f}) → Protein level: {protein_levels[i]:.3f}")

        # Assign EC50 values (k_*) only for actual edges, scaled with scale_k[i, j]
    for i in range(n_genes):
        source_gene = gene_list[i]
        targets = np.where(interaction_matrix[i, :] != 0)[0]
        for j in targets:
            target_gene = gene_list[j]
            key = f"{{k_{source_gene}_to_{target_gene}}}"
            param_dict[key] = protein_levels[i] * scale_k[i, j]

    return protein_levels, param_dict



In [6]:
def add_interaction_terms(param_dict, interaction_matrix, gene_list, n_matrix=None, p_add_matrix=None):
    """
    Add n and p_add terms to param_dict based on interaction_matrix.
    Also calculates EC50 (k) values using steady-state protein levels.

    Args:
        param_dict (dict): Initial dictionary of gene-specific parameters.
        interaction_matrix (np.ndarray): Regulatory interactions (2D array).
        gene_list (list): List of gene names like ['gene_1', 'gene_2', ...].
        n_matrix (np.ndarray, optional): Matrix of Hill coefficients (defaults to 2).
        p_add_matrix (np.ndarray, optional): Matrix of r_add values (defaults to 10).

    Returns:
        dict: Updated param_dict including n, p_add, and k values.
    """
    interaction_matrix = np.array(interaction_matrix)
    n_genes = len(gene_list)
    param_dict_updated = param_dict.copy()
    if n_matrix is None:
        n_matrix = np.full((n_genes, n_genes), 2)
    if p_add_matrix is None:
        p_add_matrix = np.full((n_genes, n_genes), 10)

    for i in range(n_genes):
        for j in range(n_genes):
            if interaction_matrix[i, j] != 0:
                gene_i = gene_list[i]
                gene_j = gene_list[j]
                edge = f"{gene_i}_to_{gene_j}"

                param_dict_updated[f"{{n_{edge}}}"] = n_matrix[i, j]
                param_dict_updated[f"{{p_add_{edge}}}"] = p_add_matrix[i, j]
    # Generate EC50 values using the correct steady-state calculation
    protein_levels, k_dict = generate_k_from_steady_state_calc(param_dict_updated, interaction_matrix, gene_list)

    return protein_levels, k_dict


In [7]:
def setup_gillespie_params_from_reactions(init_states, reactions, param_dictionary):
    """
    Setup Gillespie update matrix and function using species and parameter templates.

    Args:
        init_states (pd.DataFrame): columns ['species', 'count']
        reactions (pd.DataFrame): columns ['species1', 'change1', 'species2', 'change2', 'propensity', 'time']
        param_dictionary (dict): Dictionary containing all parameters (e.g., p_on_*, k_*, p_add_*, ...)

    Returns:
        population_init (np.ndarray), update_matrix (np.ndarray), update_function (str), species_index (dict)

    Raises:
        ValueError: If any required placeholder in reaction propensities is not in param_dictionary.
    """
    init_states = init_states.dropna()
    reactions = reactions.dropna()

    species_index = {s: i for i, s in enumerate(init_states['species'])}
    population_init = init_states['count'].values.astype(np.int64)
    
    update_matrix = []
    propensity_formulas = []
    missing_keys_report = []

    for i, row in reactions.iterrows():
        delta = [0] * len(species_index)
        delta[species_index[row['species1']]] = int(row['change1'])
        if row['species2'] != '-':
            delta[species_index[row['species2']]] = int(row['change2'])
        update_matrix.append(delta)

        expr = row['propensity']

        # Replace species with indexed population
        for species, idx in species_index.items():
            expr = expr.replace(species, f"population[{idx}]")

        # Validate and inject all parameter placeholders
        placeholders = set(re.findall(r"{[^}]+}", expr))
        missing = placeholders - set(param_dictionary.keys())
        if missing:
            missing_keys_report.append((i, row['propensity'], list(missing)))
            continue  # move to next reaction without injecting

        for key, val in param_dictionary.items():
            expr = expr.replace(key, str(val))

        if row['time'] != '-':
            line = f"propensities[{i}] = ({expr}) if ({row['time']}) else 0"
        else:
            line = f"propensities[{i}] = {expr}"
        propensity_formulas.append(line)

    if missing_keys_report:
        error_message = "Missing parameters in propensity expressions:\n"
        for i, raw_expr, missing_keys in missing_keys_report:
            error_message += f"  [Reaction {i}] '{raw_expr}' is missing: {', '.join(missing_keys)}\n"
        raise ValueError(error_message)

    update_func = "@numba.njit(fastmath=True)\ndef update_propensities(propensities, population, t):\n\t" + "\n\t".join(propensity_formulas)

    return population_init, np.array(update_matrix, dtype=np.int64), update_func, species_index


In [22]:
@njit(fastmath = True)
def sample_discrete(probs): #samples which reaction to run next
    """Randomly sample an index with probability given by probs."""
    # Generate random number
    q = np.random.rand()

    # Find index
    i = 0
    p_sum = 0.0
    while p_sum < q:
        p_sum += probs[i]
        i += 1
    return i - 1


@njit(fastmath = True)
def gillespie_draw(propensity_func, propensities, population, t):
    """
    Draws a reaction and the time it took to do that reaction.

    Parameters
    ----------
    propensity_func : function
        Function with call signature propensity_func(population, t, *args)
        used for computing propensities. This function must return
        an array of propensities.
    propensities : ndarray
        Propensities for each reaction as a 1D Numpy array.
    population : ndarray
        Current population of particles (key entities of interest, ie TF(RNA), TF(P))
    t : float
        Value of the current time.
    args : tuple, default ()
        Arguments to be passed to `propensity_func`.

    Returns
    -------
    rxn : int
        Index of reaction that occured.
    time : float
        Time it took for the reaction to occur.
    """
    # Compute propensities
    propensity_func(propensities, population, t)

    # Sum of propensities
    props_sum = propensities.sum()

    # Compute next time
    time = np.random.exponential(1.0 / props_sum) ## exponentially distributed state-switching

    # Compute discrete probabilities of each reaction
    rxn_probs = propensities / props_sum

    # Draw reaction from this distribution
    rxn = sample_discrete(rxn_probs) #sample_discrete(rxn_probs) or sample_discrete_scipy(rxn_probs)

    return rxn, time

def gillespie_ssa(propensity_func, update, population_0, time_points):
    """
    Uses the Gillespie stochastic simulation algorithm to sample
    from probability distribution of particle counts over time.

    Parameters
    ----------
    propensity_func : function
        Function of the form f(params, t, population) that takes the current
        population of particle counts and returns an array of propensities
        for each reaction.
    update : ndarray, shape (num_reactions, num_chemical_species)
        Entry i, j gives the change in particle counts of species j
        for chemical reaction i.
    population_0 : array_like, shape (num_chemical_species)
        Array of initial populations of all chemical species.
    time_points : array_like, shape (num_time_points,)
        Array of points in time for which to sample the probability
        distribution.
    args : tuple, default ()
        The set of parameters to be passed to propensity_func.

    Returns
    -------
    sample : ndarray, shape (num_time_points, num_chemical_species)
        Entry i, j is the count of chemical species j at time
        time_points[i].
    """

    # Initialize output
    pop_out = np.empty((len(time_points), update.shape[1]), dtype=np.int64)

    # Initialize and perform simulation
    i_time = 1
    i = 0
    t = time_points[0]
    population = population_0.copy()
    pop_out[0, :] = population
    propensities = np.zeros(update.shape[0])
    while i < len(time_points):
        while t < time_points[i_time]:
            # draw the event and time step
            event, dt = gillespie_draw(propensity_func, propensities, population, t)

            # Update the population
            population_previous = population.copy()
            population += update[event, :]

            # Increment time
            t += dt

        # Update the index
        # Replace inefficient comparison
        i = np.searchsorted(time_points, t)


        # Update the population
        pop_out[i_time : min(i, len(time_points))] = population_previous

        # Increment index
        i_time = i

    return pop_out


In [9]:
def extract_mrna_protein(samples, species_index, types=('mRNA', 'protein')):
    """
    Extract mRNA/protein data from a 3D samples array: (n_cells, n_timepoints, n_species)
    
    Args:
        samples (np.ndarray): shape (n_cells, n_timepoints, n_species)
        species_index (dict): species name → index
        types (tuple): which species types to include
    Returns:
        pd.DataFrame
    """
    n_cells, n_timepoints, _ = samples.shape
    selected_species = {k: v for k, v in species_index.items() if any(k.endswith(t) for t in types)}
    
    records = []
    for cell in range(n_cells):
        for tp in range(n_timepoints):
            row = {'cell_id': cell, 'time_step': tp}
            for species, idx in selected_species.items():
                row[species] = samples[cell, tp, idx]
            records.append(row)
    
    return pd.DataFrame.from_records(records)


## Steady state test calculations


In [10]:
# def grn_ode(t, y, params, p_on_eff=None):
#     # Unpack species
#     gene_1_A, gene_1_I, mRNA_1, protein_1, gene_2_A, gene_2_I, mRNA_2, protein_2 = y

#     # Unpack parameters
#     p_on_1 = params['{p_on_gene_1}']
#     p_off_1 = params['{p_off_gene_1}']
#     p_prod_mRNA_1 = params['{p_prod_mRNA_gene_1}']
#     p_deg_mRNA_1 = params['{p_deg_mRNA_gene_1}']
#     p_prod_protein_1 = params['{p_prod_protein_gene_1}']
#     p_deg_protein_1 = params['{p_deg_protein_gene_1}']

#     p_on_2 = params['{p_on_gene_2}']
#     p_off_2 = params['{p_off_gene_2}']
#     p_prod_mRNA_2 = params['{p_prod_mRNA_gene_2}']
#     p_deg_mRNA_2 = params['{p_deg_mRNA_gene_2}']
#     p_prod_protein_2 = params['{p_prod_protein_gene_2}']
#     p_deg_protein_2 = params['{p_deg_protein_gene_2}']

#     r_add = params['{p_add_gene_1_to_gene_2}']
#     n = params['{n_gene_1_to_gene_2}']
#     k = params['{k_gene_1_to_gene_2}']

#     # Hill function (if p_on_eff is not provided)
#     if p_on_eff is None:
#         hill = (r_add * (protein_1**n)) / (k**n + protein_1**n)
#         p_on2_eff = p_on_2 + hill
#     else:
#         p_on2_eff = p_on_eff

#     # ODEs
#     d_gene_1_A = -p_off_1 * gene_1_A + p_on_1 * gene_1_I
#     d_gene_1_I = -d_gene_1_A

#     d_mRNA_1 = p_prod_mRNA_1 * gene_1_A - p_deg_mRNA_1 * mRNA_1
#     d_protein_1 = p_prod_protein_1 * mRNA_1 - p_deg_protein_1 * protein_1

#     d_gene_2_A = -p_off_2 * gene_2_A + p_on2_eff * gene_2_I
#     d_gene_2_I = -d_gene_2_A

#     d_mRNA_2 = p_prod_mRNA_2 * gene_2_A - p_deg_mRNA_2 * mRNA_2
#     d_protein_2 = p_prod_protein_2 * mRNA_2 - p_deg_protein_2 * protein_2

#     return [
#         d_gene_1_A, d_gene_1_I, d_mRNA_1, d_protein_1,
#         d_gene_2_A, d_gene_2_I, d_mRNA_2, d_protein_2
#     ]

# def compute_effective_kon(df, param_dict):
#     protein1 = df['gene_1_protein'].values
#     n = float(param_dict['{n_gene_1_to_gene_2}'])
#     k = float(param_dict['{k_gene_1_to_gene_2}'])
#     p_add = float(param_dict['{p_add_gene_1_to_gene_2}'])
#     p_on = float(param_dict['{p_on_gene_2}'])

#     hill_vals = (protein1 ** n) / (k**n + protein1 ** n)
#     return p_on + p_add * np.mean(hill_vals)



# from scipy.integrate import solve_ivp

# # Example parameters

# # Initial conditions: all zeros except one active burst
# y0 = population_0 # [A1, I1, m1, p1, A2, I2, m2, p2]

# sim_Gillespie_10000_cells = pd.read_csv('/home/mzo5929/Keerthana/grnInference/simulation_data/gillespie_simulation/test/df_simulation_10000_cells_300h_timepoints_rows_12_13_20250701_213952_e87c9e5a.csv')
# time = 299
# singleTime_df_gillespie_10000 = sim_Gillespie_10000_cells[(sim_Gillespie_10000_cells['time_step'] == time)]
# p_on_eff_gene_2 = compute_effective_kon(singleTime_df_gillespie_10000, full_param_dict)
# protein1_samples = singleTime_df_gillespie_10000['gene_1_protein'].values
# K = full_param_dict['{k_gene_1_to_gene_2}']
# n = full_param_dict['{n_gene_1_to_gene_2}']
# hill_vals = (protein1_samples ** n) / (K**n + protein1_samples ** n)
# mean_hill = np.mean(hill_vals)
# print(f"Expected Hill output from simulation: {mean_hill:.4f}")
# hill_at_mean = (np.mean(protein1_samples) ** n) / (K**n + np.mean(protein1_samples) ** n)
# print(f"Hill(mean protein1): {hill_at_mean:.4f}")


# # from functools import partial
# # from scipy.integrate import solve_ivp

# # wrapped_ode = partial(grn_ode, params=full_param_dict, p_on_eff=p_on_eff_gene_2)
# # sol = solve_ivp(wrapped_ode, [0, 1000], y0, method='RK45', dense_output=True)


## Generate initial conditions from the low-res simulation


In [11]:
def generate_initial_condition(path_to_low_res_sim, species_index):
    sim = pd.read_csv(path_to_low_res_sim)
    max_time = sim['timestep'].max()
    final_snapshot = sim[sim['timestep'] == max_time]

    # Rename columns to match species_index
    sim = final_snapshot.rename(columns={
        "gene_1_is_bursting": "gene_1_A",
        "gene_1_unspliced_mRNA": "gene_1_mRNA",
        "gene_2_is_bursting": "gene_2_A",
        "gene_2_unspliced_mRNA": "gene_2_mRNA"
    })

    init_conditions = []

    for _, row in sim.iterrows():
        state = np.zeros(len(species_index))

        for name, idx in species_index.items():
            if name.endswith("_I"):  # Infer inactive state as 1 - active
                active_name = name.replace("_I", "_A")
                state[idx] = 1 - row[active_name]
            else:
                state[idx] = row[name]

        init_conditions.append(state)

    return init_conditions


## Running the code


In [12]:
path_to_matrix = "/home/mzo5929/Keerthana/grnInference/simulation_data/general_simulation_data/test_data/matrix101.txt"
n_genes, interaction_matrix = read_input_matrix(path_to_matrix)
interaction_matrix

array([[0, 1],
       [0, 0]], dtype=int32)

In [13]:
reactions_df, gene_list = generate_reaction_network_from_matrix(interaction_matrix=interaction_matrix)

In [14]:
init_states = generate_initial_state_from_genes(gene_list=gene_list)

In [15]:
param_path = "/home/mzo5929/Keerthana/grnInference/simulation_data/general_simulation_data/test_data/parameter_sheet_gillespie.csv"
rows = [12, 13]
param_dict, param_df = assign_parameters_to_genes(param_path, gene_list, rows)

In [16]:
p_add_matrix = np.array([
    [0.0, 2.0],  # gene_1 effects
    [0, 0]  # gene_2 effects
])

n_matrix = np.array([
    [0.0, 4.4118197399108965],  # gene_1 effects
    [0, 0]  # gene_2 effects
])

steady_state, full_param_dict = add_interaction_terms(param_dict, interaction_matrix, gene_list, p_add_matrix=p_add_matrix, n_matrix = n_matrix)

{'{p_on_gene_1}': 0.4023188424412684, '{p_off_gene_1}': 69.57850258396365, '{p_prod_protein_gene_1}': 0.0297124846722611, '{p_prod_mRNA_gene_1}': 2080.5819902587614, '{p_deg_mRNA_gene_1}': 0.1395319074418133, '{p_deg_protein_gene_1}': 0.03957606697815643, '{p_on_gene_2}': 0.2127851341068382, '{p_off_gene_2}': 31.93368390579621, '{p_prod_protein_gene_2}': 0.1460875043611262, '{p_prod_mRNA_gene_2}': 2531.8584037063533, '{p_deg_mRNA_gene_2}': 0.12492856661168084, '{p_deg_protein_gene_2}': 0.007806136040099727, '{n_gene_1_to_gene_2}': 4.4118197399108965, '{p_add_gene_1_to_gene_2}': 2.0}
gene_1 gene_1_to_gene_2 2.0 1 1.0


In [17]:
full_param_dict

{'{p_on_gene_1}': 0.4023188424412684,
 '{p_off_gene_1}': 69.57850258396365,
 '{p_prod_protein_gene_1}': 0.0297124846722611,
 '{p_prod_mRNA_gene_1}': 2080.5819902587614,
 '{p_deg_mRNA_gene_1}': 0.1395319074418133,
 '{p_deg_protein_gene_1}': 0.03957606697815643,
 '{p_on_gene_2}': 0.2127851341068382,
 '{p_off_gene_2}': 31.93368390579621,
 '{p_prod_protein_gene_2}': 0.1460875043611262,
 '{p_prod_mRNA_gene_2}': 2531.8584037063533,
 '{p_deg_mRNA_gene_2}': 0.12492856661168084,
 '{p_deg_protein_gene_2}': 0.007806136040099727,
 '{n_gene_1_to_gene_2}': 4.4118197399108965,
 '{p_add_gene_1_to_gene_2}': 2.0,
 '{k_gene_1_to_gene_2}': 64.35895431409467}

In [18]:
population_0, update_matrix, update_propensity_func_string, species_index = setup_gillespie_params_from_reactions(init_states, reactions_df, full_param_dict)
exec(update_propensity_func_string)

In [19]:
path_to_low_res_sim = '/home/mzo5929/Keerthana/grnInference/simulation_data/general_simulation_data/test/simulation_matrix101_A_B_12_13_fifteen_second_step_no_splicing_500h.csv'
population_init_list = generate_initial_condition(path_to_low_res_sim, species_index)

In [20]:
df = pd.read_csv(path_to_low_res_sim)
df['cell_id'].unique()

array([   0,    1,    2, ..., 1997, 1998, 1999])

In [23]:
# Seed random number generator for reproducibility
np.random.seed(42)
n_gene = len(gene_list)
n_cells = 20000
time_duration = 300 #in hours
time_points = np.arange(0, time_duration, 1)
# Initialize output array
# samples = np.empty((n_cells, len(time_points), n_gene*4), dtype=int)

# Run the calculations
# for i in tqdm(range(n_cells)):
#     samples[i, :, :] = gillespie_ssa(update_propensities, update_matrix, population_0, time_points)
    


# Your original variables
# samples: shape (n_cells, len(time_points), num_species)
samples = np.zeros((n_cells, len(time_points), population_0.shape[0]))

# def run_simulation(i):
#     return gillespie_ssa(update_propensities, update_matrix, population_0, time_points)

# # Run in parallel with tqdm
# results = Parallel(n_jobs=5)(
#     delayed(run_simulation)(i) for i in tqdm(range(n_cells))
# )
# # # Store results into the samples array
# for i, res in enumerate(results):
#     samples[i, :, :] = res

def run_simulation(i, pop0):
    return gillespie_ssa(update_propensities, update_matrix, pop0, time_points)

print(f"population_init_list length: {len(population_init_list)}, n_cells: {n_cells}")
if len(population_init_list) > n_cells:
    population_init_list = random.sample(population_init_list, n_cells)
print(f"population_init_list length: {len(population_init_list)}, n_cells: {n_cells}")

results = Parallel(n_jobs=8)(
    delayed(run_simulation)(i, pop0) for i, pop0 in tqdm(enumerate(population_init_list), total=n_cells)
)

for i, res in enumerate(results):
    samples[i, :, :] = res

all_samples = extract_mrna_protein(samples, species_index)


#code to reshape 3D to 2D and output + input
samples_reshaped = samples.reshape(samples.shape[0], -1)

# === Setup ===
now = datetime.now()
timestamp_str = now.strftime("%Y%m%d_%H%M%S")
short_id = uuid.uuid4().hex[:8]
row_str = "_".join(map(str, rows))
prefix = f"steady_state_init_15s_after_500h_{n_cells}_cells_{time_duration}h_timepoints_rows_{row_str}_{timestamp_str}_{short_id}"

base_path = "/home/mzo5929/Keerthana/grnInference/simulation_data/gillespie_simulation/test"
jsonl_path = os.path.join(base_path, "simulation_metadata.jsonl")  # single file to append to

# === File paths ===
df_path = os.path.join(base_path, f"df_simulation_{prefix}.csv")
samples_path = os.path.join(base_path, f"samples_simulation_{prefix}.csv")

# === Save reshaped samples ===
np.savetxt(samples_path, samples_reshaped, delimiter=",")

# === Save DataFrame ===
all_samples.to_csv(df_path, index=False)

# === Prepare metadata record ===
record = {
    "id": short_id,
    "timestamp": now.isoformat(),
    "param_dict": param_dict,
    "interaction_matrix": interaction_matrix.tolist(),
    "steady_state": steady_state.tolist() if hasattr(steady_state, "tolist") else steady_state,
    "df_path": df_path,
    "samples_path": samples_path
}

# === Append to JSONL file ===
with open(jsonl_path, 'a') as f:
    f.write(json.dumps(record) + '\n')

population_init_list length: 10000, n_cells: 20000
population_init_list length: 10000, n_cells: 20000


  0%|          | 0/20000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [51]:
df_path

'/home/mzo5929/Keerthana/grnInference/simulation_data/gillespie_simulation/test/df_simulation_steady_state_init_15s_after_500h_10000_cells_200h_timepoints_rows_12_13_20250704_171325_7b660e28.csv'