# Generador de secuencias con k-meros predefinidos

In [1]:
import random
import numpy as np
import os, time

# Step 1: Define the character set
def get_character_set(use_strings=False, alphabet="ACGT"):
    """
    Select the pool of symbols that sequences are drawn from.

    use_strings: when falsy, the fixed nucleotide string "ACGT" is returned
        and `alphabet` is ignored; when truthy, `alphabet` is returned
        unchanged (same object, no copy).
    alphabet: caller-supplied symbols, e.g. a string of characters or a
        list of multi-character strings. Only consulted when use_strings
        is truthy.
    """
    default_nucleotides = "ACGT"
    return alphabet if use_strings else default_nucleotides

# Step 2: Generate random characters or strings with different distributions
def generate_random_content(character_set, length, distribution="uniform", params=None):
    """
    Draw `length` elements from `character_set` and concatenate them.

    Note that `length` is the number of elements drawn, not the number of
    characters produced: if the character set holds multi-character strings
    the returned string is longer than `length`.

    character_set: non-empty indexable collection of strings (a plain
        string such as "ACGT" or a list such as ["A", "CG", "TTT"]).
    length: number of elements to draw.
    distribution: how element probabilities are obtained:
        - "uniform":  every element is equally likely; `params` is unused.
        - "weighted": params["weights"] holds one non-negative weight per
                      element; weights are normalized to sum to 1.
        - "normal":   params["mean"] and params["std"] describe a Gaussian
                      curve over the element positions 0..n-1; the curve
                      values are normalized to sum to 1.
        - "custom":   params["probabilities"] holds one non-negative value
                      per element; values are normalized to sum to 1.
    params: dict with the keys required by the chosen distribution.

    Returns the concatenation of the drawn elements as a single string.

    Raises ValueError when the character set is empty, the distribution
    name is unknown, required parameters are missing, or the parameters do
    not describe a valid probability vector.

    Sampling uses NumPy's global random state (np.random.choice), so call
    np.random.seed beforehand if reproducible output is needed.
    """
    num_elements = len(character_set)
    if num_elements == 0:
        # Fail early with a clear message instead of a ZeroDivisionError or
        # an opaque NumPy error further down.
        raise ValueError("character_set must contain at least one element.")

    if distribution == "uniform":
        # Uniform: Equal probability for all elements
        probabilities = np.full(num_elements, 1 / num_elements)

    elif distribution == "weighted":
        # Weighted: Use weights provided in params
        if not params or "weights" not in params:
            raise ValueError("Weights must be provided for 'weighted' distribution.")
        probabilities = _normalize_probabilities(params["weights"], num_elements, "weights")

    elif distribution == "normal":
        # Normal distribution: Gaussian curve evaluated at each element index
        if not params or "mean" not in params or "std" not in params:
            raise ValueError("Mean and standard deviation must be provided for 'normal' distribution.")
        mean = params["mean"]
        std = params["std"]
        if std == 0:
            # A zero std would divide by zero and produce NaN probabilities.
            raise ValueError("Standard deviation must be non-zero for 'normal' distribution.")
        positions = np.linspace(0, num_elements - 1, num_elements)
        density = np.exp(-0.5 * ((positions - mean) / std) ** 2)
        total = density.sum()
        if not np.isfinite(total) or total <= 0:
            # Happens when the curve underflows to 0 at every position
            # (mean far outside 0..n-1) or when mean/std are not finite.
            raise ValueError(
                "Mean and standard deviation give no usable probability mass "
                "over the character set."
            )
        probabilities = density / total

    elif distribution == "custom":
        # Custom probabilities: Provided explicitly
        if not params or "probabilities" not in params:
            raise ValueError("Explicit probabilities must be provided for 'custom' distribution.")
        probabilities = _normalize_probabilities(
            params["probabilities"], num_elements, "probabilities"
        )

    else:
        raise ValueError(f"Unsupported distribution type: {distribution}")

    # Draw element indices according to the probabilities, then concatenate
    # the selected elements in draw order.
    indices = np.random.choice(num_elements, size=length, p=probabilities)
    return ''.join(character_set[i] for i in indices)


def _normalize_probabilities(values, num_elements, label):
    """Return `values` as a 1-D float array scaled to sum to 1, or raise ValueError."""
    probabilities = np.array(values, dtype=float)
    if probabilities.shape != (num_elements,):
        raise ValueError(
            f"'{label}' must contain exactly {num_elements} values "
            f"(one per element of the character set); got shape {probabilities.shape}."
        )
    if not np.all(np.isfinite(probabilities)) or np.any(probabilities < 0):
        raise ValueError(f"'{label}' must be finite and non-negative.")
    total = probabilities.sum()
    if total <= 0:
        raise ValueError(f"'{label}' must have a positive sum.")
    return probabilities / total

# Step 3: Save characters to a FASTA file
def save_to_fasta(content, filename, output_dir):
    """
    Write `content` as a single-record FASTA file and return its path.

    The file consists of the header line ">sequence_1" followed by
    `content` wrapped at 80 characters per line. An existing file with the
    same name is overwritten.

    content: the sequence string to write (may be empty, in which case only
        the header line is written).
    filename: name of the file to create inside `output_dir`.
    output_dir: directory to write into; it is created (including parents)
        if missing. An empty string means the current working directory.

    Returns the path of the written file (output_dir joined with filename).
    Also prints that path to stdout.
    """
    # os.makedirs("") raises, so only create the directory when one is
    # actually named. exist_ok=True replaces the racy exists-then-create
    # check.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    file_path = os.path.join(output_dir, filename)
    line_width = 80

    with open(file_path, 'w') as file:
        file.write(">sequence_1\n")  # Header for the sequence (you can adjust the name)
        # Wrap the sequence into fixed-width lines.
        file.writelines(
            content[start:start + line_width] + "\n"
            for start in range(0, len(content), line_width)
        )

    print(f"FASTA file saved at: {file_path}")
    return file_path

# Step 4: Pipeline function
def random_fasta_pipeline(
    length=100,
    filename="random_dna_sequence.fasta",
    distribution="uniform",
    use_strings=False,
    alphabet="ACGT",
    params=None,
    output_dir="/files/Mariel/Tesis_Mariel/data/sequences/simulated_seq/based_on_real",
):
    """
    Generate a random sequence and save it as a FASTA file.

    length: number of elements to draw from the character set.
    filename: name of the FASTA file to create inside `output_dir`.
    distribution: distribution name forwarded to generate_random_content.
    use_strings: when truthy, `alphabet` is used as the character set;
        otherwise the fixed set "ACGT" is used and `alphabet` is ignored.
    alphabet: caller-supplied symbols, used only when use_strings is truthy.
    params: distribution parameters forwarded to generate_random_content.
    output_dir: directory the file is written to. Defaults to the location
        this function originally hard-coded, so existing calls behave as
        before; pass another directory to write elsewhere.

    Returns the path reported by save_to_fasta for the written file.
    """
    character_set = get_character_set(use_strings, alphabet)
    random_content = generate_random_content(character_set, length, distribution, params)
    return save_to_fasta(random_content, filename, output_dir)


In [None]:
%%time
# Example
# Neisseria_gonorrhoeae data: the symbols sequences are assembled from
# (entries of one, two and three characters).
neiss_k4_k3_k2_k1_meros = [
    'G', 'C', 'A', 'T',
    'CG', 'GC', 'AA', 'TT',
    'AAA', 'TTT', 'CGG', 'CCG',
]

if __name__ == "__main__":
    # Disabled example - uniform distribution over a predefined symbol list
    # (pseudo_k4_k3_k2_meros is not defined in this cell):
    # random_fasta_pipeline(length=100000, filename="dna_pseudo_4_3_2_mers_uniform.fasta", distribution="uniform", use_strings=True, alphabet=pseudo_k4_k3_k2_meros)

    # Weighted distribution, predefined strings.
    # The weights are positional: one per entry of neiss_k4_k3_k2_k1_meros,
    # in the same order.
    random_fasta_pipeline(
        length=2000000,
        filename="dna_neiss_k4_k3_k2_k1_meros_weighted.fasta",
        distribution="weighted",
        use_strings=True,
        alphabet=neiss_k4_k3_k2_k1_meros,
        params={
            "weights": [
                0.262570, 0.261989, 0.239393, 0.236048,
                0.091148, 0.087841, 0.085057, 0.082733,
                0.032736, 0.031258, 0.028079, 0.027994,
            ]
        },
    )

    # Disabled example - custom probabilities over the default alphabet:
    # random_fasta_pipeline(length=50, filename="custom_codons.fasta", distribution="custom", use_strings=True, params={"probabilities": [0.25, 0.25, 0.4, 0.1]})
