In [2]:

%%writefile Dotplot.py 
import argparse
import multiprocessing
import matplotlib.pyplot as plt
import numpy as np
from Bio import SeqIO

def merge_sequences_from_fasta(file_path):
    sequences = [str(record.seq) for record in SeqIO.parse(file_path, "fasta")]
    return "".join(sequences)

def generate_dotplot_seq(seq1, seq2):
    matrix = (seq1[:, None] == seq2).astype(np.int8)

    plt.imshow(matrix, cmap='Greys', aspect='auto', extent=[0, len(seq2), len(seq1), 0], origin='upper')
    plt.xlabel('Sequence 2')
    plt.ylabel('Sequence 1')
    plt.show()

def generate_dotplot_multiprocessing(seq1, seq2, num_processes):
    len_seq1 = len(seq1)
    len_seq2 = len(seq2)
    dotplot_matrix = np.zeros((len_seq1, len_seq2), dtype=bool)

    def worker(start, end):
        for i in range(start, end):
            for j in range(len_seq2):
                if seq1[i] == seq2[j]:
                    dotplot_matrix[i, j] = 1

    processes = []
    block_size = len_seq1 // num_processes

    for i in range(num_processes):
        start = i * block_size
        end = start + block_size if i < num_processes - 1 else len_seq1
        p = multiprocessing.Process(target=worker, args=(start, end))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()

    generate_dotplot_seq(seq1, seq2)

def main():
    parser = argparse.ArgumentParser(description="Dotplot Generator")
    parser.add_argument("file1", help="Path to the first FASTA file")
    parser.add_argument("file2", help="Path to the second FASTA file")
    parser.add_argument("--multiprocessing", action="store_true", help="Use multiprocessing")

    args = parser.parse_args()

    seq1 = np.fromiter(merge_sequences_from_fasta(args.file1), dtype=np.dtype('U1'))
    seq2 = np.fromiter(merge_sequences_from_fasta(args.file2), dtype=np.dtype('U1'))

    if args.multiprocessing:
        num_processes = multiprocessing.cpu_count()
        generate_dotplot_multiprocessing(seq1, seq2, num_processes)
    else:
        generate_dotplot_seq(seq1, seq2)

if __name__ == "__main__":
    main()


Writing Dotplot.py


In [1]:
!python Dotplot.py Data\Chimpanzee.fa Data\Homosapiens.fa --multiprocessing

Traceback (most recent call last):
  File "Dotplot.py", line 63, in <module>
    main()
  File "Dotplot.py", line 58, in main
    generate_dotplot_multiprocessing(seq1, seq2, num_processes)
  File "Dotplot.py", line 38, in generate_dotplot_multiprocessing
    p.start()
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\multiprocessing\process.py", line 121, in start
    self._popen = self._Popen(self)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\multiprocessing\context.py", line 224, in _Popen
    return _default_context.get_context().Process._Popen(process_obj)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\multiprocessing\context.py", line 327, in _Popen
    return Popen(process_obj)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\multiprocessing\pope

In [9]:
import numpy as np
import multiprocessing
import matplotlib.pyplot as plt

def read_fasta_file(fasta_file):
    sequences = []
    with open(fasta_file, "r") as f:
        for line in f:
            if line.startswith(">"):
                continue
            sequences.append(line.strip())
    return sequences

def worker(sequence_pair, dotplot_matrix):
    sequence1, sequence2 = sequence_pair
    result = np.fromiter((x == sequence2 for x in sequence1), dtype=np.int8)
    dotplot_matrix += result

def make_dotplot(species1_sequences, species2_sequences):
    length1 = len(species1_sequences[0])
    length2 = len(species2_sequences[0])
    manager = multiprocessing.Manager()
    dotplot_matrix = manager.list(np.zeros((length1, length2)))
    sequence_pairs = [(sequence1, sequence2) for sequence1, sequence2 in zip(species1_sequences, species2_sequences)]
    
    with multiprocessing.Pool() as pool:
        pool.starmap(worker, zip(sequence_pairs, [dotplot_matrix] * len(sequence_pairs)))
    
    dotplot_matrix = np.array(dotplot_matrix).reshape(length1, length2)
    
    return dotplot_matrix

def main(file1, file2, output_file):
    species1_sequences = read_fasta_file(file1)
    species2_sequences = read_fasta_file(file2)
    dotplot_matrix = make_dotplot(species1_sequences, species2_sequences)
    np.savetxt(output_file, dotplot_matrix, fmt="%d")

# Rutas de los archivos FASTA
file1 = "Data/Chimpanzee.fa"
file2 = "Data/Homosapiens.fa"

# Archivo de salida para el dotplot
output_file = "Data/dotplot.txt"

# Llamada a la función principal
main(file1, file2, output_file)

In [3]:
import numpy as np
import multiprocessing
from Bio import SeqIO

def read_fasta_file(fasta_file):
    # Lee un archivo FASTA y devuelve una lista de secuencias
    sequences = []
    with open(fasta_file, "r") as f:
        for record in SeqIO.parse(f, "fasta"):
            sequences.append(str(record.seq))
    
    return sequences

def worker(sequence_pair):
    sequence1, sequence2 = sequence_pair
    length1 = len(sequence1)
    length2 = len(sequence2)
    dotplot_matrix = np.zeros((length1, length2), dtype=np.int8)
    
    for i in range(length1):
        for j in range(length2):
            if sequence1[i] == sequence2[j]:
                dotplot_matrix[i, j] = 1
    
    return dotplot_matrix

def make_dotplot_parallel(file1, file2):
    sequences1 = read_fasta_file(file1)
    sequences2 = read_fasta_file(file2)
    sequence_pairs = [(sequence1, sequence2) for sequence1 in sequences1 for sequence2 in sequences2]

    with multiprocessing.Pool() as pool:
        results = pool.map(worker, sequence_pairs)

    dotplot_matrix = sum(results)
    
    return dotplot_matrix

# Ejemplo de uso
fasta_file1 = "Data/Chimpanzee2.fa"  # Ruta del primer archivo FASTA
fasta_file2 = "Data/Homosapiens.fa"  # Ruta del segundo archivo FASTA

dotplot_matrix = make_dotplot_parallel(fasta_file1, fasta_file2)
print(dotplot_matrix)

In [3]:
%pip install tqm

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement tqm (from versions: none)
ERROR: No matching distribution found for tqm

[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: C:\Users\Heosve\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [6]:
import multiprocessing
import psutil
from Bio import SeqIO
import time





# Obtener el número de núcleos lógicos disponibles
num_cores = multiprocessing.cpu_count()

print("Número de núcleos lógicos disponibles:", num_cores)

begin = time.time()
def divide_fasta_string(fasta_string, num_chunks):
    records_per_chunk = (len(fasta_string) + num_chunks - 1) // num_chunks
    chunks = [fasta_string[i:i+records_per_chunk] for i in range(0, len(fasta_string), records_per_chunk)]
    return chunks

def worker(chunk1, chunk2):
    matrix = np.zeros((len(chunk1), len(chunk2)), dtype=np.int8)
    for i in range(len(chunk1)):
        for j in range(len(chunk2)):
            if chunk1[i] == chunk2[j]:
                matrix[i, j] = 1
    return matrix

def fasta_file_to_string(file_path):
    with open(file_path, "r") as file:
        fasta_string = "".join(line.strip() for line in file if not line.startswith(">"))
    return fasta_string

def save_string_to_file(string, file_path):
    with open(file_path, "w") as file:
        file.write(string)
        
        
def generate_dotplot_parallel(fasta_string1, fasta_string2):
    max_memory = psutil.virtual_memory().available
    chunk_size = 100 * 1024 * 1024  # 100 MB por chunk
    num_chunks = max_memory // chunk_size
    chunks1 = divide_fasta_string(fasta_string1, num_chunks)
    chunks2 = divide_fasta_string(fasta_string2, num_chunks)

    with multiprocessing.Pool() as pool:
        results = []
        for chunk1 in chunks1:
            for chunk2 in chunks2:
                result = pool.apply_async(worker, (chunk1, chunk2))
                results.append(result.get())

    dotplot_matrix = np.concatenate(results)
    return dotplot_matrix


# Rutas de los archivos FASTA
fasta_file1 = "Data/E_coli.fna"
fasta_file2 = "Data/Salmonella.fna"

# Convertir los archivos FASTA a strings
fasta_string1 = fasta_file_to_string(fasta_file1)
fasta_string2 = fasta_file_to_string(fasta_file2)

# Guardar los strings en archivos
output_file1 = "fasta1.txt"
output_file2 = "fasta2.txt"
save_string_to_file(fasta_string1, output_file1)
save_string_to_file(fasta_string2, output_file2)

# Generar el dotplot en paralelo
begin = time.time()
dotplot_matrix = generate_dotplot_parallel(fasta_string1, fasta_string2)
execution_time = time.time() - begin

# Graficar el dotplot
plt.imshow(dotplot_matrix, cmap='Greys', aspect='auto', origin='upper')
plt.xlabel('Sequence 2')
plt.ylabel('Sequence 1')
plt.show()

print(f"\n El código se ejecutó en: {time.time() - begin} segundos")

Número de núcleos lógicos disponibles: 8


: 

In [1]:
import multiprocessing
import matplotlib.pyplot as plt
import numpy as np
from Bio import SeqIO

def calculate_dotplot(sequence1, sequence2):
    length1 = len(sequence1)
    length2 = len(sequence2)
    dotplot_matrix = np.zeros((length1, length2), dtype=np.int8)
    
    # Crear diccionario para indexar elementos de sequence2
    sequence2_dict = {element: i for i, element in enumerate(sequence2)}
    
    # Recorrer sequence1 y verificar presencia en sequence2_dict
    for i in range(length1):
        for j in range(length2):
            if sequence1[i] == sequence2_dict.get(sequence1[i]):
                dotplot_matrix[i, j] = 1
    
    return dotplot_matrix

def make_dotplot_parallel(species1_fragments, species2_fragments):
    sequence_pairs = [(sequence1, sequence2) for sequence1 in species1_fragments for sequence2 in species2_fragments]
    
    with multiprocessing.Pool() as pool:
        dotplot_matrices = pool.map(calculate_dotplot, sequence_pairs)
    
    dotplot_matrix = np.sum(dotplot_matrices, axis=0)
    
    return dotplot_matrix

def load_sequences(file_path):
    sequences = []
    with open(file_path, "r") as file:
        for record in SeqIO.parse(file, "fasta"):
            sequence = str(record.seq)
            sequences.append(sequence)
    return sequences

def split_sequences(sequences, fragment_size):
    fragmented_sequences = []
    for sequence in sequences:
        fragmented_sequence = [sequence[i:i+fragment_size] for i in range(0, len(sequence), fragment_size)]
        fragmented_sequences.extend(fragmented_sequence)
    return fragmented_sequences

def main(file1, file2):
    # Cargar las secuencias de los archivos FASTA
    species1_sequences = load_sequences(file1)
    species2_sequences = load_sequences(file2)

    # Dividir las secuencias en fragmentos más pequeños
    fragment_size = 1000
    species1_fragments = split_sequences(species1_sequences, fragment_size)
    species2_fragments = split_sequences(species2_sequences, fragment_size)

    # Calcular el dotplot en paralelo
    dotplot_matrix = make_dotplot_parallel(species1_fragments, species2_fragments)

    # Continuar con la visualización o el análisis del dotplot...

# Rutas de los archivos FASTA
file1 = "Data/Chimpanzee.fa"
file2 = "Data/Homosapiens.fa"

# Llamada a la función principal
main(file1, file2)

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from Bio import SeqIO
from numba import njit
from multiprocessing import Pool, cpu_count

def merge_sequences_from_fasta(file_path):
    sequences = []  # List to store all sequences
    for record in SeqIO.parse(file_path, "fasta"):
        # `record.seq` gives the sequence
        sequences.append(str(record.seq))
    return "".join(sequences)

@njit
def generate_dotplot_seq(seq1, seq2, start1, end1, start2, end2, matrix):
    for i in range(start1, end1):
        for j in range(start2, end2):
            if seq1[i] == seq2[j]:
                matrix[i - start1, j - start2] = 1

def process_section(args):
    seq1, seq2, start1, end1, start2, end2 = args
    matrix = np.zeros((end1 - start1, end2 - start2), dtype=np.uint8)
    generate_dotplot_seq(seq1, seq2, start1, end1, start2, end2, matrix)
    return matrix

# Rutas de archivo
file_path_1 = "Data/E_coli.fna"
file_path_2 = "Data/Salmonella.fna"

# Cargar secuencias
seq1 = merge_sequences_from_fasta(file_path_1)
seq2 = merge_sequences_from_fasta(file_path_2)

# Definir el número de secciones en las que se dividirán las secuencias
num_sections = 1000
# Calcular el tamaño de cada sección
section_size1 = len(seq1) // num_sections
section_size2 = len(seq2) // num_sections

# Crear argumentos para cada sección
args_list = []
for i in range(num_sections):
    start1 = i * section_size1
    end1 = (i + 1) * section_size1
    start2 = i * section_size2
    end2 = (i + 1) * section_size2
    args_list.append((seq1, seq2, start1, end1, start2, end2))

# Procesar cada sección de las secuencias en paralelo utilizando 8 procesos
with Pool(processes=4) as pool:
    matrices = pool.map(process_section, args_list)

# Mostrar los dotplots de cada sección
for i, matrix in enumerate(matrices):
    plt.imshow(matrix, cmap='Greys', aspect='auto')
    plt.xlabel('Sequence 2')
    plt.ylabel('Sequence 1')
    plt.title('Part {}'.format(i + 1))
    plt.show()

In [6]:
import matplotlib.pyplot as plt
import numpy as np
from Bio import SeqIO
from Bio import pairwise2

def compare_sequences(seq1, seq2):
    alignments = pairwise2.align.globalxx(seq1, seq2)
    best_alignment = alignments[0]
    
    aligned_seq1 = best_alignment.seqA
    aligned_seq2 = best_alignment.seqB
    
    return aligned_seq1, aligned_seq2

def generate_dotplot(seq1, seq2):
    matrix = np.zeros((len(seq1), len(seq2)), dtype=np.uint8)
    
    for i in range(len(seq1)):
        for j in range(len(seq2)):
            if seq1[i] == seq2[j]:
                matrix[i, j] = 1
    
    plt.imshow(matrix, cmap='Greys', aspect='auto')
    plt.xlabel('Sequence 2')
    plt.ylabel('Sequence 1')
    plt.title('Dotplot')
    plt.show()

# Rutas de archivo
file_path_1 = "Data/E_coli.fna"
file_path_2 = "Data/Salmonella.fna"

# Leer la primera secuencia desde los archivos FASTA
seq1 = str(next(SeqIO.parse(file_path_1, "fasta")).seq)
seq2 = str(next(SeqIO.parse(file_path_2, "fasta")).seq)

# Comparar secuencias y generar el dotplot
aligned_seq1, aligned_seq2 = compare_sequences(seq1, seq2)
generate_dotplot(aligned_seq1, aligned_seq2)

MemoryError: Out of memory

In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
from Bio import SeqIO
from multiprocessing import Pool

def merge_sequences_from_fasta(file_path):
    sequences = []  # List to store all sequences
    for record in SeqIO.parse(file_path, "fasta"):
        # `record.seq` gives the sequence
        sequences.append(str(record.seq))
    return "".join(sequences)

def generate_dotplot_seq(seq1, seq2, start1, end1, start2, end2, matrix):
    for i in range(start1, end1):
        for j in range(start2, end2):
            if seq1[i] == seq2[j]:
                matrix[i - start1, j - start2] = 1
                
def process_section(section_index):
    start1 = section_index * section_size1
    end1 = (section_index + 1) * section_size1
    start2 = section_index * section_size2
    end2 = (section_index + 1) * section_size2
    
    matrix = np.zeros((end1 - start1, end2 - start2), dtype=np.uint8)
    generate_dotplot_seq(seq1, seq2, start1, end1, start2, end2, matrix)
    
    plt.imshow(matrix, cmap='Greys', aspect='auto')
    plt.xlabel('Sequence 2')
    plt.ylabel('Sequence 1')
    plt.title('Part {}'.format(section_index + 1))
    
    # Save the image in the output folder with a unique filename
    filename = os.path.join(output_folder, 'dotplot_{}.png'.format(section_index + 1))
    plt.savefig(filename)
    
    # Clear the current figure to avoid overlapping plots
    plt.clf()
    plt.close()

# Folder to save the dot plot images
output_folder = "imagesgonorrea"
os.makedirs(output_folder, exist_ok=True)

# Rutas de archivo
file_path_1 = "Data/E_coli.fna"
file_path_2 = "Data/Salmonella.fna"

# Cargar secuencias
seq1 = merge_sequences_from_fasta(file_path_1)
seq2 = merge_sequences_from_fasta(file_path_2)

# Definir el número de secciones en las que se dividirán las secuencias
num_sections = 1000
# Calcular el tamaño de cada sección inicial
section_size1 = len(seq1) // num_sections
section_size2 = len(seq2) // num_sections

# Crear una pool de procesos con el número de procesos deseados
pool = Pool(processes=4)

# Procesar cada sección de las secuencias en paralelo
pool.map(process_section, range(num_sections))

pool.close()
pool.join()

print("Dot plot images saved in the '{}' folder.".format(output_folder))
