In [161]:
import numpy as np
import random
from scipy import stats
import pandas as pd

node_sizes = {
    'A': 196,
    'B': 40,
    'C': 1050,
    'D': 82,
    'E': 683,
    'F': 179,
    'Fp': 91,
    'H': 447,
    'I': 204,
    'J': 85,
    'K': 445,
    'Kp': 112,
    'L': 159,
    'M': 754,
    'Mp': 104,
    'N': 166,
    'O': 86,
    'P': 184,
    'Q': 65,
    'R': 85,
    'S': 205,
    'T': 279,
    'U': 354,
    'V': 161
}

In [162]:
def load_and_clean_tsv(file_path):
    """
    Reads a TSV file, loads it into a pandas DataFrame, and removes rows and columns
    named 'Start' and 'End'.

    Parameters:
    file_path (str): The path to the TSV file.

    Returns:
    pd.DataFrame: Cleaned DataFrame without 'Start' and 'End' rows and columns.
    """
    try:
        # Load the TSV into a DataFrame
        df = pd.read_csv(file_path, sep='\t', index_col=0)  # Set the first column as index

        # Drop rows and columns named 'Start' and 'End'
        df = df.drop(index=['Start', 'End'], columns=['Start', 'End'], errors='ignore')

        return df
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
adjacency_df = load_and_clean_tsv("/home/jpereira/OEs/OE1/Data_output/adjacency_matrix_new.tsv")

def normalize_adjacency_matrix(adjacency_df):
    adjacency_df = adjacency_df.copy()
    # Identify dangling nodes (rows that sum to zero)
    row_sums = adjacency_df.sum(axis=1)
    dangling_nodes = row_sums == 0
    # For dangling nodes, add a self-loop
    adjacency_df.loc[dangling_nodes, :] = 0
    adjacency_df.loc[dangling_nodes, dangling_nodes] = 1
    # Recalculate the row sums
    row_sums = adjacency_df.sum(axis=1)
    # Normalize each row
    transition_df = adjacency_df.div(row_sums, axis=0)
    return transition_df


In [163]:
#pd.read_csv('/home/jpereira/OEs/OE1/Data_output/bloc_sizes.tsv', sep='\t')

In [190]:
def tiras_por_random_walks( N: int, bases_size: int):

    transition_df = normalize_adjacency_matrix(adjacency_df)

    # Step 2: Convert node labels to regular strings
    nodes = [str(node) for node in adjacency_df.index.tolist()]

    # Filter nodes to those whose sizes are less than or equal to bases_size
    valid_start_nodes = [node for node in nodes if node_sizes[node] <= bases_size]

    random_walks = []

    for _ in range(N):
        # Randomly choose a starting node
        if not valid_start_nodes:
            print("No valid starting nodes with size <= bases_size.")
            break

        current_node = str(np.random.choice(valid_start_nodes))
        cumulative_size = node_sizes[current_node]
        random_walk = [current_node]

        while True:
            # Get the transition probabilities for the current node
            probs = transition_df.loc[current_node]
            # Ensure the probabilities and nodes are aligned
            probs.index = [str(node) for node in probs.index]

            # Filter possible next nodes based on size constraint
            possible_next_nodes = [
                node for node in probs.index if cumulative_size + node_sizes[node] <= bases_size
            ]
            if not possible_next_nodes:
                break  # No valid next nodes; end the random walk

            # Get probabilities for the possible next nodes
            next_probs = probs[possible_next_nodes]
            total_prob = next_probs.sum()
            if total_prob == 0 or np.isnan(total_prob):
                # Cannot proceed; terminate the random walk
                break  # No valid transitions with positive probability

            # Normalize probabilities
            next_probs = next_probs / total_prob
            # Choose the next node based on the probabilities
            next_node = str(np.random.choice(possible_next_nodes, p=next_probs.values))
            # Append the next node as a regular string
            random_walk.append(next_node)
            # Update cumulative size
            cumulative_size += node_sizes[next_node]
            # Update current node
            current_node = next_node

        random_walks.append(random_walk)

    # Step 4: Output to a file
    #output_filename = 'random_walks.txt'
    #with open(output_filename, 'w') as f:
    #    for walk in random_walks:
    #        f.write(' '.join(walk) + '\n')

    #print(f"{len(random_walks)} random walks generated and saved to {output_filename}.")
    
    return(random_walks)

N = 20000       # Number of random walks to generate
bases_size = 30000  # Maximum cumulative size for each walk
random_walks = tiras_por_random_walks(N, bases_size=bases_size)

In [191]:
read_sizes_df = pd.read_csv('/home/jpereira/OEs/OE1/Data_output/block_size_reads.tsv', sep='\t')

def generar_reads(tira: list, lista_read_sizes: list, circulo: bool, read_number: int, output_path: str):
    with open(output_path, 'w') as f:
        for i in range(read_number):
            read_size = random.choices(lista_read_sizes, k=1)[0]
            pos_inicial = random.randint(0, len(tira) - 1)

            if len(tira) - pos_inicial < read_size:
                if not circulo:
                    pos_final = len(tira)
                    read = tira[pos_inicial:pos_final]
                else:
                    faltan = int(read_size - (len(tira) - pos_inicial))
                    subread1 = tira[pos_inicial:len(tira)]
                    #print(faltan)
                    #print(type(faltan))
                    subread2 = tira[0:faltan]
                    read = subread1 + subread2
            else:
                read = tira[pos_inicial:pos_inicial + read_size]

            f.write(str(''.join(read) + '\n'))

    #print(f"Reads generados y guardados en {output_path}")

import os
output_line_reads_dir='/home/jpereira/OEs/OE1/Data_output/reads_rw_line_2'
output_circle_dir='/home/jpereira/OEs/OE1/Data_output/reads_rw_circle_2'
os.makedirs(output_line_reads_dir, exist_ok=True)
os.makedirs(output_circle_dir, exist_ok=True)

for i in range(len(random_walks)):
  output_read_circle_path = output_circle_dir +  f"/read_{str(i)}"
  output_read_line_path = output_line_reads_dir +  f"/read_{str(i)}"
  generar_reads(random_walks[i], read_sizes_df.string_code, circulo=True, read_number=4000, output_path=output_read_circle_path)
  generar_reads(random_walks[i], read_sizes_df.string_code, circulo=False, read_number=4000, output_path=output_read_line_path)

# Codigo para pasar la lista de reads a grafos y matrices de adyacencias

In [174]:
def load_reads(read_path: str) -> pd.DataFrame:

    # Read the string from the file
    with open(read_path, 'r') as f:
        data = f.read().split('\n')  # .strip() removes any trailing newlines or spaces

    #print(data)  # Optional: Check the contents

    # Option 1: Each character as a row
    df1 = pd.DataFrame(list(data), columns=['string_code'])
    return(df1)

In [175]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd

################################## Convert list with blocks to ABC format ######################

def read_list_to_abc( df: pd.DataFrame, example = False ) -> pd.DataFrame:

    # Create the initial DataFrame
    if example:
        df = pd.DataFrame({'string_code': ['ABBAAA', 'ABSBACA', 'ABBACA', 'ABBACA']})

    # Convert the 'col' column into lists of letters
    df['letters'] = df['string_code'].apply(list)

    # Explode the 'letters' column to create a row for each letter
    df_exploded = df.explode('letters')

    # Reset the index to turn the index into a column
    df_exploded = df_exploded.reset_index()

    # Rename 'index' to 'original_index' for clarity
    df_exploded = df_exploded.rename(columns={'index': 'group'})

    # Select and reorder the necessary columns
    df_exploded = df_exploded[['group', 'letters']]

    df_results_new = pd.DataFrame()
    df_results_new['prev_element'] = df_exploded.groupby(['group'])['letters'].shift(1, fill_value='Start')
    df_results_new['next_element'] = df_exploded.groupby(['group'])['letters'].shift(-1, fill_value='End')

    df_results_new['count'] = 1
    df_results_new = df_results_new.groupby(['prev_element', 'next_element']).sum().reset_index()

    return(df_results_new)

################################### Make Directed graph from ABC input ############################

def abc_to_dir_graph( df: pd.DataFrame, graph_plot_output: str,adjacency_df_output: str, example = False, make_fig = False ):

    # Create directed graph with filtered data
    G_filtered = nx.DiGraph()

    #print(df.columns)
    # Add edges to the filtered graph
    for _, row in df.iterrows():
        nodeA = row['prev_element']
        nodeB = row['next_element']
        weight = row['count']
        # Add edge with weight to the G_filtered graph
        if weight > 1:
            G_filtered.add_edge(nodeA, nodeB, weight=weight)
    
    if make_fig:
        # Use spring layout for positioning
        pos = nx.spring_layout(G_filtered, k=0.5)  # Adjust k for spacing

        plt.figure(figsize=(12, 10))

        # Compute weighted degree for node size (sum of weights for incoming and outgoing edges)
        node_weighted_degree = {
            node: sum(weight['weight'] for _, _, weight in G_filtered.edges(node, data=True)) +
                  sum(weight['weight'] for _, _, weight in G_filtered.in_edges(node, data=True))
            for node in G_filtered.nodes()
        }

        # Set node size based on weighted degree, scale it appropriately
        node_size = [node_weighted_degree[node] * 0.5 for node in G_filtered]

        # Draw the nodes
        nx.draw_networkx_nodes(G_filtered, pos, node_size=node_size, node_color='skyblue', alpha=0.7)

        # Draw the edges
        edge_weights = [G_filtered[u][v]['weight'] for u, v in G_filtered.edges()]
        nx.draw_networkx_edges(G_filtered, pos, width=[weight * 0.05 for weight in edge_weights], alpha=0.6)

        # Draw labels
        nx.draw_networkx_labels(G_filtered, pos, font_size=10, font_color='black', font_weight='bold')

        plt.title("Filtered Directed Graph with Node Size Based on Weighted Degree")
        plt.savefig(graph_plot_output)

    # Get a sorted list of nodes to maintain consistent ordering
    nodes = sorted(G_filtered.nodes())

    #################################### Adjacency Matrix ####################################

    # Extract the adjacency matrix as a pandas DataFrame
    adjacency_df = nx.to_pandas_adjacency(G_filtered, nodelist=nodes, weight='weight')

    #print("\nAdjacency matrix as pandas DataFrame:")
    #print(adjacency_df)

    #adjacency_df_output = '/home/jpereira/OEs/OE1/Data_output/adjacency_matrix_new.tsv'
    adjacency_df.to_csv(adjacency_df_output, sep = '\t')

    return(adjacency_df)




In [192]:


def make_alot_adjacency(read_dir, output_adjacency_dir, output_graph_dir):
    q=0
    os.makedirs(output_adjacency_dir, exist_ok=True) 
    os.makedirs(output_graph_dir, exist_ok=True) 
    for read in os.listdir(read_dir):
        q+=1
        output_adjacency_tsv = os.path.join(output_adjacency_dir, f'adjacency_{str(q)}.tsv')
        output_graph_tsv = os.path.join(output_graph_dir, f'graph_{str(q)}.png')
        read_path = os.path.join(read_dir, read)

        reads_df = load_reads(read_path)
        reads_abc_df = read_list_to_abc(reads_df)

        #print(output_adjacency_tsv)
        adjacency_df = abc_to_dir_graph(reads_abc_df, output_graph_tsv, output_adjacency_tsv)
        
    
        #if q == 1:
        #    break
    return(adjacency_df)

read_dir = '/home/jpereira/OEs/OE1/Data_output/reads_rw_circle_2/'
output_adjacency_dir = '/home/jpereira/OEs/OE1/Data_output/adjacency_circle_3/'
output_graph_dir = '/home/jpereira/OEs/OE1/Data_output/graph_circle_3/'
make_alot_adjacency(read_dir, output_adjacency_dir, output_graph_dir)

read_dir = '/home/jpereira/OEs/OE1/Data_output/reads_rw_line_2/'
output_adjacency_dir = '/home/jpereira/OEs/OE1/Data_output/adjacency_line_3/'
output_graph_dir = '/home/jpereira/OEs/OE1/Data_output/graph_line_3/'
make_alot_adjacency(read_dir, output_adjacency_dir, output_graph_dir)
    

Unnamed: 0,A,B,C,D,E,End,F,H,I,J,...,N,O,P,Q,R,S,Start,T,U,V
A,0.0,0.0,0.0,0.0,0.0,132.0,0.0,0.0,0.0,69.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.0
B,0.0,117.0,0.0,269.0,0.0,255.0,0.0,0.0,0.0,0.0,...,0.0,154.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C,0.0,0.0,0.0,0.0,0.0,22.0,0.0,0.0,0.0,75.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D,0.0,71.0,0.0,142.0,0.0,176.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,206.0,0.0,67.0,0.0,0.0
E,0.0,63.0,0.0,0.0,71.0,55.0,0.0,0.0,0.0,0.0,...,0.0,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
End,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
F,0.0,0.0,0.0,0.0,0.0,81.0,0.0,0.0,0.0,172.0,...,68.0,0.0,0.0,0.0,0.0,62.0,0.0,0.0,0.0,0.0
H,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,73.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I,0.0,0.0,0.0,64.0,0.0,13.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
J,69.0,0.0,0.0,0.0,0.0,847.0,149.0,64.0,0.0,1526.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,255.0,400.0


In [193]:
import os
import pandas as pd

def cargar_matrices_y_construir_salida(carpeta: str, columnas_salida: list):
    # Crear una matriz vacía con las columnas definidas
    matriz_salida = pd.DataFrame(columns=columnas_salida)
    print(f'Matriz Salida New: {matriz_salida}')

    # Obtener la lista de archivos .tsv en la carpeta
    archivos = [f for f in os.listdir(carpeta) if f.endswith('.tsv')]

    # Recorrer cada archivo y extraer la columna 'End'
    for archivo in archivos:
        ruta_archivo = os.path.join(carpeta, archivo)

        # Cargar la matriz del archivo con la primera columna como índice
        matriz = pd.read_csv(ruta_archivo, sep='\t', index_col=0)

        # Crear un diccionario para almacenar los valores de la nueva fila
        nueva_fila = {col: 0 for col in columnas_salida}  # Inicializar con 0

        # Si la columna 'End' está presente, copiar los valores al diccionario
        for fila in matriz.index:
            if fila in columnas_salida:
                nueva_fila[fila] = matriz.at[fila, 'End']

        # Añadir la nueva fila a la matriz de salida
        matriz_salida = pd.concat([matriz_salida, pd.DataFrame([nueva_fila])], ignore_index=True)

    return matriz_salida

  # Guardar sin el índice

In [None]:
# Definir los nombres de las columnas de la matriz de salida
columnas_salida = [
    "A", "B",'C', "D", "E", "End", "F", "H", "I", "J", "K", 
    "L", "M", "N", "O", "P", "Q", "R", "S", "Start", "T", "U", "V"
]

#uso del codigo
carpeta1 = '/home/jpereira/OEs/OE1/Data_output/adjacency_circle_3/'  # Cambia por la ruta de tu carpeta
output_csv1 = '/home/jpereira/OEs/OE1/Data_output/circle_end_matrix_20k_c.tsv'  # Nombre del archivo de salida
circle_matrix = cargar_matrices_y_construir_salida(carpeta1, columnas_salida)
circle_matrix.to_csv(output_csv1, index=False)

print("Matriz de salida:")
print(circle_matrix)


carpeta2 = '/home/jpereira/OEs/OE1/Data_output/adjacency_line_3/'  # Cambia por la ruta de tu carpeta
output_csv2 = '/home/jpereira/OEs/OE1/Data_output/line_end_matrix_20k_c.tsv'
line_matrix = cargar_matrices_y_construir_salida(carpeta2, columnas_salida)
line_matrix.to_csv(output_csv2, index=False)

print(line_matrix)


Matriz Salida New: Empty DataFrame
Columns: [A, B, C, D, E, End, F, H, I, J, K, L, M, N, O, P, Q, R, S, Start, T, U, V]
Index: []

[0 rows x 23 columns]


  matriz_salida = pd.concat([matriz_salida, pd.DataFrame([nueva_fila])], ignore_index=True)


In [None]:
def tar_folder(folder_path, output_path):
    with tarfile.open(output_path, 'w:gz') as tar:
        tar.add(folder_path, arcname='.')

In [109]:
q=0
read_dir = '/home/jpereira/OEs/OE1/Data_output/reads_rw_line/'

output_adjacency_dir = '/home/jpereira/OEs/OE1/Data_output/adjacency_line/'
output_graph_dir = '/home/jpereira/OEs/OE1/Data_output/graph_line/'
os.makedirs(output_adjacency_dir, exist_ok=True) 
os.makedirs(output_graph_dir, exist_ok=True) 

adjacency_sizes = []
for read in os.listdir(read_dir):
    q+=1
    output_adjacency_tsv = os.path.join(output_adjacency_dir, f'adjacency_{str(q)}.tsv')
    output_graph_tsv = os.path.join(output_graph_dir, f'graph_{str(q)}.png')
    read_path = os.path.join(read_dir, read)
    
    reads_df = load_reads(read_path)
    reads_abc_df = read_list_to_abc(reads_df)
    adjacency_df = abc_to_dir_graph(reads_abc_df, output_graph_tsv, output_adjacency_tsv)
    adjacency_sizes.append(adjacency_df.shape[1])
    

In [None]:
import tarfile

def tar_folder(folder_path, output_path):
    with tarfile.open(output_path, 'w:gz') as tar:
        tar.add(folder_path, arcname='.')

# Specify the folder and the output TAR file
cir = ''
output_tar = 'my_folder.tar.gz'

# Compress the folder
tar_folder(output_circle_dir, output_tar)
tar_folder(output_circle_dir, output_tar)

print(f"Compressed {folder_to_compress} to {output_tar}")


In [112]:
adjacency_df

Unnamed: 0,A,B,C,D,E,End,F,H,I,J,...,N,O,P,Q,R,S,Start,T,U,V
A,13462.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6134.0,...,0.0,0.0,0.0,0.0,5397.0,0.0,0.0,0.0,0.0,0.0
B,0.0,2679.0,0.0,0.0,2918.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2800.0,0.0,0.0,0.0,0.0,0.0,0.0
C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5503.0
D,0.0,2632.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1930.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
E,0.0,0.0,0.0,0.0,1784.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3121.0,4268.0,2377.0,0.0,0.0,0.0,0.0,0.0,0.0
End,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
F,0.0,0.0,0.0,0.0,0.0,0.0,3884.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,6877.0,0.0,0.0,0.0,0.0
H,3015.0,0.0,0.0,0.0,0.0,0.0,0.0,3134.0,2523.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I,0.0,0.0,0.0,2575.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
J,2742.0,0.0,0.0,0.0,0.0,0.0,2888.0,5251.0,0.0,17069.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6877.0


In [None]:
## Adyacencias es una lista con las matrices de adyacencias de los grafos generados.
## Las matrices de adyacencia deben tener una columna que se llame 'end'
## La funcion da la media de los valores del vector de in-degree del nodo end (o sea, el promedio de las simulaciones de cuantas veces un nodo aparecio al final de un read)

def mean_distribucion_in_deg_ends(adyacencias):
    # Inicializamos el acumulador con ceros, del mismo tamaño que el primer vector 'End'
    end_freqs = np.zeros(len(adyacencias[0]["end"]))

    # Sumamos los vectores 'End' de todas las matrices
    for M in adyacencias:
        end_freqs += np.array(M["end"])

    # Calculamos la media dividiendo por el número de matrices
    mean_end_freqs = end_freqs / len(adyacencias)

    return mean_end_freqs

In [None]:
ady_test=[
 {'col1': [58, 98, 28, 9],
  'col2': [36, 45, 26, 9],
  'col3': [83, 19, 79, 49],
  'end':  [78, 71, 12, 16]},

 {'col1': [37, 17, 61, 79],
  'col2': [33, 89, 53, 49],
  'col3': [88, 39, 4, 22],
  'end':  [51, 97, 36, 20]},

 {'col1': [16, 95, 1, 16],
  'col2': [63, 85, 22, 73],
  'col3': [99, 6, 49, 65],
  'end':  [42, 85, 97, 16]},

 {'col1': [31, 28, 6, 18],
  'col2': [41, 75, 33, 56],
  'col3': [54, 80, 60, 58],
  'end':  [52, 32, 29, 20]},

 {'col1': [40, 40, 87, 61],
  'col2': [73, 10, 51, 33],
  'col3': [78, 6, 87, 9],
  'end':  [34, 67, 1, 94]}
]

print(mean_distribucion_in_deg_ends(ady_test))
print(len(mean_distribucion_in_deg_ends(ady_test)))

In [None]:
stats.kstest(ady_test[0]['end'],
             ady_test[1]['end'])

stats.kstest(ady_test[1]['end'],
             ady_test[2]['end'])

stats.kstest(ady_test[2]['end'],
             ady_test[3]['end'])