<a href="https://colab.research.google.com/github/MichaelGelo/GRP2_CEPARCO_IP/blob/main/CEPARCO_IP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
%%writefile C_utils.h
#ifndef C_UTILS_H
#define C_UTILS_H

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Read the whole file `filename` into a newly malloc'd, NUL-terminated buffer.
 *
 * Returns the buffer on success; the caller owns it and must free() it.
 * Returns NULL, after printing a diagnostic to stderr, if the file cannot be
 * opened, memory cannot be allocated, or the file cannot be read completely.
 */
char* read_file_into_string(const char* filename);

/*
 * Parse the FASTA file `filename` into an array of sequence strings.
 *
 * Lines starting with '>' separate records; the remaining lines of a record
 * are concatenated with their trailing CR/LF removed.  Records without any
 * sequence data are skipped.
 *
 * On success the number of sequences is stored in *num_sequences and a
 * malloc'd array of malloc'd, NUL-terminated strings is returned; the caller
 * frees every string and then the array.
 * Returns NULL if the file cannot be opened or holds no sequence data.
 * Initialise *num_sequences to 0 before calling: it is not guaranteed to be
 * written when the call fails.
 */
char** parse_fasta_file(const char *filename, int *num_sequences);

#ifdef __cplusplus
}
#endif

#endif /* C_UTILS_H */

Overwriting C_utils.h


In [31]:
%%writefile C_utils.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "C_utils.h"

/* Maximum number of bases kept per sequence (16 Mi); parse_fasta_file drops
 * any data beyond this limit for the record being read. */
#define MAX_LENGTH (1 << 24)
/* Size in bytes of the line buffer parse_fasta_file hands to fgets (16 KiB). */
#define MAX_LINE_LENGTH (1 << 14)

/*
 * Read the whole file `filename` into a newly allocated, NUL-terminated buffer.
 *
 * The file is opened in binary mode, so the bytes are returned exactly as
 * stored (no newline translation).
 *
 * Returns the buffer on success; the caller must free() it.
 * Returns NULL after printing a diagnostic to stderr when the file cannot be
 * opened, its size cannot be determined, memory cannot be allocated, or fewer
 * bytes than expected could be read.
 */
char* read_file_into_string(const char* filename) {
    FILE* file = fopen(filename, "rb");
    if (!file) {
        perror("Failed to open file");
        return NULL;
    }

    /* fseek/ftell fail on non-seekable streams; ftell then returns -1, which
     * must never reach malloc/fread as a size. */
    long file_size = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        file_size = ftell(file);
    }
    if (file_size < 0 || fseek(file, 0, SEEK_SET) != 0) {
        perror("Failed to determine file size");
        fclose(file);
        return NULL;
    }

    size_t size = (size_t)file_size;
    /* The cast keeps this file compilable as C++ (the header is extern "C"-aware). */
    char* buffer = (char*)malloc(size + 1);
    if (!buffer) {
        perror("Failed to allocate memory");
        fclose(file);
        return NULL;
    }

    size_t bytes_read = fread(buffer, 1, size, file);
    if (bytes_read != size) {
        /* errno is only meaningful when the stream reports an error; a plain
         * short read would otherwise print a stale or "Success" message. */
        if (ferror(file)) {
            perror("Failed to read the file completely");
        } else {
            fprintf(stderr, "Failed to read the file completely\n");
        }
        free(buffer);
        fclose(file);
        return NULL;
    }

    buffer[size] = '\0';
    fclose(file);
    return buffer;
}

/* Free the first `count` strings stored in `sequences`, then the array itself. */
static void discard_sequences(char **sequences, int count) {
    for (int i = 0; i < count; i++) {
        free(sequences[i]);
    }
    free(sequences);
}

/*
 * Append `seq` to the growing array `*sequences`, taking ownership of it.
 * Returns 0 on success, or -1 if the array could not be grown; in that case
 * `*sequences` is left untouched and `seq` still belongs to the caller.
 */
static int append_sequence(char ***sequences, int *seq_count, char *seq) {
    char **grown = (char**)realloc(*sequences, ((size_t)*seq_count + 1) * sizeof(char*));
    if (!grown) {
        return -1;
    }
    grown[*seq_count] = seq;
    *sequences = grown;
    (*seq_count)++;
    return 0;
}

/*
 * Parse the FASTA file `filename` into an array of sequence strings.
 *
 * A line starting with '>' closes the record in progress; all other lines are
 * appended to the current record with trailing CR/LF removed.  A record stops
 * growing once it holds MAX_LENGTH bases, and records without sequence data
 * are skipped.
 *
 * num_sequences: receives the number of sequences returned (0 on failure);
 *                may be NULL if the count is not needed.
 *
 * Returns a malloc'd array of malloc'd NUL-terminated strings that the caller
 * must free (each string, then the array).  Returns NULL if the file cannot
 * be opened, contains no sequence data, or memory runs out.
 */
char** parse_fasta_file(const char *filename, int *num_sequences) {
    if (num_sequences) {
        *num_sequences = 0;
    }

    FILE *file = fopen(filename, "r");
    if (!file) {
        perror("Failed to open FASTA file");
        return NULL;
    }

    char **sequences = NULL;
    int seq_count = 0;
    char *current_seq = NULL;
    size_t current_seq_len = 0;
    char line[MAX_LINE_LENGTH];
    int at_line_start = 1;  /* does this fgets chunk begin a new physical line? */
    int in_header = 0;      /* is the physical line being read a '>' header? */
    int failed = 0;

    while (!failed && fgets(line, sizeof(line), file)) {
        size_t line_len = strlen(line);
        int ends_line = (line_len > 0 && line[line_len - 1] == '\n');

        if (at_line_start) {
            in_header = (line[0] == '>');
            if (in_header && current_seq != NULL) {
                /* current_seq is already sized exactly, so hand it over
                 * instead of copying it into a second buffer. */
                if (append_sequence(&sequences, &seq_count, current_seq) != 0) {
                    failed = 1;
                } else {
                    current_seq = NULL;
                    current_seq_len = 0;
                }
            }
        }
        /* When a line is longer than the buffer, fgets returns it in pieces;
         * the later pieces of a header must not be mistaken for bases. */

        if (!in_header && !failed) {
            while (line_len > 0 && (line[line_len - 1] == '\n' || line[line_len - 1] == '\r')) {
                line_len--;
            }
            if (current_seq_len + line_len > (size_t)MAX_LENGTH) {
                line_len = (size_t)MAX_LENGTH - current_seq_len;
            }
            if (line_len > 0) {
                char *grown = (char*)realloc(current_seq, current_seq_len + line_len + 1);
                if (!grown) {
                    failed = 1;
                } else {
                    current_seq = grown;
                    memcpy(current_seq + current_seq_len, line, line_len);
                    current_seq_len += line_len;
                    current_seq[current_seq_len] = '\0';
                }
            }
        }

        at_line_start = ends_line;
    }

    /* Flush the record that was still open at end of file. */
    if (!failed && current_seq != NULL) {
        if (append_sequence(&sequences, &seq_count, current_seq) != 0) {
            failed = 1;
        } else {
            current_seq = NULL;
        }
    }

    fclose(file);

    if (failed) {
        fprintf(stderr, "Failed to allocate memory while parsing FASTA file\n");
        free(current_seq);
        discard_sequences(sequences, seq_count);
        return NULL;
    }

    if (num_sequences) {
        *num_sequences = seq_count;
    }
    return sequences;
}


Overwriting C_utils.c


## C - Split reference code

In [32]:
%%writefile C_partition.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <time.h>
#include "C_utils.h"

/* FASTA file whose sequences main() splits into overlapping chunks. */
#define reference_file "sequence.fasta"
/* File whose entire contents main() uses as the query; only its length is
 * used, to size the overlap between consecutive chunks. */
#define query_file "Query_test.txt"

/*
 * Write `sequence` to `file` as FASTA body lines of at most `line_width`
 * characters, each terminated by '\n'.
 *
 * An empty sequence writes nothing.  A non-positive `line_width` disables
 * wrapping and writes the sequence on a single line; stepping by such a
 * width would otherwise never reach the end of the sequence.
 */
void write_fasta(FILE *file, const char *sequence, int line_width){
    size_t length = strlen(sequence);
    if (length == 0) {
        return;
    }
    if (line_width <= 0) {
        fprintf(file, "%s\n", sequence);
        return;
    }

    size_t step = (size_t)line_width;
    for (size_t offset = 0; offset < length; offset += step) {
        /* The precision caps the output at line_width characters; the final
         * line simply stops at the terminating NUL. */
        fprintf(file, "%.*s\n", line_width, sequence + offset);
    }
}

/*
 * Split every reference sequence into chunks of `p` bases, each extended by
 * (query length - 1) bases of the following data so that a match straddling a
 * chunk boundary is still fully contained in one chunk.  Every chunk is
 * echoed to stdout and written to partitioned_output.fasta under a header of
 * the form ">chunkNumber_startOffset".
 *
 * Returns 0 on success and 1 if an input could not be loaded, memory could
 * not be allocated, or the output file could not be written.
 */
int main(void) {
    const int p = 10700;  /* bases per chunk before the overlap is added */
    int status = 1;
    int num_references = 0;
    int q_length = 0;
    int overlap = 0;
    char **reference_seqs = NULL;
    char *query = NULL;
    char *chunk = NULL;
    FILE *output_file = NULL;

    reference_seqs = parse_fasta_file(reference_file, &num_references);
    if (reference_seqs == NULL) {
        fprintf(stderr, "No reference sequences could be read from %s\n", reference_file);
        goto cleanup;
    }

    query = read_file_into_string(query_file);
    if (query == NULL) {
        /* read_file_into_string has already printed the reason. */
        goto cleanup;
    }
    q_length = (int)strlen(query);
    /* An empty query must not produce a negative overlap, which would drop
     * the last base of every chunk. */
    overlap = q_length > 0 ? q_length - 1 : 0;

    /* One heap buffer sized for the largest possible chunk replaces a large
     * variable-length array re-created on the stack for every chunk. */
    chunk = malloc((size_t)p + (size_t)overlap + 1);
    if (chunk == NULL) {
        perror("Failed to allocate chunk buffer");
        goto cleanup;
    }

    output_file = fopen("partitioned_output.fasta", "w");
    if (output_file == NULL) {
        perror("Unable to open output file");
        goto cleanup;
    }

    for (int i = 0; i < num_references; i++) {
        const char *sequence = reference_seqs[i];
        int reference_length = (int)strlen(sequence);

        /* NOTE(review): the counter restarts for every reference, so headers
         * from different references can collide - confirm this is intended. */
        int temp = 0;

        for (int previous_k = 0; previous_k < reference_length; previous_k += p) {
            int remaining = reference_length - previous_k;
            int extended_chunk_size = p + overlap;
            if (extended_chunk_size > remaining) {
                extended_chunk_size = remaining;
            }

            memcpy(chunk, sequence + previous_k, (size_t)extended_chunk_size);
            chunk[extended_chunk_size] = '\0';

            // print to terminal (0-based chunk number)
            printf("Chunk from sequence %d: %s\n", temp, chunk);
            temp++;
            // write to output FASTA (1-based chunk number, start offset)
            fprintf(output_file, ">%d_%d\n", temp, previous_k);
            write_fasta(output_file, chunk, 60);
        }
    }

    /* fclose flushes buffered data, so a failure here means lost output. */
    if (fclose(output_file) != 0) {
        perror("Failed to write output file");
        output_file = NULL;
        goto cleanup;
    }
    output_file = NULL;

    printf("Partitioned FASTA file written.\n");
    status = 0;

cleanup:
    if (output_file != NULL) {
        fclose(output_file);
    }
    free(chunk);
    free(query);
    if (reference_seqs != NULL) {
        for (int i = 0; i < num_references; i++) {
            free(reference_seqs[i]);
        }
        free(reference_seqs);
    }
    return status;
}


Overwriting C_partition.c


In [28]:
!gcc C_partition.c C_utils.c -o C_partition.exe
!C_partition.exe


Chunk from sequence 0: AAGTCCTGTTGAAGCTTACTGATGGAGTCAGAGGGGGAAACATTGTACAGCCCAGCGGTTCTCACAGACTCTCCTGCAAAGCCTCTGATTTCACTTGTACTGGCTACAGCATGAGCTTGGTCCAGCAGGCTTCATGACAGGGATTTGTGTGGGTGGCAACAGTGAGTTATCAGGGTTACTCTCCATGAGTACAAGTAAATTAACAGTCCCAAGCAACACCCTTTCAAGTGCAGTCTACCTTAAAATGACCAATGTGAAAGCCAAGGACAAGACCTTGTATTACTGTGAGTGACATAGGAGCAGGAACATCTGCGTGAGCCCAGACACAAAATCCTCTGCAGGGAGACAGGAGGGAATCGCATGGTAGATGCTGATTGGAACTACCATGGGTCGCTCAGAACTACCAGGAGGTACTCAGAACCACTAGGGGGCGCTCAGGACACCAGGGGACGCTCAGGACAACCAGGGGGTGTTCAGGACACCAGGGGGTGCTCCGAGCCACCAGGGGGCACTCAGGACACCAGGGGACGCTCAGAACCATTACTTCCTTGTAAATCCATGATTTCTTTACAAATGTTACTTCCAAAACATTAACTTAGAACCGGGAATTTTTTTTTTTTTGACAAAGTCTCCCTCTTGTCGCACAGGCTGGAGTGCAGTGGCATGATCTTGGCTCATGGCAACCTCCAACTCCCGGGTTCAAGCAATTCTCCTGCCTCATACTTCTGAGTAGCTGCCACCACGTGCCTGCCACCATGCCTGGCTAATGTTTGTACTTTTAGTAGAGACGGGGTTTCACCATGTTGACCAGGCTAGTCTCGAATTCATGACCTCAGGTGATCTGCCCCGCTAGGCTTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCATGCCCAGCCTATTTGAAGTTTTAATGCTGCGTGTTTTCTGAGTAACGCTAGCAATGATCTGTCAGAACAATTTTTAAAATAGGTTA

In [34]:
%%writefile C_loading.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

// ----------- Configuration -----------
// Fractions of the total base count that assign_batches uses as the load
// target for each device (target = ratio * total, truncated to an int).
#define FPGA_RATIO 0.3  
#define GPU_RATIO  0.7  
// -------------------------------------

// One FASTA record as summarised by parse_fasta: only its identifier and
// length are kept, not the bases themselves.
typedef struct {
    // Number of '>' header lines read up to and including this record's own
    // header (0 if the data appears before any header).
    int ReferenceID;
    int Size; // sequence length (bases): non-whitespace characters in the record
} Reference;

// -------- Windows-friendly getline replacement ----------
// Read one line of any length from `stream` into the heap buffer *lineptr,
// growing it as needed (a stand-in for POSIX getline).
//
//   lineptr  in/out: NULL, or a buffer obtained from malloc/realloc; on
//            return it points to the (possibly moved) buffer, which the
//            caller must eventually free().
//   n        in/out: capacity of *lineptr in bytes.
//
// Returns the number of characters stored, including the trailing '\n' when
// the line has one, or -1 at end of file, on a read error, on invalid
// arguments, or when memory cannot be allocated.  After an allocation
// failure *lineptr and *n still describe a valid buffer.
ssize_t my_getline(char **lineptr, size_t *n, FILE *stream) {
    if (lineptr == NULL || n == NULL || stream == NULL) return -1;

    if (*lineptr == NULL || *n == 0) {
        // realloc(NULL, ...) behaves like malloc, and for a non-NULL buffer
        // with *n == 0 it reuses the block instead of leaking it.
        char *fresh = realloc(*lineptr, 128);
        if (fresh == NULL) return -1;
        *lineptr = fresh;
        *n = 128;
    }

    if (fgets(*lineptr, (int)*n, stream) == NULL) return -1;

    size_t len = strlen(*lineptr);
    // No newline yet and not at end of file: the buffer was too small, so
    // double it and keep reading into the free tail.
    while (len > 0 && (*lineptr)[len - 1] != '\n' && !feof(stream)) {
        size_t bigger_size = *n * 2;
        char *bigger = realloc(*lineptr, bigger_size);
        if (bigger == NULL) return -1;  // old buffer and *n are still valid
        *lineptr = bigger;
        *n = bigger_size;
        if (fgets(*lineptr + len, (int)(*n - len), stream) == NULL) break;
        len += strlen(*lineptr + len);
    }
    return (ssize_t)len;
}
#define getline my_getline
// --------------------------------------------------------

// Append one (id, size) record to *refs, doubling the capacity when full.
// Returns 0 on success, or -1 if the array could not be grown; *refs,
// *count and *capacity are then left unchanged.
static int push_reference(Reference **refs, int *count, int *capacity, int id, int size) {
    if (*count >= *capacity) {
        int new_capacity = *capacity * 2;
        Reference *grown = realloc(*refs, (size_t)new_capacity * sizeof(Reference));
        if (grown == NULL) return -1;
        *refs = grown;
        *capacity = new_capacity;
    }
    (*refs)[*count].ReferenceID = id;
    (*refs)[*count].Size = size;
    (*count)++;
    return 0;
}

// Scan the FASTA file `filename` and return one Reference per record that
// contains sequence data.
//
// A record's Size is its number of non-whitespace characters, and its
// ReferenceID is the number of '>' header lines read up to and including its
// own header.
//
//   num_refs  receives the number of records returned (0 on failure).
//
// Returns a malloc'd array that the caller must free(); it is non-NULL even
// when the file holds no records.  Returns NULL if the file cannot be opened
// or memory runs out.
Reference* parse_fasta(const char *filename, int *num_refs) {
    *num_refs = 0;

    FILE *fp = fopen(filename, "r");
    if (!fp) {
        perror("Error opening FASTA");
        return NULL;
    }

    int capacity = 100;
    int count = 0;
    Reference *refs = malloc((size_t)capacity * sizeof(Reference));
    if (!refs) {
        perror("Error allocating reference table");
        fclose(fp);
        return NULL;
    }

    char *line = NULL;
    size_t len = 0;
    ssize_t read;
    int current_size = 0;
    int current_id = 0;
    int failed = 0;

    while (!failed && (read = getline(&line, &len, fp)) != -1) {
        if (line[0] == '>') {
            // A new header closes the record in progress, if it had any data.
            if (current_size > 0 &&
                push_reference(&refs, &count, &capacity, current_id, current_size) != 0) {
                failed = 1;
            }
            current_id++;
            current_size = 0;
        } else {
            for (ssize_t i = 0; i < read; i++) {
                if (!isspace((unsigned char)line[i])) current_size++;
            }
        }
    }

    // Flush the record that was still open at end of file.
    if (!failed && current_size > 0 &&
        push_reference(&refs, &count, &capacity, current_id, current_size) != 0) {
        failed = 1;
    }

    free(line);
    fclose(fp);

    if (failed) {
        fprintf(stderr, "Out of memory while parsing FASTA\n");
        free(refs);
        return NULL;
    }

    *num_refs = count;
    return refs;
}

// -------- Assign according to fixed ratios ----------
// Distribute the batches between the FPGA and the GPU, printing each decision
// followed by a summary.
//
// Each device gets a target of its ratio times the total number of bases
// (truncated to an int).  Batches are taken in order and given to the device
// that is currently further below its target; a tie goes to the FPGA.
void assign_batches(Reference *refs, int n_refs) {
    int sum_bases = 0;
    int idx = 0;
    while (idx < n_refs) {
        sum_bases += refs[idx].Size;
        idx++;
    }

    const int quota_fpga = (int)(FPGA_RATIO * sum_bases);
    const int quota_gpu  = (int)(GPU_RATIO  * sum_bases);

    int used_fpga = 0;
    int used_gpu = 0;

    for (idx = 0; idx < n_refs; idx++) {
        const Reference *batch = &refs[idx];
        const int room_fpga = quota_fpga - used_fpga;
        const int room_gpu  = quota_gpu  - used_gpu;

        // The GPU wins only when it has strictly more room left.
        if (room_gpu > room_fpga) {
            printf("Batch %d (size %d bases) -> GPU\n", batch->ReferenceID, batch->Size);
            used_gpu += batch->Size;
        } else {
            printf("Batch %d (size %d bases) -> FPGA\n", batch->ReferenceID, batch->Size);
            used_fpga += batch->Size;
        }
    }

    printf("\nSummary:\n");
    printf("  Total bases: %d\n", sum_bases);
    printf("  FPGA target: %d bases, assigned: %d bases\n", quota_fpga, used_fpga);
    printf("  GPU target : %d bases, assigned: %d bases\n", quota_gpu, used_gpu);
}

// Load the batch sizes from partitioned_output.fasta and print how the
// batches are split between the FPGA and the GPU.
// Returns 1 if the batch list could not be loaded, 0 otherwise.
int main() {
    const char *input_path = "partitioned_output.fasta";
    int batch_count = 0;
    Reference *batches = parse_fasta(input_path, &batch_count);

    if (batches != NULL) {
        printf("Read %d batches from %s\n\n", batch_count, input_path);
        assign_batches(batches, batch_count);
        free(batches);
        return 0;
    }

    return 1;
}


Overwriting C_loading.c


In [35]:
!gcc C_loading.c C_utils.c -o C_loading.exe
!C_loading.exe


Read 8 batches from partitioned_output.fasta

Batch 1 (size 166704 bases) -> GPU
Batch 2 (size 166704 bases) -> GPU
Batch 3 (size 166704 bases) -> GPU
Batch 4 (size 166704 bases) -> GPU
Batch 5 (size 166704 bases) -> FPGA
Batch 6 (size 166704 bases) -> GPU
Batch 7 (size 166704 bases) -> FPGA
Batch 8 (size 126508 bases) -> GPU

Summary:
  Total bases: 1293436
  FPGA target: 388030 bases, assigned: 333408 bases
  GPU target : 905405 bases, assigned: 960028 bases
