In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the processed sequence table with the pyarrow engine
# ('fastparquet' is an alternative engine for the same call).
data = pd.read_parquet(
    path='../../data/processed/cov-19.parquet',
    engine='pyarrow',
)
# Leave the frame as the last expression so the notebook renders it.
data

Unnamed: 0,Accession ID,Lineage,Collection date,Sequence,Test
0,EPI_ISL_16823464,XBB.1.5,2023-01-31,TAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATC...,0
1,EPI_ISL_3342425,AY.116,2021-07-26,GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGC...,0
2,EPI_ISL_1715410,B.1.525,2021-01-12,AGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCT...,1
3,EPI_ISL_515786,B.1.1.57,2020-07-29,TTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGT...,0
4,EPI_ISL_17385094,BQ.1.1,2023-02-06,TACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCA...,1
...,...,...,...,...,...
47317,EPI_ISL_15963061,CP.5,2022-11-01,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47322,EPI_ISL_15963067,BE.7,2022-11-05,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47324,EPI_ISL_15963069,BE.7,2022-11-05,CTAAACGANCTTTAAAATCTGTGTGGCTGTCNCTCGGCTGCATNCT...,1
47344,EPI_ISL_18407436,BA.1,2022-07-26,TTGTAGATCTGTTCTCTAAACGAACNTGAAAATCTGTGTGGCTGTC...,1


In [4]:
def fast_chaos_game_representation(sequence, size=128):
    """
    Build a Chaos Game Representation (CGR) count image for a DNA sequence.

    Starting from the centre of the unit square, every recognised nucleotide
    moves the current point halfway towards that nucleotide's corner, and the
    pixel the point lands in is incremented by one.

    Args:
    sequence (str): Nucleotide sequence. A, C, G and T are matched
        case-insensitively; every other symbol (for example N) is skipped
        and does not move the current point.
    size (int, optional): Side length of the square CGR image in pixels. Defaults to 128.

    Returns:
    numpy.ndarray: Float array of shape (size * size, 1) holding the per-pixel
        visit counts of the size x size image, flattened row by row.
    """
    # Mapping of nucleotides to corners of the unit square, as (x, y)
    nucleotide_points = {
        'A': (0, 0),  # x = 0, y = 0
        'C': (0, 1),  # x = 0, y = 1
        'G': (1, 1),  # x = 1, y = 1
        'T': (1, 0)   # x = 1, y = 0
    }
    # Accept lower-case bases as well (e.g. soft-masked sequences); without
    # this they would be skipped like unknown symbols and the image would
    # silently stay empty.
    nucleotide_points.update(
        {base.lower(): corner for base, corner in list(nucleotide_points.items())}
    )

    # Count image, indexed as image[row (y), column (x)]
    image = np.zeros((size, size))

    # The walk starts in the centre of the unit square
    x, y = 0.5, 0.5

    # Scaling factor to map unit-square coordinates to pixel indices; using
    # size - 1 keeps a coordinate of exactly 1.0 inside the image.
    # NOTE(review): this means pixels are not aligned with the usual
    # size-by-size CGR grid; left unchanged so existing features stay
    # comparable - confirm whether that is intended.
    scale = size - 1

    for nucleotide in sequence:
        corner = nucleotide_points.get(nucleotide)
        if corner is None:
            # Ambiguous or unknown symbol: ignore it, keep the current point
            continue

        # Move halfway towards the corner corresponding to the nucleotide
        corner_x, corner_y = corner
        x = (x + corner_x) / 2
        y = (y + corner_y) / 2

        # Register the visit in the pixel containing the new point
        ix, iy = int(x * scale), int(y * scale)
        image[iy, ix] += 1

    # Return the image as a single column vector
    return image.reshape(-1, 1)

# One CGR array per entry of the 'Sequence' column, in column order
cgr_array = list(map(fast_chaos_game_representation, data['Sequence']))

In [6]:
# Make a 1-D copy of every CGR array and collect the copies in a list
flattened_arrays = list(map(np.ndarray.flatten, cgr_array))

In [7]:
# One row per sequence, one column per CGR pixel
chaos_data = pd.DataFrame(flattened_arrays)
# The constructor labels the pixel columns with integers. String-named columns
# are added to this frame later, and mixed int/str labels do not round-trip
# through Parquet, so give the pixel columns string labels up front
# ('0', '1', ...).
chaos_data.columns = chaos_data.columns.astype(str)

In [8]:
# Attach the lineage label as a plain list: a list carries no index, so the
# values are assigned by position rather than aligned on index labels.
chaos_data["Target"] = list(data["Lineage"])

In [9]:
# Copy the 'Test' column over by position (plain list, no index alignment)
chaos_data["Test"] = list(data["Test"])

In [11]:
# Persist the feature table (pixel counts plus the Target/Test columns)
chaos_data.to_parquet(
    path='../../data/features/chaos_standard_128.parquet',
    engine='pyarrow',
)

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
