In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_parquet('../../data/processed/genomes.parquet', engine='pyarrow')  # You can use 'fastparquet' as the engine
data

Unnamed: 0,Accession ID,Lineage,Collection date,Sequence,Test
0,EPI_ISL_16823464,XBB.1.5,2023-01-31,TAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATC...,0
1,EPI_ISL_3342425,AY.116,2021-07-26,GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGC...,0
2,EPI_ISL_1715410,B.1.525,2021-01-12,AGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCT...,1
3,EPI_ISL_515786,B.1.1.57,2020-07-29,TTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGT...,0
4,EPI_ISL_17385094,BQ.1.1,2023-02-06,TACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCA...,1
...,...,...,...,...,...
47317,EPI_ISL_15963061,CP.5,2022-11-01,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47322,EPI_ISL_15963067,BE.7,2022-11-05,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47324,EPI_ISL_15963069,BE.7,2022-11-05,CTAAACGANCTTTAAAATCTGTGTGGCTGTCNCTCGGCTGCATNCT...,1
47344,EPI_ISL_18407436,BA.1,2022-07-26,TTGTAGATCTGTTCTCTAAACGAACNTGAAAATCTGTGTGGCTGTC...,1


In [None]:
len(data["Lineage"].value_counts())

222

In [3]:
# import matplotlib.pyplot as plt

# def chaos_game_representation(sequence, size=8):
#     """
#     Create and display the Chaos Game Representation of a given DNA sequence.

#     Args:
#     sequence (str): A string representing the DNA sequence (composed of A, C, G, T).
#     size (int, optional): Size of the CGR image. Defaults to 8.
#     """
#     # Mapping of nucleotides to points
#     nucleotide_points = {
#         'A': (0, 0),  # Lower-left corner
#         'C': (0, 1),  # Upper-left corner
#         'G': (1, 1),  # Upper-right corner
#         'T': (1, 0)   # Lower-right corner
#     }

#     # Initialize the starting point
#     x, y = 0.5, 0.5

#     # Prepare the plot
#     plt.figure(figsize=(size, size))
#     plt.title("Chaos Game Representation of Covid Sequence")
#     plt.xlim(0, 1)
#     plt.ylim(0, 1)

#     # Iterate through the sequence and plot points
#     for nucleotide in sequence:
#         if nucleotide in nucleotide_points:
#             # Move halfway towards the corner corresponding to the nucleotide
#             corner_x, corner_y = nucleotide_points[nucleotide]
#             x = (x + corner_x) / 2
#             y = (y + corner_y) / 2
#             plt.scatter(x, y, c='blue', marker='.')

#     # Show the plot
#     plt.show()

# # Example DNA sequence
# dna_sequence = data['Sequence'][0][:10000]
# chaos_game_representation(dna_sequence)

In [4]:
def fast_chaos_game_representation(sequence, size=128):
    """
    Create a fast Chaos Game Representation of a DNA sequence with a fixed size image.

    Args:
    sequence (str): A string representing the DNA sequence (composed of A, C, G, T).
    size (int, optional): Size of the CGR image in pixels. Defaults to 128.
    """
    # Mapping of nucleotides to points
    nucleotide_points = {
        'A': (0, 0),  # Lower-left corner
        'C': (0, 1),  # Upper-left corner
        'G': (1, 1),  # Upper-right corner
        'T': (1, 0)   # Lower-right corner
    }

    # Initialize the image
    image = np.zeros((size, size))

    # Initialize the starting point
    x, y = 0.5, 0.5

    # Scaling factor to map points to pixel coordinates
    scale = size - 1

    # Iterate through the sequence
    for nucleotide in sequence:
        if nucleotide in nucleotide_points:
            # Move halfway towards the corner corresponding to the nucleotide
            corner_x, corner_y = nucleotide_points[nucleotide]
            x = (x + corner_x) / 2
            y = (y + corner_y) / 2

            # Update the image
            ix, iy = int(x * scale), int(y * scale)
            image[iy, ix] += 1

    # Normalize the image
    # image /= image.max()

    # # Display the image
    # plt.figure(figsize=(8, 8))
    # plt.title("Fast Chaos Game Representation of DNA Sequence")
    # plt.imshow(image, cmap='viridis')
    # plt.axis('off')
    # plt.show()

    return image.reshape(-1,1)

cgr_array = [fast_chaos_game_representation(sequence) for sequence in data['Sequence']]

In [5]:
cgr_array[0].shape

(16384, 1)

In [6]:
# Flatten each array and convert to a list
flattened_arrays = [array.flatten() for array in cgr_array]

In [7]:
chaos_data = pd.DataFrame(flattened_arrays)

In [8]:
chaos_data["Target"] = data["Lineage"].tolist()

In [9]:
chaos_data["Test"] = data["Test"].tolist()

In [10]:
chaos_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16376,16377,16378,16379,16380,16381,16382,16383,Target,Test
0,28.0,1.0,6.0,12.0,6.0,4.0,8.0,10.0,7.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,XBB.1.5,0
1,0.0,1.0,6.0,12.0,6.0,3.0,8.0,10.0,7.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,AY.116,0
2,27.0,1.0,6.0,12.0,6.0,3.0,8.0,10.0,7.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B.1.525,1
3,0.0,1.0,6.0,12.0,6.0,3.0,8.0,10.0,7.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B.1.1.57,0
4,0.0,1.0,6.0,12.0,6.0,4.0,8.0,9.0,7.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BQ.1.1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22706,0.0,1.0,6.0,12.0,6.0,4.0,8.0,10.0,7.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CP.5,1
22707,0.0,1.0,6.0,12.0,6.0,4.0,8.0,10.0,7.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BE.7,1
22708,0.0,1.0,6.0,12.0,6.0,4.0,8.0,10.0,7.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BE.7,1
22709,0.0,1.0,7.0,11.0,6.0,4.0,8.0,8.0,6.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BA.1,1


In [11]:
chaos_data.to_parquet('../../data/features/chaos_standard_128.parquet', engine='pyarrow')

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
