In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pandas as pd
import datetime
import polars as pl
import numpy as np

In [None]:
# Competition data folder, given as a relative path (resolved against the
# kernel's current working directory).
FOLDER_DATA_COMPETITION = "../../data/stanford/stanford-rna-3d-folding/"
# CSV files inside that folder: the training sequences and the training labels.
FILEPATH_TRAIN_SEQUENCES = os.path.join(FOLDER_DATA_COMPETITION, "train_sequences.csv")
FILEPATH_TRAIN_LABELS = os.path.join(FOLDER_DATA_COMPETITION, "train_labels.csv")

In [None]:
# Read the training sequences CSV into a polars DataFrame.
train_sequences = pl.read_csv(FILEPATH_TRAIN_SEQUENCES)
# Add an "index" column holding 0..n-1 at column position 0. The return value
# is discarded, so this relies on insert_column modifying the frame in place.
# NOTE(review): confirm that holds for the installed polars version.
train_sequences.insert_column(
    0, pl.Series(name="index", values=np.arange(len(train_sequences)))
)
# Read the training labels CSV into a second polars DataFrame.
train_labels = pl.read_csv(FILEPATH_TRAIN_LABELS)
# Last expression of the cell: display both column-name lists side by side.
train_sequences.columns, train_labels.columns

In [None]:
class Labels:
    """Minimal holder for a labels table.

    NOTE(review): nothing visible in this notebook instantiates this class;
    ``RNADataset.from_train_sequences`` passes a filtered polars frame to
    ``RNA`` directly, while the annotation here says pandas. Confirm which
    representation is intended.
    """

    def __init__(self, labels: pd.DataFrame) -> None:
        """Store ``labels`` as given; no copy or validation is performed."""
        self.labels = labels


class RNA:
    """Plain record describing a single RNA entry.

    Every constructor argument is stored on the instance under the same name,
    unchanged: nothing is copied, converted, or validated.
    """

    def __init__(
        self,
        id: str,
        sequence: str,
        temporal_cutoff: datetime.datetime,
        description: str,
        all_sequences: str,
        labels: Labels,
    ):
        """Keep the given fields as attributes.

        Args:
            id: Identifier of the entry (shown first in ``repr``).
            sequence: Sequence string (shown second in ``repr``).
            temporal_cutoff: Stored as given.
            description: Stored as given.
            all_sequences: Stored as given.
            labels: Labels object associated with this entry, stored as given.
        """
        # Paired assignments, kept in the same left-to-right order as the
        # parameter list so the attribute order on the instance is unchanged.
        self.id, self.sequence = id, sequence
        self.temporal_cutoff, self.description = temporal_cutoff, description
        self.all_sequences, self.labels = all_sequences, labels

    def __repr__(self):
        """Return ``"<id> - <sequence>"``."""
        return "{} - {}".format(self.id, self.sequence)


class RNADataset:
    """Ordered collection of ``RNA`` objects plus the frames they came from.

    Supports ``len()``, iteration, and indexing. Indexing is forwarded to the
    underlying list, so an integer yields one ``RNA`` and a slice yields a
    plain ``list`` of ``RNA`` objects.
    """

    def __init__(
        self, rnas: list[RNA], train_sequences: pl.DataFrame, train_labels: pl.DataFrame
    ):
        """Store the RNA list and the two source frames as given (no copies)."""
        self.rnas = rnas
        self.train_sequences = train_sequences
        self.train_labels = train_labels

    def __repr__(self):
        """Return a short summary of the form ``"<count> rnas"``."""
        return f"{len(self.rnas)} rnas"

    @classmethod
    def from_train_sequences(
        cls, train_sequences: pl.DataFrame, train_labels: pl.DataFrame
    ):
        """Build a dataset with one ``RNA`` per row of ``train_sequences``.

        Args:
            train_sequences: Frame with the columns ``target_id``, ``sequence``,
                ``temporal_cutoff``, ``description`` and ``all_sequences``.
            train_labels: Frame with an ``ID`` column. Assumes each ID has the
                form ``"<target_id>_<residue number>"`` (the competition's
                documented layout) - verify against the data if it changes.

        Returns:
            A new instance whose ``rnas`` follow the row order of
            ``train_sequences``; each RNA's ``labels`` is the sub-frame of
            ``train_labels`` belonging to exactly that target.
        """
        # Derive the owning target id of every label row once, by dropping the
        # last "_<suffix>" segment of its ID. Comparing this for equality keeps
        # the match exact: a substring/regex search for the target id would
        # also pick up rows of any other target whose id merely contains it
        # (e.g. "X_A" inside "X_AB_1") and would interpret regex metacharacters.
        label_target_ids = train_labels.get_column("ID").str.replace(r"_[^_]*$", "")
        rnas = [
            RNA(
                id=row["target_id"],
                sequence=row["sequence"],
                temporal_cutoff=row["temporal_cutoff"],
                description=row["description"],
                all_sequences=row["all_sequences"],
                labels=train_labels.filter(label_target_ids == row["target_id"]),
            )
            for row in train_sequences.rows(named=True)
        ]
        return cls(
            rnas=rnas, train_sequences=train_sequences, train_labels=train_labels
        )

    def __slice__(self, index: int):
        """Explicit-call alias of ``__getitem__``.

        Python has no ``__slice__`` protocol hook: ``dataset[a:b]`` is routed
        through ``__getitem__``. This method is kept only so that any existing
        direct calls keep working.
        """
        return self[index]

    def __getitem__(self, index: "int | slice"):
        """Return ``self.rnas[index]`` (one ``RNA`` for an int, a list for a slice)."""
        return self.rnas[index]

    def __iter__(self):
        """Iterate over the stored ``RNA`` objects in order."""
        return iter(self.rnas)

    def __len__(self):
        """Return the number of stored ``RNA`` objects."""
        return len(self.rnas)

In [None]:
# Build the dataset from the two frames loaded above; `rnas` is an RNADataset,
# with one RNA per row of train_sequences.
rnas = RNADataset.from_train_sequences(train_sequences, train_labels)

In [None]:
# Display the training rows whose sequence contains a "-" character.
train_sequences.filter(pl.col("sequence").str.contains("-"))

In [None]:
def print_sequence_and_labels(rna: RNA):
    """Print a plain-text summary of one RNA to stdout.

    The items appear in this order, each starting on its own line: the
    sequence, the sequence length, the labels object, the description, and
    the id.
    """
    report = (
        rna.sequence,
        f"len sequence: {len(rna.sequence)}",
        rna.labels,
        rna.description,
        rna.id,
    )
    # One print call per item, so each is rendered via str() and followed by
    # a newline.
    for item in report:
        print(item)


# Spot-check: print the summary of the dataset entry at position 11.
print_sequence_and_labels(rnas[11])

In [None]:
# Spot-check: print the summary of the dataset entry at position 12.
print_sequence_and_labels(rnas[12])