The croissant was created with reference to the [mlcroissant-recipes-introduction-notebook](https://github.com/mlcommons/croissant/blob/main/python/mlcroissant/recipes/introduction.ipynb).

Install the following packages if necessary (`mlcroissant` requires `python` 3.10+).

In [None]:
# ! pip install mlcroissant charset-normalizer==3.1.0 gitpython

First, declare where the dataset files are located. Here we use `FileObject`s only (no `FileSet`s).

In [None]:
import mlcroissant as mlc

def _level_csv(level, label=None):
    """Describe the csv file of one Hi-TPH level stored in the GitHub repository.

    `level` is the roman numeral used in the file name and in the id;
    `label` is the wording used in the description and defaults to `level`.
    """
    shown = level if label is None else label
    file_id = f"level-{level}.csv"      # the id drops the prefix used in content_url
    return mlc.FileObject(
        id=file_id,
        name=file_id,
        description=f"Hi-TPH level {shown} csv file is hosted on the GitHub repository.",
        content_url=f"data/Hi-TpH-level-{level}.csv",   # path = contained_in + content_url
        contained_in=["github-repository"],
        encoding_format="text/csv",
    )


# Resources of the dataset: the GitHub repository that hosts Hi-TPH, followed by
# the level I-IV csv files it contains (each level has different kinds of columns).
distribution = [
    mlc.FileObject(
        id="github-repository",
        name="github-repository",
        description="Hi-TPH repository on GitHub.",
        content_url="https://github.com/Jiadong001/Hi-TpH",
        encoding_format="git+https",
        sha256="main",
    ),
    _level_csv("I"),
    _level_csv("II", label="II A"),
    _level_csv("III"),
    _level_csv("IV"),
]
distribution

`RecordSet`: Load records from one/many csv files.

In [None]:
def _chain_descriptions(chain, gene_segments):
    """Return the four column descriptions of one TCR chain ("alpha" or "beta").

    `gene_segments` is the gene-annotation wording used in the description of
    the reconstructed variable domain (it differs between the two chains).
    """
    return {
        f"{chain}.cdr3": f"Complementarity Determining Region 3 (CDR3) amino acid sequence of TCR {chain} chain.",
        f"{chain}.v": f"V gene segment of TCR {chain} chain.",
        f"{chain}.j": f"J gene segment of TCR {chain} chain.",
        f"{chain}.vseq.reconstructed": (
            f"Reconstructed variable domain sequence of TCR {chain} chain "
            f"using {gene_segments} gene annotations and CDR3 sequences."
        ),
    }


# Column name -> human-readable description, grouped as antigen / TCR chains / HLA.
components2desc_dict = {
    "antigen.epitope": "Peptide sequence of the antigen epitope.",
    **_chain_descriptions("alpha", "V/J"),
    **_chain_descriptions("beta", "V/(D/)J"),
    "hla.allele": "HLA allele name.",
    "hla.full.seq": "Full HLA protein sequence.",
    "hla.clip.seq": "Amino acid sequence of HLA alpha-1 and alpha-2 domains, which is clipped from the full HLA protein sequence.",
    "hla.short.seq": "HLA pseudo sequence, extracted from HLA alpha-1 and alpha-2 domains.",
}

# Column groups that recur across levels; they are unpacked into a fresh list per level.
_phla_columns = ["antigen.epitope", "hla.allele"]
_hla_seq_columns = ["hla.full.seq", "hla.clip.seq", "hla.short.seq"]

# Level name -> ordered list of csv columns exposed for that level.
level2components_dict = {
    "level-I": ["antigen.epitope", "beta.cdr3"],
    "level-II": [*_phla_columns, "beta.cdr3", *_hla_seq_columns],
    "level-III": [*_phla_columns, "alpha.cdr3", "beta.cdr3", *_hla_seq_columns],
    "level-IV": [
        *_phla_columns,
        "alpha.v", "alpha.j", "alpha.cdr3",
        "beta.v", "beta.j", "beta.cdr3",
        "alpha.vseq.reconstructed", "beta.vseq.reconstructed",
        *_hla_seq_columns,
    ],
}

In [None]:
def _text_field(level, component):
    """Build the text Field that reads column `component` from the csv FileObject of `level`."""
    column_source = mlc.Source(
        file_object=f"{level}.csv",
        # The field is extracted from the column of the same name.
        extract=mlc.Extract(column=component),
    )
    return mlc.Field(
        id=f"{level}/{component}",      # e.g. "level-I/antigen.epitope"
        name=component,
        description=components2desc_dict[component],
        data_types=mlc.DataType.TEXT,
        source=column_source,
    )


# One RecordSet per level, with one field per column listed for that level.
record_sets = []
for level, components in level2components_dict.items():
    level_fields = [_text_field(level, component) for component in components]
    record_set = mlc.RecordSet(id=level, name=level, fields=level_fields)
    print(record_set.fields)
    record_sets.append(record_set)

Create the metadata.

In [4]:
# Dataset description; plain text or markdown are both allowed here.
hi_tph_description = "A Large-Scale Hierarchical Dataset for TCR-pHLA Binding Prediction."

metadata = mlc.Metadata(
    # Identity of the dataset.
    name="Hi-TPH",
    description=hi_tph_description,
    url="https://github.com/Jiadong001/Hi-TpH",
    license="https://creativecommons.org/licenses/by-nc/4.0/",
    # Not filled in yet (cite_as is to be updated).
    cite_as=None,
    version=None,
    date_published=None,
    # Resources and record sets of the dataset.
    distribution=distribution,
    record_sets=record_sets,
)

In [None]:
# Show the report of the issues recorded on the metadata object.
issues_report = metadata.issues.report()
print(issues_report)

Save croissant (metadata)

In [None]:
import json

# Serialise first: if to_json() or json.dumps() raises, the file has not been
# opened yet, so an existing croissant.json is not truncated.
content = json.dumps(metadata.to_json(), indent=2)
print(content)

# Explicit encoding keeps the written bytes independent of the platform default.
with open("croissant.json", "w", encoding="utf-8") as f:
    f.write(content)
    f.write("\n")       # Terminate file with newline