# Data exploration


In [None]:
# %load_ext autoreload
# %autoreload 2

In [None]:
from kego.constants import PATH_DATA
from kego.files.files import list_files
import kego.plotting
import obonet
import pandas as pd
import polars as pl
import re
from Bio import SeqIO

In [None]:
# Show every file the project helper finds under PATH_DATA. list() materialises
# the helper's result so the notebook renders it as cell output.
list(list_files(PATH_DATA, return_absolute_path=True))

In [None]:
!head -3  '/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/sample_submission.tsv'

In [None]:
!head -10 '/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/Train/train_sequences.fasta'

In [None]:
!head  '/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/Train/train_terms.tsv'

In [None]:
def load_fasta(filepath):
    """Parse a FASTA file into a polars DataFrame with one row per record.

    Each record's ``name`` is split on ``"|"``: the first three fields become
    the ``database``, ``EntryID`` and ``gene_name`` columns, and the record's
    sequence is stored as a plain string in ``sequence``.

    Args:
        filepath: Path of the FASTA file; passed unchanged to ``open``.

    Returns:
        A ``pl.DataFrame`` with the columns ``database``, ``EntryID``,
        ``gene_name`` and ``sequence``.

    Raises:
        IndexError: If a record name has fewer than three ``|``-separated
            fields.
    """
    records = []
    with open(filepath) as fp:
        for record in SeqIO.parse(fp, "fasta"):
            # Split the header once instead of once per extracted field.
            fields = record.name.split("|")
            records.append(
                {
                    "database": fields[0],
                    "EntryID": fields[1],
                    "gene_name": fields[2],
                    "sequence": str(record.seq),
                }
            )

    return pl.DataFrame(records)


# Load the training FASTA into a DataFrame (database, EntryID, gene_name,
# sequence) and display it as the cell output.
sequences = load_fasta(
    "/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/Train/train_sequences.fasta"
)
sequences

In [None]:
# Read the tab-separated training terms file; the first row is used as the
# header (read_csv default), and the joins below rely on an "EntryID" column.
terms = pl.read_csv(
    "/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/Train/train_terms.tsv",
    separator="\t",
)
terms

In [None]:
# Attach the term annotations to the sequences via the shared "EntryID" key.
# No `how` is given, so polars' default join strategy applies
# (NOTE(review): presumably inner, i.e. unmatched rows are dropped - confirm).
train = sequences.join(terms, on="EntryID")
train

In [None]:
# Report how many distinct terms there are versus distinct sequence strings
# (the message uses distinct sequences as its count of unique proteins).
# Inner quotes are single so the f-string also parses on Python < 3.12, where
# reusing the enclosing quote character inside an f-string is a SyntaxError.
print(
    f"There are many terms ({train['term'].n_unique()}) compared to total number of unique proteins ({train['sequence'].n_unique()})"
)

In [None]:
# Sanity check: the length of the sequence stored for entry A0JNW5 should match
# the length listed at https://www.uniprot.org/uniprotkb/A0JNW5/entry
# (.item() requires that the filter matched exactly one row).
len(
    sequences.filter(pl.col("EntryID") == "A0JNW5")["sequence"]
    .to_numpy()
    .item()
)

In [None]:
# Histogram over the "sequence" column of the joined training table; the
# return value is discarded so the notebook shows only the figure.
# NOTE(review): the title and axis labels describe "terms per sequence", while
# the plotted column holds the sequence strings themselves - confirm that
# plot_histogram counts occurrences of each value, otherwise plot a per-sequence
# term count instead.
# NOTE(review): `log` is a list of strings here but a tuple in the TaxonID plot
# further down, with the entries in the opposite order - confirm the expected
# form and which axis each entry controls.
_ = kego.plotting.plot_histogram(
    "sequence",
    df=train,
    font_size=12,
    log=["false", "log"],
    title="Counts of sequences with that many terms",
    label_x="Terms available in sequence",
    label_y="# of sequences",
)

In [None]:
# Display the joined sequences + terms table again.
train

In [None]:
!head '/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv'

In [None]:
# Read the header-less, tab-separated taxonomy file, name its two columns, and
# store TaxonID as a string rather than whatever dtype read_csv inferred.
taxonomy = pl.read_csv(
    "/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv",
    separator="\t",
    has_header=False,
    new_columns=["EntryID", "TaxonID"],
).with_columns(pl.col("TaxonID").cast(pl.String))
taxonomy

In [None]:
# Add the TaxonID column to the training table via the shared "EntryID" key
# (default join strategy; NOTE(review): presumably inner - confirm).
train = train.join(taxonomy, on="EntryID")
train

In [None]:
# Histogram over the "TaxonID" column of the training table.
# NOTE(review): `log` is a tuple here but a list in the "sequence" histogram
# above, with the entries in the opposite order - confirm which axis each
# entry controls.
kego.plotting.plot_histogram("TaxonID", df=train, log=("log", "false"))

In [None]:
!head '/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/IA.tsv'

In [None]:
# Read the header-less, tab-separated IA.tsv file and name its two columns.
# NOTE(review): the first column is named "EntryID" like the protein key used
# elsewhere in this notebook; IA weights are presumably keyed by GO term ID
# rather than by protein - confirm against the file and rename if so.
ia = pl.read_csv(
    "/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/IA.tsv",
    separator="\t",
    has_header=False,
    new_columns=["EntryID", "weight"],
)
ia

In [None]:
# Persist the joined training table (sequences + terms + taxonomy) as CSV.
# The path is relative, so the file lands in the current working directory.
train.write_csv("train.csv")

In [None]:
# Persist the IA weights table as CSV in the current working directory.
ia.write_csv("ia.csv")

In [None]:
!head '/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/Test/testsuperset.fasta'

In [None]:
def load_test(filepath):
    """Parse the test FASTA file into a polars DataFrame, one row per record.

    Each record's ``description`` is split on single spaces: the first token
    becomes ``EntryID`` and the second ``TaxonID``. The record's sequence is
    stored as a plain string in ``sequence``.

    Args:
        filepath: Path of the FASTA file; passed unchanged to ``open``.

    Returns:
        A ``pl.DataFrame`` with the columns ``EntryID``, ``TaxonID`` and
        ``sequence``.

    Raises:
        IndexError: If a record description contains no space, i.e. has no
            second token to use as ``TaxonID``.
    """
    records = []
    with open(filepath) as fp:
        for record in SeqIO.parse(fp, "fasta"):
            # Split the header once instead of once per extracted field.
            tokens = record.description.split(" ")
            records.append(
                {
                    "EntryID": tokens[0],
                    "TaxonID": tokens[1],
                    "sequence": str(record.seq),
                }
            )

    return pl.DataFrame(records)


# Load the test superset FASTA into a DataFrame (EntryID, TaxonID, sequence).
test = load_test(
    "/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/Test/testsuperset.fasta"
)

In [None]:
# Persist the parsed test set as CSV in the current working directory.
test.write_csv("test.csv")

In [None]:
# NOTE(review): presumably (number of test proteins) x (maximum predicted terms
# per protein), i.e. an upper bound on submission rows - confirm both constants.
224_309 * 1500