# Data exploration


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from kego.constants import PATH_DATA
from kego.files.files import list_files
import kego.plotting
import obonet
import networkx
import numpy as np
import pandas as pd
import polars as pl
import os
import re
from Bio import SeqIO

In [None]:
# Show what is available under the "cafa" sub-directory of PATH_DATA,
# requesting absolute paths from the helper.
list(list_files(PATH_DATA / "cafa", return_absolute_path=True))

In [None]:
!head -3  '/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/sample_submission.tsv'

In [None]:
!head -10 '/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/Train/train_sequences.fasta'

In [None]:
!head  '/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/Train/train_terms.tsv'

In [None]:
def load_fasta(filepath):
    """Read a FASTA file whose record names look like ``db|EntryID|gene_name``.

    Each record name is split on ``|``; the first three fields become the
    ``database``, ``EntryID`` and ``gene_name`` columns and the residues are
    stored as a plain string in ``sequence``.

    Args:
        filepath: Path of the FASTA file to read.

    Returns:
        A ``pl.DataFrame`` with the string columns ``database``, ``EntryID``,
        ``gene_name`` and ``sequence``. The columns are present even when the
        file contains no records, so later joins on ``EntryID`` still work.

    Raises:
        IndexError: If a record name has fewer than three ``|``-separated
            fields.
    """
    records = []
    with open(filepath) as fp:
        for record in SeqIO.parse(fp, "fasta"):
            # Split the header once instead of once per extracted field.
            fields = record.name.split("|")
            records.append(
                {
                    "database": fields[0],
                    "EntryID": fields[1],
                    "gene_name": fields[2],
                    "sequence": str(record.seq),
                }
            )

    # An explicit schema keeps the column layout stable for an empty file.
    schema = {
        "database": pl.String,
        "EntryID": pl.String,
        "gene_name": pl.String,
        "sequence": pl.String,
    }
    return pl.DataFrame(records, schema=schema)


# Parse the training sequences (path is resolved relative to $HOME) and
# display the resulting frame as the cell output.
sequences = load_fasta(
    f"{os.environ['HOME']}/projects/kego/data/cafa/cafa-6-protein-function-prediction/Train/train_sequences.fasta"
)
sequences

In [None]:
# Read the tab-separated term annotations of the training proteins and
# display them.
terms = pl.read_csv(
    f"{os.environ['HOME']}/projects/kego/data/cafa/cafa-6-protein-function-prediction/Train/train_terms.tsv",
    separator="\t",
)
terms

In [None]:
# Attach the term annotations to the sequences via their shared EntryID
# column (the join uses polars' default strategy).
train = sequences.join(terms, on="EntryID")
train

In [None]:
# Compare the number of distinct terms with the number of distinct sequences.
# The column names use single quotes: reusing double quotes inside a
# double-quoted f-string only parses on Python 3.12+ (PEP 701).
print(
    f"There are many terms ({len(train['term'].unique())}) compared to total number of unique proteins ({len(train['sequence'].unique())})"
)

In [None]:
# Sanity check: length of the sequence stored for entry A0JNW5, which matches
# the UniProt record at https://www.uniprot.org/uniprotkb/A0JNW5/entry
# (.item() requires that exactly one row matches the filter).
len(sequences.filter(pl.col("EntryID") == "A0JNW5")["sequence"].to_numpy().item())

In [None]:
# Histogram over the "sequence" column of the joined training frame; the
# return value is discarded so only the figure is shown.
# NOTE(review): title and axis labels describe "terms per sequence", while the
# plotted column holds the raw sequence strings — confirm that plot_histogram
# counts occurrences per sequence, and which axis each `log` entry refers to.
_ = kego.plotting.plot_histogram(
    "sequence",
    df=train,
    font_size=12,
    log=["false", "log"],
    title="Counts of sequences with that many terms",
    label_x="Terms available in sequence",
    label_y="# of sequences",
)

In [None]:
# Display the joined sequence/term frame.
train

In [None]:
!head '/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv'

In [None]:
# The taxonomy file is read as a header-less TSV, so the two columns are
# named explicitly here.
taxonomy = pl.read_csv(
    f"{os.environ['HOME']}/projects/kego/data/cafa/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv",
    has_header=False,
    separator="\t",
    new_columns=["EntryID", "TaxonID"],
)
# Store taxon ids as strings — presumably so they are treated as labels
# rather than as numbers.
taxonomy = taxonomy.with_columns(pl.col("TaxonID").cast(pl.String))
taxonomy

In [None]:
# Add the TaxonID of every protein to the training frame (joined on EntryID).
train = train.join(taxonomy, on="EntryID")
train

In [None]:
# Histogram over the TaxonID column of the training frame.
# NOTE(review): `log` looks like one scale flag per axis — confirm the order
# against kego.plotting (a list is passed in the earlier histogram call).
kego.plotting.plot_histogram("TaxonID", df=train, log=("log", "false"))

In [None]:
!head '/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/IA.tsv'

In [None]:
# Read IA.tsv as a header-less, tab-separated two-column table.
# NOTE(review): the first column is named "EntryID" here, but it presumably
# holds term identifiers rather than protein entry ids — confirm before
# joining this frame on "EntryID".
ia = pl.read_csv(
    f"{os.environ['HOME']}/projects/kego/data/cafa/cafa-6-protein-function-prediction/IA.tsv",
    separator="\t",
    has_header=False,
    new_columns=["EntryID", "weight"],
)
ia

In [None]:
# Persist the joined training frame as train.csv (relative path, so it lands
# in the current working directory).
train.write_csv("train.csv")

In [None]:
# Persist the IA table as ia.csv in the current working directory.
ia.write_csv("ia.csv")

In [None]:
!head '/home/kristian/projects/kego/data/cafa/cafa-6-protein-function-prediction/Test/testsuperset.fasta'

In [None]:
def load_test(filepath):
    """Read a FASTA file whose description lines start with ``EntryID TaxonID``.

    Each record description is split on single spaces; the first field becomes
    ``EntryID``, the second ``TaxonID`` (kept as a string) and the residues are
    stored as a plain string in ``sequence``.

    Args:
        filepath: Path of the FASTA file to read.

    Returns:
        A ``pl.DataFrame`` with the string columns ``EntryID``, ``TaxonID`` and
        ``sequence``. The columns are present even when the file contains no
        records.

    Raises:
        IndexError: If a record description has fewer than two
            space-separated fields.
    """
    records = []
    with open(filepath) as fp:
        for record in SeqIO.parse(fp, "fasta"):
            # Split the description once instead of once per extracted field.
            fields = record.description.split(" ")
            records.append(
                {
                    "EntryID": fields[0],
                    "TaxonID": fields[1],
                    "sequence": str(record.seq),
                }
            )

    # An explicit schema keeps the column layout stable for an empty file.
    schema = {
        "EntryID": pl.String,
        "TaxonID": pl.String,
        "sequence": pl.String,
    }
    return pl.DataFrame(records, schema=schema)


# Parse the test superset sequences (path is resolved relative to $HOME).
test = load_test(
    f"{os.environ['HOME']}/projects/kego/data/cafa/cafa-6-protein-function-prediction/Test/testsuperset.fasta"
)

In [None]:
# Persist the parsed test frame as test.csv in the current working directory.
test.write_csv("test.csv")

In [None]:
# Display the parsed test frame.
test

In [None]:
# Back-of-the-envelope product.
# NOTE(review): presumably (number of test proteins) x (predictions per
# protein) — confirm what the two factors stand for.
224_309 * 1500

In [None]:
# Load the ontology shipped with the training data as a networkx graph.
# The path is built from $HOME like the other data files in this notebook,
# instead of a machine-specific absolute path.
graph = obonet.read_obo(
    f"{os.environ['HOME']}/projects/kego/data/cafa/cafa-6-protein-function-prediction/Train/go-basic.obo"
)

In [None]:
# Size of the ontology graph as reported by networkx.
graph.size()

In [None]:
# Lookup tables between node identifiers and their "name" attribute.
# Nodes without a name map to None in the forward table ...
id_to_name = {
    node_id: attrs.get("name") for node_id, attrs in graph.nodes(data=True)
}
# ... and are left out of the reverse table altogether.
name_to_id = {
    attrs["name"]: node_id
    for node_id, attrs in graph.nodes(data=True)
    if "name" in attrs
}

In [None]:
# Look up the name stored for one identifier.
id_to_name["GO:0006513"]

In [None]:
# Find edges to parent terms: walk the outgoing edges of the "pilus" node and
# print "source ⟶ edge key ⟶ target" using the human-readable names.
node = name_to_id["pilus"]
for child, parent, key in graph.out_edges(node, keys=True):
    print(f"• {id_to_name[child]} ⟶ {key} ⟶ {id_to_name[parent]}")

In [None]:
# Find edges to children terms: walk the incoming edges of the "pilus" node.
# NOTE(review): the loop unpacks each edge as (parent, child, key) and prints
# the second element on the left of the arrows — confirm the variable names
# match the edge direction used by the graph.
node = name_to_id["pilus"]
for parent, child, key in graph.in_edges(node, keys=True):
    print(f"• {id_to_name[child]} ⟵ {key} ⟵ {id_to_name[parent]}")

In [None]:
# Enumerate every simple path from "starch binding" to "molecular_function"
# and print each one as a chain of term names.
paths = networkx.all_simple_paths(
    graph, source=name_to_id["starch binding"], target=name_to_id["molecular_function"]
)
for path in paths:
    print("•", " ⟶ ".join(id_to_name[node] for node in path))

In [None]:
# Successors of one term, together with whether the graph is directed.
list(graph.successors("GO:0006513")), graph.is_directed()

In [None]:
# Check whether the ontology graph is a directed acyclic graph.
networkx.is_directed_acyclic_graph(graph)

In [None]:
import sklearn.ensemble
import sklearn.feature_extraction
import sklearn.decomposition

In [None]:
# Hash every training sequence into a 10000-dimensional feature vector.
# NOTE(review): no analyzer is configured, so the vectorizer's default
# tokenisation applies; a whitespace-free sequence is presumably treated as a
# single token — confirm whether a character n-gram analyzer was intended.
mixin = sklearn.feature_extraction.text.HashingVectorizer(n_features=10000)
sequences_encoded = mixin.fit_transform(train["sequence"])

In [None]:
# Plot the hashed feature vector of row 1012.
# Only that row is densified: calling toarray() on the whole matrix would
# materialise every row as a dense vector just to read a single one.
kego.plotting.plot_scatter(y=sequences_encoded[1012].toarray().ravel())

In [None]:
# Pick the sequence that occurs most often in the training frame: count the
# occurrences, sort by count in descending order and take the first entry.
sequence_sample = (train["sequence"].value_counts().sort(by="count", descending=True))[
    "sequence"
].to_numpy()[0]

In [None]:
# Length of the most frequent sequence.
len(sequence_sample)

In [None]:
class Dataset:
    def __init__(
        self,
        df,
        name="cafa",
        description="CAFA dataset",
        target="label",
        features: list[str] | None = ["text"],
    ):
        self.df = df
        self.name = name
        self.description = description
        self.target = target
        self.features = features

    def __repr__(self):
        return f"{self.name}: {self.description}"

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        return self.df.iloc[index]

    def __getattr__(self, name):
        if name in self.df.columns:
            return self.df[name]
        else:
            raise AttributeError(f"{name=} not found in {self.df.columns=}")


# Wrap the joined training frame. Target and features name columns that the
# frame actually contains ("term" from the annotations, "sequence" from the
# FASTA file) instead of the placeholder "label"/"text" names.
Dataset(
    df=train,
    name="train",
    description="Training set",
    target="term",
    features=["sequence"],
)

In [None]:
from transformers import BertModel, BertTokenizer
import re

# Tokenizer and model are loaded from a local directory. The path is built
# from $HOME like the data paths in this notebook, instead of a
# machine-specific absolute path.
tokenizer = BertTokenizer.from_pretrained(
    f"{os.environ['HOME']}/projects/kego/model_data/prot_bert", do_lower_case=False
)
model = BertModel.from_pretrained(
    f"{os.environ['HOME']}/projects/kego/model_data/prot_bert"
)
# Residues are passed space-separated; the characters U, Z, O and B are
# replaced by X before tokenising.
sequence_Example = "A E T C Z A O"
sequence_Example = re.sub(r"[UZOB]", "X", sequence_Example)
encoded_input = tokenizer(sequence_Example, return_tensors="pt")
output = model(**encoded_input)
# Keep only the first element of the model output
# (presumably the per-token hidden states — confirm against the model docs).
output = output[0]
print(output)