In [1]:
import sys
import pandas as pd
import ankh
import os
import requests
import h5py

In [2]:
def load_dict_from_hdf5(filename):
    """
    Load a dictionary with string keys and NumPy array values from an HDF5 file.

    Every key found at the root of the file is read eagerly into memory, so
    the returned arrays remain usable after the file has been closed.

    Parameters:
    filename (str): Name of the HDF5 file to load the data from.

    Returns:
    dict: Dictionary with string keys and NumPy array values.
    """
    with h5py.File(filename, "r") as f:
        # Index with Ellipsis rather than [:]: a slice read is rejected by
        # scalar (0-d) datasets, whereas [...] reads a dataset of any rank in
        # full and still yields an ndarray.
        return {key: f[key][...] for key in f.keys()}

In [3]:
# Put the parent directory on the import path (the next cell imports from
# `scripts.utils`, presumably located there). Guarded so that re-running this
# cell does not stack duplicate entries on sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [4]:
from scripts.utils import convert_fasta_to_df, write_fasta, filter_df, prepare_neg_samples

In [4]:
# Load the external TrEMBL test set into `df`.
# NOTE(review): `df` is rebound by the following convert_fasta_to_df cell before
# any visible use of this frame — confirm whether this load is still needed.
df = pd.read_csv("../data/external/trembl_test.csv")

In [5]:
# Parse the merged FASTA file with the project helper convert_fasta_to_df.
# The helper's return type is not visible here — presumably a DataFrame with an
# "identifier" column, since `df` is merged on that key below (TODO confirm).
df = convert_fasta_to_df("../data/not_annotated/raw_fasta/merged.fasta")

In [6]:
# Build `go_no_annot` from the YAML file via the project helper
# prepare_neg_samples (presumably the negative / not-annotated sample table;
# its schema is not visible here — TODO confirm).
go_no_annot = prepare_neg_samples("../data/not_annotated/not_annotated.yml")

In [14]:
# Join `df` with `go_no_annot` on the shared "identifier" column. No `how=` is
# given, so pandas uses its default inner join: only identifiers present in
# both frames are kept.
out_df = df.merge(go_no_annot, on="identifier")

In [12]:
# Load the TrEMBL MF test split; its "label" column is tallied in the next cell.
test_1 = pd.read_csv("../data/splits/test_trembl_MF.csv")

In [14]:
# Class balance of the test split (rows per label value).
test_1["label"].value_counts()

0    282
1    159
Name: label, dtype: int64

In [11]:
# Load the taxonomy-annotated pdb20000 table. This rebinds `df`, replacing the
# frame assigned in the earlier cells.
df = pd.read_csv("../data/processed/pdb20000_taxon.csv")

In [12]:
# Number of entries per kingdom.
df["kingdom"].value_counts()

Bacteria         10958
Metazoa           4290
Viridiplantae     1903
Fungi             1640
Archaea            523
Protists           362
Viruses            309
Name: kingdom, dtype: int64

In [13]:
# Display the frame (the notebook renders the cell's last expression).
df

Unnamed: 0,identifier,organism,kingdom
0,P05102,Haemophilus parahaemolyticus,Bacteria
1,P0CY10,Saccharomyces cerevisiae,Fungi
2,P02302,Xenopus laevis,Metazoa
3,Q00422,Mus musculus,Metazoa
4,P13123,Sulfolobus acidocaldarius (strain ATCC 33909 /...,Archaea
...,...,...,...
19995,P53107,Saccharomyces cerevisiae (strain ATCC 204508 /...,Fungi
19996,Q12172,Saccharomyces cerevisiae (strain ATCC 204508 /...,Fungi
19997,P0ACN2,Escherichia coli (strain K12),Bacteria
19998,Q9JMT8,Escherichia coli (strain K12),Bacteria


In [71]:
# Remove the "cluster" column if it is present. errors="ignore" keeps the cell
# re-runnable: a second run, or a frame that was loaded without that column,
# would otherwise raise KeyError. The former axis=1 was redundant because
# `columns=` already selects the axis, and rebinding replaces inplace=True.
df = df.drop(columns=["cluster"], errors="ignore")

In [7]:
# Remove the "sequence" and "label" columns if they are present.
# errors="ignore" keeps the cell re-runnable: a second run, or a frame that was
# loaded without those columns, would otherwise raise KeyError. The former
# axis=1 was redundant because `columns=` already selects the axis, and
# rebinding replaces inplace=True.
df = df.drop(columns=["sequence", "label"], errors="ignore")

In [14]:
# Persist the current `df` to the taxon directory; index=False omits the row
# index from the written CSV.
df.to_csv("../data/taxon/pdb20000.csv", index=False)

In [9]:
# Load the stored embeddings. Despite the `_df` suffix this is a plain dict
# (key -> NumPy array), as returned by load_dict_from_hdf5.
# NOTE(review): the path climbs five directories relative to the working
# directory, so it only resolves from one specific checkout location.
embed_df = load_dict_from_hdf5("../../../../../ssd2/dbp_finder/ankh_embeddings/pdb2272_2d.h5")

In [10]:
# List every key stored in the embeddings dict (the output is long).
embed_df.keys()

dict_keys(['O00287', 'O00488', 'O00571', 'O00841', 'O04006', 'O05023', 'O05052', 'O05103', 'O05396', 'O05441', 'O05519', 'O05977', 'O06719', 'O06987', 'O07127', 'O07402', 'O07458', 'O07634', 'O07827', 'O08976', 'O10308', 'O10360', 'O10368', 'O13339', 'O13360', 'O13472', 'O13606', 'O13705', 'O13767', 'O13807', 'O13862', 'O13988', 'O14009', 'O14017', 'O14096', 'O14746', 'O15315', 'O16810', 'O17582', 'O22010', 'O24606', 'O27318', 'O27652', 'O27941', 'O28069', 'O28078', 'O28092', 'O28145', 'O28146', 'O28150', 'O28160', 'O28251', 'O28269', 'O28280', 'O28286', 'O28420', 'O28561', 'O28575', 'O28640', 'O28675', 'O28773', 'O28838', 'O28985', 'O29292', 'O29347', 'O29506', 'O29560', 'O29749', 'O29831', 'O29848', 'O30138', 'O30332', 'O30477', 'O31178', 'O31216', 'O31454', 'O31663', 'O31690', 'O31697', 'O31797', 'O31829', 'O32138', 'O32255', 'O32272', 'O32333', 'O33060', 'O33683', 'O33813', 'O33817', 'O34162', 'O34621', 'O34827', 'O34887', 'O34926', 'O35003', 'O35892', 'O42131', 'O43037', 'O43102',

In [13]:
import numpy as np

In [14]:
# Strip length-1 axes from every stored array, writing the result back under
# the same key (existing keys only, so the dict size never changes mid-loop).
# NOTE(review): np.squeeze without `axis` removes *every* singleton axis —
# confirm no array has a legitimate length-1 dimension.
for entry_id, array in embed_df.items():
    embed_df[entry_id] = np.squeeze(array)

In [16]:
# Spot-check the shape of one array after squeezing.
embed_df["O00841"].shape

(791, 1536)

In [17]:
def save_dict_to_hdf5(data_dict, filename):
    """
    Write each entry of a dictionary to an HDF5 file as its own dataset.

    The file is opened in "w" mode, so an existing file at `filename` is
    replaced. One dataset is created per dictionary entry, named after the key.

    Parameters:
    data_dict (dict): Dictionary with string keys and NumPy array values.
    filename (str): Name of the HDF5 file to save the data.
    """
    with h5py.File(filename, "w") as h5_out:
        for name in data_dict:
            h5_out.create_dataset(name, data=data_dict[name])

In [18]:
# Write the squeezed arrays back to disk.
# NOTE(review): this is the same path the embeddings were loaded from, and
# save_dict_to_hdf5 opens it in "w" mode — the original file is overwritten.
save_dict_to_hdf5(embed_df, "../../../../../ssd2/dbp_finder/ankh_embeddings/pdb2272_2d.h5")

In [15]:
# Read the three tables that are joined on "identifier" in the next cell:
# prediction scores, the not-annotated sequence table, and its taxonomy table.
score_df, id_df, taxon_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/not_annotated/predictions.csv",
        "../data/not_annotated/not_annotated_seqs.csv",
        "../data/taxon/not_annotated_seqs.csv",
    )
)

In [16]:
# Attach the id table to the scores first, then the taxonomy table; both joins
# use the shared "identifier" key with pandas' default (inner) join.
score_df = pd.merge(score_df, id_df, on="identifier")
prediction_df = pd.merge(score_df, taxon_df, on="identifier")

In [20]:
# Order the predictions from highest to lowest score.
prediction_df = prediction_df.sort_values("score", ascending=False)

In [28]:
# Distinct organism values present in the predictions (missing values included).
prediction_df["organism"].unique()

array(['Bacillus subtilis (strain 168)', 'Arabidopsis thaliana',
       'Escherichia coli (strain K12)', 'Homo sapiens',
       'Komagataella pastoris',
       'Saccharomyces cerevisiae (strain ATCC 204508 / S288c)', nan],
      dtype=object)

In [29]:
# For every organism and every score cut-off, count the predictions whose
# score is strictly above the cut-off. Results are keyed "<organism>_<cut-off>".
# Grouping on the string-cast column turns missing organisms into the literal
# "nan", so they form their own group.
thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]
grouped = prediction_df.groupby(prediction_df["organism"].astype(str))
stats = {
    f"{organism}_{cutoff}": int((members["score"] > cutoff).sum())
    for organism, members in grouped
    for cutoff in thresholds
}

In [34]:
# Display the merged, score-sorted predictions.
prediction_df

Unnamed: 0,identifier,score,sequence,organism,kingdom
57449,O07917,9.999958e-01,MAYVKATAILPEKLISEIQKYVQGKTIYIPKPESSHQKWGACSGTR...,Bacillus subtilis (strain 168),Bacteria
9804,A0A1I9LSP1,9.999943e-01,MNNNIFSTTTTINDDYMLFPYNDHYSSQPLLPFSPSSSINDILIHS...,Arabidopsis thaliana,Viridiplantae
60600,P0AD37,9.999942e-01,MTPDELARLTGYSRQTINKWVRKEGWTTSPKPGVQGGKARLVHVNE...,Escherichia coli (strain K12),Bacteria
11514,A0A1P8AWJ0,9.999926e-01,MFPSLDTNGYDLFDPFIPHQTTMFPSFITHIQSPNSHHHYSSPSFP...,Arabidopsis thaliana,Viridiplantae
58007,O31637,9.999917e-01,MTDQMIAWEIEEWIRDYKFMLREIKRLNRVLNKVDFISTKLTATYG...,Bacillus subtilis (strain 168),Bacteria
...,...,...,...,...,...
57838,O23693,5.071109e-07,MEHMMKEGRSLAETPTYSVASVVTVLVFVCFLVERAIYRFGKWLKK...,Arabidopsis thaliana,Viridiplantae
81199,Q9LUH3,4.928522e-07,MADPTSKDDHDGEGGRDKSSTFVQKLIDVEEAKTQIIYSLPMIFTN...,Arabidopsis thaliana,Viridiplantae
10842,A0A1P8AS85,4.733958e-07,MFMNQILLLLHQDPQIAELAGVYCLWLVPALFGYSVLESLVRYFQS...,Arabidopsis thaliana,Viridiplantae
35978,E7EX57,4.384196e-07,MEAPEEPAPVRGGPEATLEVRGSRCLRLSAFREELRALLVLAGPAF...,Homo sapiens,Metazoa


In [35]:
# Persist the merged predictions; index=False omits the row index.
# NOTE(review): this is the same path that was read into `score_df` earlier, so
# the input file is overwritten with the merged table. Re-running the notebook
# would then merge onto columns that are already present — confirm intended.
prediction_df.to_csv("../data/not_annotated/predictions.csv", index=False)