In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the small test fixture that this notebook writes and later reloads.
path_to_test_data = "../tests/data/test_data.tsv"
# Tab-separated source table the fixture rows are sampled from
# (presumably a UniProtKB export for organism ID 3750 — confirm provenance).
path_to_apple_data = "../data/uniprotkb_organism_id_3750_2024_01_19.tsv"

## Inspect columns

In [3]:
# Columns to load from the source table; every other column is skipped.
selected_columns = [
    "Entry",
    "Protein names",
    "Gene Names",
    "Organism",
    "Organism (ID)",
    "BRENDA",
    "Gene Ontology IDs",
]

# Parse the tab-separated source table, restricted to the columns above.
df = pd.read_csv(
    filepath_or_buffer=path_to_apple_data,
    sep="\t",
    usecols=selected_columns,
)

# Preview the first rows of the loaded frame.
df.head()

  df = pd.read_csv(path_to_apple_data, sep="\t", usecols=selected_columns)


Unnamed: 0,Entry,Protein names,Gene Names,Organism,Organism (ID),Gene Ontology IDs,BRENDA
0,A0A0E3T3B5,"Aminoaldehyde dehydrogenase 2, peroxisomal (Md...",AMADH2,Malus domestica (Apple) (Pyrus malus),3750,GO:0005777; GO:0019145; GO:0019285; GO:0033737...,1.2.1.19;
1,A0A0E3T552,"Aminoaldehyde dehydrogenase 1, peroxisomal (Md...",AMADH1,Malus domestica (Apple) (Pyrus malus),3750,GO:0005829; GO:0019145; GO:0019285; GO:0033737...,1.2.1.19;
2,P37821,1-aminocyclopropane-1-carboxylate synthase (AC...,ACS-1,Malus domestica (Apple) (Pyrus malus),3750,GO:0009693; GO:0009835; GO:0016491; GO:0016847...,4.4.1.14;
3,Q64FJ6,Alcohol acyl transferase 1 allele RGa (AAT1-RG...,AAT1RGA AAT1,Malus domestica (Apple) (Pyrus malus),3750,GO:0006066; GO:0009723; GO:0009836; GO:0016746,
4,Q84LB2,"(E,E)-alpha-farnesene synthase (MdASF1) (EC 4....",AFS1 AFAR,Malus domestica (Apple) (Pyrus malus),3750,GO:0000287; GO:0005737; GO:0016102; GO:0016740...,4.2.3.46;


In [4]:
# Report, per selected column, whether it contains at least one missing value.
# The flags are computed for all columns in one pass and then printed in the
# same order as `selected_columns`.
nan_flags = df[selected_columns].isna().any()
for column, has_nan in nan_flags.items():
    print(f"{column} NaN: {has_nan}")

Entry NaN: False
Protein names NaN: False
Gene Names NaN: True
Organism NaN: False
Organism (ID) NaN: False
BRENDA NaN: True
Gene Ontology IDs NaN: True


## Add NaN row to test data and modify some columns

In [5]:
# Seed the test fixture with the first two rows that have a BRENDA value.
# - `.reset_index(drop=True)` relabels the rows 0..n-1, because later cells
#   address rows by label (`loc[0, ...]`, `loc[len(test_df.index)]`).
# - `.copy()` makes the fixture an independent frame, so the `.loc`
#   assignments in later cells modify `test_df` itself rather than a slice
#   of `df`.
test_df = (
    df.loc[df["BRENDA"].notna()]
    .head(2)
    .reset_index(drop=True)
    .copy()
)

In [6]:
# Synthetic row whose optional fields are all missing (NaN).
# Its keys mirror the columns loaded into `df` (see `selected_columns`), so
# every field of the appended row is set explicitly.
new_test_row = {
    "Entry": "test_entry",
    "Protein names": "test_name",
    "Gene Names": np.nan,
    "Organism": "Malus domestica",
    "Organism (ID)": 3750,
    "BRENDA": np.nan,
    "Gene Ontology IDs": np.nan,
}

# Guard against the row definition drifting out of sync with the frame.
missing_keys = set(test_df.columns) - set(new_test_row)
unknown_keys = set(new_test_row) - set(test_df.columns)
assert not missing_keys and not unknown_keys, (
    f"row/column mismatch: missing={missing_keys}, unknown={unknown_keys}"
)

# Append the row under the next integer label
# (assumes the existing row labels are 0..n-1).
test_df.loc[len(test_df.index)] = new_test_row

In [7]:
# Overwrite the BRENDA field of the row labelled 0 with a two-entry string.
# NOTE(review): "EC.1" looks like a deliberately malformed entry — confirm intent.
brenda_override = "EC.1; 1.1.1.1;"
test_df.loc[0, "BRENDA"] = brenda_override

In [8]:
# Give the row labelled 0 several space-separated gene names.
gene_names_override = "Gene1 Gene2 Gene3"
test_df.loc[0, "Gene Names"] = gene_names_override

In [9]:
# Display the assembled fixture before it is written to disk.
test_df

Unnamed: 0,Entry,Protein names,Gene Names,Organism,Organism (ID),Gene Ontology IDs,BRENDA
0,A0A0E3T3B5,"Aminoaldehyde dehydrogenase 2, peroxisomal (Md...",Gene1 Gene2 Gene3,Malus domestica (Apple) (Pyrus malus),3750,GO:0005777; GO:0019145; GO:0019285; GO:0033737...,EC.1; 1.1.1.1;
1,A0A0E3T552,"Aminoaldehyde dehydrogenase 1, peroxisomal (Md...",AMADH1,Malus domestica (Apple) (Pyrus malus),3750,GO:0005829; GO:0019145; GO:0019285; GO:0033737...,1.2.1.19;
2,test_entry,test_name,,Malus domestica,3750,,


In [10]:
# Persist the fixture as a tab-separated file. `index` is left at its default,
# so the row labels are written as well (as the first column of the file).
test_df.to_csv(path_or_buf=path_to_test_data, sep="\t")

## Load test data

In [6]:
# Reload the saved fixture, keeping only the selected columns and using the
# "Entry" column as the row index. The index column is named explicitly:
# a positional `index_col=0` is resolved against the `usecols` subset rather
# than against the file's first column, so it would silently depend on the
# column order in the file.
test_df = pd.read_csv(
    path_to_test_data,
    sep="\t",
    usecols=selected_columns,
    index_col="Entry",
)
test_df

Unnamed: 0_level_0,Protein names,Gene Names,Organism,Proteomes,Virus hosts,Fragment,Gene encoded by,Natural variant,Polymorphism,Active site,...,Temperature dependence,Gene Ontology (GO),Gene Ontology IDs,Topological domain,PubMed ID,DOI ID,Protein families,STRING,UniPathway,BRENDA
Entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A0E3T3B5,"Aminoaldehyde dehydrogenase 2, peroxisomal (Md...",Gene1 Gene2 Gene3,Malus domestica (Apple) (Pyrus malus),,,,,,,"ACT_SITE 260; /note=""Proton acceptor""; /eviden...",...,,peroxisome [GO:0005777]; 1-pyrroline dehydroge...,GO:0005777; GO:0019145; GO:0019285; GO:0033737...,,26296314.0,10.1016/j.febslet.2015.08.005,Aldehyde dehydrogenase family,,UPA00529;,EC.1; 1.2.1.19;
A0A0E3T552,"Aminoaldehyde dehydrogenase 1, peroxisomal (Md...",AMADH1,Malus domestica (Apple) (Pyrus malus),,,,,,,"ACT_SITE 260; /note=""Proton acceptor""; /eviden...",...,,cytosol [GO:0005829]; 1-pyrroline dehydrogenas...,GO:0005829; GO:0019145; GO:0019285; GO:0033737...,,26296314.0,10.1016/j.febslet.2015.08.005,Aldehyde dehydrogenase family,,UPA00529;,1.2.1.19;
test_entry,test_name;,,apple,,,,,,,,...,,,,,,,,,,
