In [38]:
import pandas as pd
import math 
from collections import defaultdict 


In [6]:
# Load the curated SPARCLE table. With on_bad_lines="warn", pandas reports each
# malformed line as a warning and skips it instead of aborting the whole read.
df = pd.read_csv(
    "../data/SPARCLE_IDS_curated.csv",
    on_bad_lines="warn",
)

In [7]:
# Display the column labels of the loaded table.
df.columns

Index(['ArchId', 'CurVer', 'CurLabel', 'CurName', 'Shortnames', 'SpecificArch',
       'superfamilyarch', 'Taxid', 'IsSpec', 'Status', 'LabelState',
       'archLen_max', 'archLen_med', 'ArchId_string', 'SuperFamID_string',
       'TitleStrings'],
      dtype='object')

In [78]:
"""
The code processes rows in a DataFrame, generating two dictionaries (dict_feat and dict_idx_feat). 
dict_feat contains feature keys derived from specific columns, while dict_idx_feat is a nested dictionary 
associating row indices with feature keys and binary values indicating their presence in each row.
"""
# dict_feat is used purely as an insertion-ordered set of feature names: every key maps
# to "". A defaultdict created without a factory behaves like a plain dict.
dict_feat = defaultdict()
# dict_idx_feat maps a row's index label -> {feature key: 1} for the features present
# in that row. Rows that contribute no feature never get an entry here.
dict_idx_feat = defaultdict(dict)

for index, row in df.iterrows():
    #if type(row["Shortnames"]) == str:
    #    dict_feat["Shortnames_" + row["Shortnames"]] = ""
    #    dict_idx_feat[index]["Shortnames_" + row["Shortnames"]] = 1

    # Only string cells carry usable values; anything else (e.g. the float NaN pandas
    # produces for an empty cell) is skipped.
    specific_arch = row["SpecificArch"]
    if isinstance(specific_arch, str):
        # split() with no argument discards empty tokens, so repeated, leading or
        # trailing spaces cannot create a bogus empty "SpecificArch_" feature.
        for arch_id in specific_arch.split():
            feat_key = "SpecificArch_" + arch_id
            dict_feat[feat_key] = ""
            dict_idx_feat[index][feat_key] = 1

    # The superfamily architecture is used as one feature, not split into ids.
    superfamily_arch = row["superfamilyarch"]
    if isinstance(superfamily_arch, str):
        feat_key = "superfamilyarch_" + superfamily_arch
        dict_feat[feat_key] = ""
        dict_idx_feat[index][feat_key] = 1

In [79]:
# Path of the tab-separated feature-matrix file that this notebook writes.
output_mtx = "../data/SPARCLE_IDS_curated.mtx4ml.tsv"

In [80]:
"""
The code writes a tab-separated matrix file (output_mtx) where the first row consists 
of the "CurName" column and feature names obtained from a dictionary (dict_feat).
Subsequently, for each row in the DataFrame (df), it writes a line containing the 
"CurName" value followed by binary-encoded feature values, with missing features set to 0.
"""
# Sort the feature names once. The same column order is shared by the header and by
# every data row, so there is no need to re-sort the vocabulary for each row.
feature_names = sorted(dict_feat.keys())

with open(output_mtx, "w") as mtx:
    header = ["CurName"] + feature_names
    mtx.write("\t".join(header) + "\n")

    # Iterate by index label (not by position) so the lookup key is the same label
    # that df.iterrows() supplied when dict_idx_feat was filled.
    for idx, cur_name in df["CurName"].items():
        # .get() keeps the defaultdict from silently growing an empty entry for every
        # row that has no features; such rows are written as all zeros.
        row_feats = dict_idx_feat.get(idx, {})
        ele = [cur_name] + [str(row_feats.get(item, 0)) for item in feature_names]
        mtx.write("\t".join(ele) + "\n")

In [81]:
# Size of the feature vocabulary: how many distinct feature keys dict_feat holds
len(dict_feat)

48296

In [82]:
%%bash 
pip freeze 

anyio==4.3.0
appnope==0.1.4
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
attrs==23.2.0
Babel==2.14.0
beautifulsoup4==4.12.3
bleach==6.1.0
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
comm==0.2.1
debugpy==1.8.1
decorator==5.1.1
defusedxml==0.7.1
exceptiongroup==1.2.0
executing==2.0.1
fastjsonschema==2.19.1
fqdn==1.5.1
h11==0.14.0
httpcore==1.0.4
httpx==0.27.0
idna==3.6
importlib-metadata==7.0.1
ipykernel==6.29.3
ipython==8.18.1
isoduration==20.11.0
jedi==0.19.1
Jinja2==3.1.3
json5==0.9.17
jsonpointer==2.4
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
jupyter-events==0.9.0
jupyter-lsp==2.2.3
jupyter_client==8.6.0
jupyter_core==5.7.1
jupyter_server==2.12.5
jupyter_server_terminals==0.5.2
jupyterlab==4.1.2
jupyterlab_pygments==0.3.0
jupyterlab_server==2.25.3
MarkupSafe==2.1.5
matplotlib-inline==0.1.6
mistune==3.0.2
nbclient==0.9.0
nbconvert==7.16.1
nbformat==5.9.2
nest-asyncio==1.6.0
notebook==7.1.1
notebook_shim==0.2