In [17]:
import pandas as pd
import math 
from collections import defaultdict 

In [18]:
# Load the SPARCLE_IDS_curated_simplified CSV. With on_bad_lines="warn",
# malformed lines raise a warning and are skipped instead of aborting the read.
df = pd.read_csv(
    "../data/SPARCLE_IDS_curated_simplified.csv",
    on_bad_lines="warn",
)

In [19]:
# Display the column labels of the loaded table.
df.columns

Index(['ArchId', 'CurVer', 'CurLabel', 'CurName', 'CurName_simplified',
       'superfamilyarch', 'SpecificArch', 'TitleStrings', 'Taxid', 'IsSpec',
       'Status', 'LabelState', 'archLen_max', 'archLen_med', 'ArchId_string',
       'SuperFamID_string'],
      dtype='object')

In [20]:
"""
Build the feature vocabulary and the per-architecture feature assignments.

dict_feat     : feature key -> "" (only the keys matter; it acts as the set of
                all features seen). Keys are "SpecificArch_<token>" for every
                whitespace-separated token of the SpecificArch column and
                "superfamilyarch_<value>" for the superfamilyarch column.
dict_archID   : ArchId -> CurName_simplified. If an ArchId occurs in several
                rows, the value of the last such row is kept.
dict_idx_feat : ArchId -> {feature key: 1}. Features of rows that share an
                ArchId are merged into the same inner dictionary.

Cells whose value is not a string (presumably NaN for empty CSV fields)
contribute no features.
"""
# Plain dicts suffice here: a defaultdict without a factory behaves like a dict.
dict_feat = {}
dict_archID = {}
dict_idx_feat = defaultdict(dict)

# Iterate over just the four columns that are needed instead of materialising
# a full Series per row with iterrows().
feature_columns = zip(
    df["ArchId"],
    df["CurName_simplified"],
    df["SpecificArch"],
    df["superfamilyarch"],
)
for archID, cur_name, specific_arch, superfamily_arch in feature_columns:
    dict_archID[archID] = cur_name

    if isinstance(specific_arch, str):
        # split() without an argument collapses runs of whitespace and ignores
        # leading/trailing blanks, so no empty token can produce a bogus
        # "SpecificArch_" feature.
        for arch_token in specific_arch.split():
            feat_key = "SpecificArch_" + arch_token
            dict_feat[feat_key] = ""
            dict_idx_feat[archID][feat_key] = 1

    if isinstance(superfamily_arch, str):
        feat_key = "superfamilyarch_" + superfamily_arch
        dict_feat[feat_key] = ""
        dict_idx_feat[archID][feat_key] = 1

In [21]:
# Destination path of the tab-separated feature matrix written in a later cell.
output_mtx = "../data/SPARCLE_IDS_curated_simplified.mtx4ml.tsv"

In [22]:
"""
Write the binary feature matrix to output_mtx as a tab-separated file.

The header row is "CurName_simplified" followed by every feature key of
dict_feat in sorted order. One line is then written per entry of dict_archID
(i.e. per distinct ArchId, not per DataFrame row): the CurName_simplified value
stored for that ArchId, followed by 1 for each feature recorded for it in
dict_idx_feat and 0 for every other feature.
"""
with open(output_mtx, "w") as mtx:
    # Sort the feature keys once: the column order must be the same for the
    # header and for every data row, and re-sorting per row is wasted work.
    feature_names = sorted(dict_feat)
    header = ["CurName_simplified"] + feature_names
    mtx.write("\t".join(header) + "\n")

    for archID, cur_name in dict_archID.items():
        # ArchIds without any recorded feature yield an all-zero row.
        arch_feats = dict_idx_feat[archID]
        ele = [cur_name] + [str(arch_feats.get(item, 0)) for item in feature_names]
        mtx.write("\t".join(ele) + "\n")

In [23]:
# Size of the feature vocabulary (count of distinct feature keys).
len(dict_feat)

48296

In [24]:
%%bash 
# List the installed packages with their pinned versions (presumably to record
# the environment this notebook was run in).
# NOTE(review): this runs whichever `pip` is first on the shell PATH, which may
# differ from the kernel's environment — confirm.
pip freeze 

annotated-types==0.6.0
anyio==4.3.0
appnope==0.1.4
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
attrs==23.2.0
Babel==2.14.0
beautifulsoup4==4.12.3
bleach==6.1.0
blis==0.7.11
catalogue==2.0.10
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
cloudpathlib==0.16.0
comm==0.2.1
confection==0.1.4
cymem==2.0.8
debugpy==1.8.1
decorator==5.1.1
defusedxml==0.7.1
en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl#sha256=6a0f857a2b4d219c6fa17d455f82430b365bf53171a2d919b9376e5dc9be032e
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
exceptiongroup==1.2.0
executing==2.0.1
fastjsonschema==2.19.1
fqdn==1.5.1
h11==0.14.0
httpcore==1.0.4
httpx==0.27.0
idna==3.6
importlib-metadata==7.0.1
ipyke