In [1]:
import pandas as pd
from functools import partial
from localcider.sequenceParameters import SequenceParameters
from pathlib import Path
from sklearn.preprocessing import MultiLabelBinarizer
from collections import defaultdict, Counter

**Import UniProt proteome**

In [2]:
uniprot_data_all = pd.read_csv('data/uniprot_data/uniprotkb_taxonomy_id_10090_2025_03_25.tsv', sep='\t')
uniprot_data_all

  uniprot_data_all = pd.read_csv('/ceph/hpc/home/novljanj/data_storage/projects/nucleolus_enriched_proteins/data/uniprot_data/uniprotkb_taxonomy_id_10090_2025_03_25.tsv', sep='\t')


Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Compositional bias,Mass,Sequence,...,Redox potential,Region,Ensembl,Coiled coil,Domain [CC],Domain [FT],Motif,Repeat,Zinc finger,Gene Ontology (cellular component)
0,A0A075F5C6,unreviewed,A0A075F5C6_MOUSE,Heat shock factor 1 (Heat shock transcription ...,Hsf1,Mus musculus (Mouse),531,"COMPBIAS 343..355; /note=""Polar residues""; /ev...",57879,MDLAVGPGAAGPSNVPAFLTKLWTLVSDPDTDALICWSPSGNSFHV...,...,,"REGION 272..327; /note=""Disordered""; /evidence...",ENSMUST00000228371.2;,,,"DOMAIN 57..81; /note=""HSF-type DNA-binding""; /...",,,,centrosome [GO:0005813]; cytosol [GO:0005829];...
1,A0A087WPF7,reviewed,AUTS2_MOUSE,Autism susceptibility gene 2 protein homolog,Auts2 Kiaa0442,Mus musculus (Mouse),1261,"COMPBIAS 8..17; /note=""Basic residues""; /evide...",138920,MDGPTRGHGLRKKRRSRSQRDRERRSRAGLGTGAAGGIGAGRTRAP...,...,,"REGION 1..88; /note=""Disordered""; /evidence=""E...",ENSMUST00000161226.11 [A0A087WPF7-1];ENSMUST00...,,DOMAIN: The Pro-rich region is important for t...,,,,,cytoplasm [GO:0005737]; cytoskeleton [GO:00058...
2,A0A087WPT2,unreviewed,A0A087WPT2_MOUSE,Prostaglandin-endoperoxide synthase 2,Ptgs2,Mus musculus (Mouse),62,,7199,MSTGFDQYKCDCTRTGFYGENCTTPEFLTRIKLLLKPTPNTVHYIL...,...,,,ENSMUST00000190784.2;,,,"DOMAIN 1..23; /note=""EGF-like""; /evidence=""ECO...",,,,
3,A0A087WRK1,unreviewed,A0A087WRK1_MOUSE,"Predicted gene, 20814 (Predicted gene, 20855) ...",Gm20905 Gm20814 Gm20835 Gm20850 Gm20855 Gm2086...,Mus musculus (Mouse),222,,25620,MRRMALKKLKVIPKEGYLLLLDFDDEDDDIKVSEEALSEVKSPAFD...,...,,"REGION 39..64; /note=""Disordered""; /evidence=""...",ENSMUST00000185240.2;ENSMUST00000185245.2;ENSM...,,,"DOMAIN 112..220; /note=""XLR/SYCP3/FAM9""; /evid...",,,,cytoplasm [GO:0005737]; nucleus [GO:0005634]
4,A0A087WRT4,unreviewed,A0A087WRT4_MOUSE,FAT atypical cadherin 1,Fat1,Mus musculus (Mouse),4602,"COMPBIAS 4258..4267; /note=""Polar residues""; /...",507531,MGRHLTLLLLLLLFLQQFGDSDGSQRLEPTPPIQFTHFQYNVTVHE...,...,,"REGION 4256..4292; /note=""Disordered""; /eviden...",ENSMUST00000189017.8;,,,"DOMAIN 36..150; /note=""Cadherin""; /evidence=""E...",,,,apical plasma membrane [GO:0016324]; cell junc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87487,Z4YN82,unreviewed,Z4YN82_MOUSE,Syntaphilin,Snph,Mus musculus (Mouse),87,,9364,MAMSLQGSRRASAGSRRRTSPPVSVRDAYGTSSLSSSSNSGSCKGS...,...,,"REGION 1..74; /note=""Disordered""; /evidence=""E...",ENSMUST00000137936.2;,,,,,,,membrane [GO:0016020]
87488,Z4YN86,unreviewed,Z4YN86_MOUSE,Translocase of inner mitochondrial membrane 22,Timm22,Mus musculus (Mouse),116,,12689,MAATAPKAGGSAPEAAGSAEAPLQYSLLLQYLVGDKRQPRLLEPGS...,...,,,ENSMUST00000152183.2;,,,,,,,
87489,Z4YN92,unreviewed,Z4YN92_MOUSE,A kinase anchor protein 17B,Akap17b,Mus musculus (Mouse),406,"COMPBIAS 24..52; /note=""Basic and acidic resid...",47765,MFDTTKHFSEGAIQRRNQERLKLQELEEERKKEKKREEEVAERKRK...,...,,"REGION 24..90; /note=""Disordered""; /evidence=""...",ENSMUST00000133980.2;,,,,,,,
87490,Z4YN97,unreviewed,Z4YN97_MOUSE,Adenylate kinase 1,Ak1,Mus musculus (Mouse),89,,9541,MEEKLKKAKIIFVVGGPGSGKGTQCEKIVQKYGYTHLSTGDLLRAE...,...,,,ENSMUST00000156578.8;,,,,,,,cytoplasm [GO:0005737]


**Select the enriched proteins in each nucleolar compartments based on p-value cutoff**

In [4]:
proteome_data = pd.read_excel("data/proteome_data/Df.res_Nucleolar dataset_20250210.xlsx")

S45_genes = set(proteome_data[(proteome_data["lab5_S47.min.p"] < 0.05) & (proteome_data["lab5_S47.min"] > 0.0)]["Majority.protein.IDs"].tolist())
ITS2_genes = set(proteome_data[(proteome_data["lab5_ITS2.min.p"] < 0.05) & (proteome_data["lab5_ITS2.min"] > 0.0)]["Majority.protein.IDs"].tolist())

print("S45_genes:   ", len(S45_genes))
print("ITS2_genes:  ", len(ITS2_genes))

S47_genes_vs_S45 = set(proteome_data[(proteome_data["lab5_S47.min.p"] < 0.05) & (proteome_data["lab5_S47.min"] < 0.0)]["Majority.protein.IDs"].tolist())
S47_genes_vs_ITS2  = set(proteome_data[(proteome_data["lab5_ITS2.min.p"] < 0.05) & (proteome_data["lab5_ITS2.min"] < 0.0)]["Majority.protein.IDs"].tolist())
print("S47_genes_vs_S45: ", len(S47_genes_vs_S45))
print("S47_genes_vs_ITS2: ", len(S47_genes_vs_ITS2))

S47_genes = S47_genes_vs_S45.intersection(S47_genes_vs_ITS2)
print("S47_genes:   ", len(S47_genes))

S45_genes:    84
ITS2_genes:   75
S47_genes_vs_S45:  182
S47_genes_vs_ITS2:  393
S47_genes:    118


**Clean up the dataframe and format**

In [5]:
rows = []

lists = [S45_genes, S47_genes]
names = ["S45", "S47"]

for i, l in enumerate(lists):
    for gene in l:
        if ";" in gene:
            gene_sep = gene.split(";")
            for g in gene_sep:
                if g not in uniprot_data_all["Entry"].values:
                    print(f"{g} not in uniprot data")
                    continue

                rows.append({
                    "Majority.protein.IDs": gene,
                    "Reviewed": uniprot_data_all[uniprot_data_all["Entry"] == g]["Reviewed"].values[0],
                    "Protein ID": g,
                    "Sequence": uniprot_data_all[uniprot_data_all["Entry"] == g]["Sequence"].values[0],
                    "Label": names[i],
                })
        else:
            g = gene
            if g not in uniprot_data_all["Entry"].values:
                print(f"{g} not in uniprot data")
                continue
            seq = uniprot_data_all[uniprot_data_all["Entry"] == g]["Sequence"].values[0]
            reviewed = uniprot_data_all[uniprot_data_all["Entry"] == g]["Reviewed"].values[0]
            rows.append({
                "Majority.protein.IDs": gene,
                "Reviewed": uniprot_data_all[uniprot_data_all["Entry"] == g]["Reviewed"].values[0],
                "Protein ID": g,
                "Sequence": uniprot_data_all[uniprot_data_all["Entry"] == g]["Sequence"].values[0],
                "Label": names[i],
            })

cleaned_data = pd.DataFrame(rows)
cleaned_data["Label"].value_counts()

P49945 not in uniprot data
A0A2I3BPG9 not in uniprot data


Label
S47    122
S45     91
Name: count, dtype: int64

**Only select the review-ed or longest protein per Majority.protein.IDs**

In [6]:
import pandas as pd
cleaned_data["SeqLength"] = cleaned_data["Sequence"].apply(len)

final_rows = []

for protein_group, group_df in cleaned_data.groupby(["Majority.protein.IDs", "Label"]):
    reviewed_df = group_df[group_df["Reviewed"] == "reviewed"]

    if len(reviewed_df) == 1:
        final_rows.append(reviewed_df.iloc[0])

    elif len(reviewed_df) > 1:
        reviewed_df["SeqLength"] = reviewed_df["Sequence"].apply(len)
        longest = reviewed_df.sort_values("SeqLength", ascending=False).iloc[0]
        final_rows.append(longest)

    else:
        group_df["SeqLength"] = group_df["Sequence"].apply(len)
        longest = group_df.sort_values("SeqLength", ascending=False).iloc[0]
        final_rows.append(longest)

representations_45S_47S_df = pd.DataFrame(final_rows).drop(columns=["SeqLength"]).reset_index(drop=True)
representations_45S_47S_df["Length"] = representations_45S_47S_df["Sequence"].apply(len)
representations_45S_47S_df["Label"].value_counts()

Label
S47    118
S45     84
Name: count, dtype: int64

**Calculate single score metrics from localCIDER**

In [7]:
import pandas as pd
from functools import partial
from localcider.sequenceParameters import SequenceParameters
from tqdm import tqdm

_METRICS = {
    "length"                     : ("get_length",                 {}),
    "FCR"                        : ("get_FCR",                    {}),
    "NCPR"                       : ("get_NCPR",                   {}),
    "isoelectric_point"          : ("get_isoelectric_point",      {}),
    "molecular_weight"           : ("get_molecular_weight",       {}),
    "count_negative"             : ("get_countNeg",               {}),
    "count_positive"             : ("get_countPos",               {}),
    "count_neutral"              : ("get_countNeut",              {}),
    "fraction_negative"          : ("get_fraction_negative",      {}),
    "fraction_positive"          : ("get_fraction_positive",      {}),
    "fraction_expanding"         : ("get_fraction_expanding",     {}), 
    "fraction_disorder_promoting": ("get_fraction_disorder_promoting", {}),
    "kappa"                      : ("get_kappa",                  {}),
    "Omega"                      : ("get_Omega",                  {}),
    "mean_net_charge"            : ("get_mean_net_charge",        {}),
    "mean_hydropathy"            : ("get_mean_hydropathy",        {}),
    "uversky_hydropathy"         : ("get_uversky_hydropathy",     {}),
    "PPII_propensity"            : ("get_PPII_propensity",        {}),
    "delta"                      : ("get_delta",                  {}),
    "delta_max"                  : ("get_deltaMax",               {}),
}

def _aa_fraction_columns():
    aas = "ACDEFGHIKLMNPQRSTVWY"
    return {f"frac_{aa}": (f"_aa_fraction_{aa}", {}) for aa in aas}

_METRICS.update(_aa_fraction_columns())

def add_localcider_features(df: pd.DataFrame,
                            seq_col: str = "Sequence",
                            drop_bad: bool = True) -> pd.DataFrame:


    def _get_amino_acid_fraction(sp, aa):
        return sp.get_amino_acid_fractions().get(aa, pd.NA)
    
    out = df.copy()
    def preprocess_seq(seq: str):
        if "U" in seq:
            print(f"Replacing U → C in sequence: {seq}")
            seq = seq.replace("U", "C")
        return seq
    
    seq_params = []

    for idx, seq in tqdm(out[seq_col].items()):
        try:
            seq = preprocess_seq(seq)
            seq_params.append(SequenceParameters(seq))
        except Exception:
            seq_params.append(None)
            print(f"Bad sequence at index {idx}: {seq}")

    for col, (method, kwargs) in _METRICS.items():
        print(f"Computing {method}")
        values = []
        for sp in tqdm(seq_params):
            if sp is None:
                values.append(pd.NA)
                continue
            if method.startswith("_aa_fraction_"):
                aa = method.split("_")[-1]
                values.append(_get_amino_acid_fraction(sp, aa))
            else:
                func = getattr(sp, method, None)
                if func is None:
                    values.append(pd.NA)
                    continue
                values.append(func(**kwargs) if kwargs else func())
        out[col] = values

    return out

representations_45S_47S_df_with_cider = add_localcider_features(representations_45S_47S_df, seq_col="Sequence")
print(f"Added {len(representations_45S_47S_df.columns.difference(representations_45S_47S_df_with_cider.columns))} columns to the dataframe")

121it [00:00, 285.61it/s]

Replacing U → C in sequence: MAPHGRKRKAGAAPMETVDKREKLAEGATVVIEHCTSURVYGRHAAALSQALQLEAPELPVQVNPSKPRRGSFEVTLLRSDNSRVELWTGIKKGPPRKLKFPEPQEVVEELKKYLS


202it [00:00, 286.20it/s]


Replacing U → C in sequence: MPVDDCWLYFPASRGRTFVQTVWVAPTCPNCCWFPGFLPPVPRPPHVPRVLLRGPRGAVLPASRPSKTLPSSSQTPCPTDPCICPPPSTPDSRQEKNTQSELPNKKGQLQKLPTMNGSKDPPGSYDFDLIIIGGGSGGLAAAKEAAKFDKKVLVLDFVTPTPLGTRWGLGGTCVNVGCIPKKLMHQAALLGQALKDSRNYGWKVEDTVKHDWEKMTESVQSHIGSLNWGYRVALREKKVVYENAYGRFIGPHRIVATNNKGKEKIYSAERFLIATGERPRYLGIPGDKEYCISSDDLFSLPYCPGKTLVVGASYVALECAGFLAGIGLDVTVMVRSILLRGFDQDMANKIGEHMEEHGIKFIRQFVPTKIEQIEAGTPGRLRVTAQSTNSEETIEGEFNTVLLAVGRDSCTRTIGLETVGVKINEKTGKIPVTDEEQTNVPYIYAIGDILEGKLELTPVAIQAGRLLAQRLYGGSNVKCDYDNVPTTVFTPLEYGCCGLSEEKAVEKFGEENIEVYHSFFWPLEWTVPSRDNNKCYAKIICNLKDDERVVGFHVLGPNAGEVTQGFAAALKCGLTKQQLDSTIGIHPVCAEIFTTLSVTKRSGGDILQSGCUG
Computing get_length


100%|██████████| 202/202 [00:00<00:00, 686031.91it/s]


Computing get_FCR


100%|██████████| 202/202 [00:00<00:00, 68106.87it/s]


Computing get_NCPR


100%|██████████| 202/202 [00:00<00:00, 70212.10it/s]


Computing get_isoelectric_point


100%|██████████| 202/202 [00:00<00:00, 617.93it/s]


Computing get_molecular_weight


100%|██████████| 202/202 [00:00<00:00, 27455.50it/s]


Computing get_countNeg


100%|██████████| 202/202 [00:00<00:00, 155658.54it/s]


Computing get_countPos


100%|██████████| 202/202 [00:00<00:00, 102560.15it/s]


Computing get_countNeut


100%|██████████| 202/202 [00:00<00:00, 103461.89it/s]


Computing get_fraction_negative


100%|██████████| 202/202 [00:00<00:00, 111627.06it/s]


Computing get_fraction_positive


100%|██████████| 202/202 [00:00<00:00, 117853.58it/s]


Computing get_fraction_expanding


100%|██████████| 202/202 [00:00<00:00, 63421.62it/s]


Computing get_fraction_disorder_promoting


100%|██████████| 202/202 [00:00<00:00, 11156.24it/s]


Computing get_kappa


100%|██████████| 202/202 [02:02<00:00,  1.65it/s]


Computing get_Omega


100%|██████████| 202/202 [19:25<00:00,  5.77s/it] 


Computing get_mean_net_charge


100%|██████████| 202/202 [00:00<00:00, 61730.38it/s]


Computing get_mean_hydropathy


100%|██████████| 202/202 [00:00<00:00, 1350.18it/s]


Computing get_uversky_hydropathy


100%|██████████| 202/202 [00:00<00:00, 8626.30it/s]


Computing get_PPII_propensity


100%|██████████| 202/202 [00:00<00:00, 1252.26it/s]


Computing get_delta


100%|██████████| 202/202 [00:01<00:00, 110.15it/s]


Computing get_deltaMax


100%|██████████| 202/202 [00:00<00:00, 536573.41it/s]


Computing _aa_fraction_A


100%|██████████| 202/202 [00:00<00:00, 14738.36it/s]


Computing _aa_fraction_C


100%|██████████| 202/202 [00:00<00:00, 13610.87it/s]


Computing _aa_fraction_D


100%|██████████| 202/202 [00:00<00:00, 8121.17it/s]


Computing _aa_fraction_E


100%|██████████| 202/202 [00:00<00:00, 15072.39it/s]


Computing _aa_fraction_F


100%|██████████| 202/202 [00:00<00:00, 15332.61it/s]


Computing _aa_fraction_G


100%|██████████| 202/202 [00:00<00:00, 12906.73it/s]


Computing _aa_fraction_H


100%|██████████| 202/202 [00:00<00:00, 11911.28it/s]


Computing _aa_fraction_I


100%|██████████| 202/202 [00:00<00:00, 11605.04it/s]


Computing _aa_fraction_K


100%|██████████| 202/202 [00:00<00:00, 15961.74it/s]


Computing _aa_fraction_L


100%|██████████| 202/202 [00:00<00:00, 12938.66it/s]


Computing _aa_fraction_M


100%|██████████| 202/202 [00:00<00:00, 11939.14it/s]


Computing _aa_fraction_N


100%|██████████| 202/202 [00:00<00:00, 14815.68it/s]


Computing _aa_fraction_P


100%|██████████| 202/202 [00:00<00:00, 15975.29it/s]


Computing _aa_fraction_Q


100%|██████████| 202/202 [00:00<00:00, 12658.17it/s]


Computing _aa_fraction_R


100%|██████████| 202/202 [00:00<00:00, 12969.16it/s]


Computing _aa_fraction_S


100%|██████████| 202/202 [00:00<00:00, 12853.86it/s]


Computing _aa_fraction_T


100%|██████████| 202/202 [00:00<00:00, 13225.05it/s]


Computing _aa_fraction_V


100%|██████████| 202/202 [00:00<00:00, 12701.06it/s]


Computing _aa_fraction_W


100%|██████████| 202/202 [00:00<00:00, 16136.55it/s]


Computing _aa_fraction_Y


100%|██████████| 202/202 [00:00<00:00, 11782.90it/s]


Index(['FCR', 'NCPR', 'Omega', 'PPII_propensity', 'count_negative',
       'count_neutral', 'count_positive', 'delta', 'delta_max', 'frac_A',
       'frac_C', 'frac_D', 'frac_E', 'frac_F', 'frac_G', 'frac_H', 'frac_I',
       'frac_K', 'frac_L', 'frac_M', 'frac_N', 'frac_P', 'frac_Q', 'frac_R',
       'frac_S', 'frac_T', 'frac_V', 'frac_W', 'frac_Y',
       'fraction_disorder_promoting', 'fraction_expanding',
       'fraction_negative', 'fraction_positive', 'isoelectric_point', 'kappa',
       'length', 'mean_hydropathy', 'mean_net_charge', 'molecular_weight',
       'uversky_hydropathy'],
      dtype='object')

In [9]:
representations_45S_47S_df_with_cider.to_csv("data/curated_data_for_modeling/45S_47S_localcider_features.tsv", sep="\t", index=False)