In [31]:
import re 
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

In [32]:
df = pd.read_csv('swiss_prot_vec.tsv', delimiter='\t').rename(columns={
    'EC number':'EC',
    'Gene Ontology (molecular function)':'MF',
    'Gene Ontology (biological process)':'BP',
    'Gene Ontology (cellular component)':'CC',
    'InterPro':'IP',
    'Gene3D':'3D',
    'Sequence':'seqs'
})
df.head()

Unnamed: 0,Entry,EC,Cofactor,MF,BP,CC,IP,3D,seqs
0,A0A009IHW8,3.2.2.-; 3.2.2.6,,NAD(P)+ nucleosidase activity [GO:0050135]; NA...,NAD catabolic process [GO:0019677]; signal tra...,,IPR000157;IPR035897;,3.40.50.10140;,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...
1,A0A023I7E1,3.2.1.39,,"glucan endo-1,3-beta-D-glucosidase activity [G...",cell wall organization [GO:0071555]; polysacch...,extracellular region [GO:0005576],IPR005200;IPR040720;IPR040451;,1.10.287.1170;1.20.5.420;,MRFQVIVAAATITMITSYIPGVASQSTSDGDDLFVPVSNFDPKSIF...
2,A0A024B7W1,2.1.1.56; 2.1.1.57; 2.7.7.48; 3.4.21.91; 3.6.1...,,"4 iron, 4 sulfur cluster binding [GO:0051539];...",clathrin-dependent endocytosis of virus by hos...,centrosome [GO:0005813]; extracellular region ...,IPR043502;IPR000069;IPR038302;IPR013755;IPR001...,1.10.10.930;1.10.260.90;1.20.1280.260;2.40.10....,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...
3,A0A024SC78,3.1.1.74,,cutinase activity [GO:0050525],,extracellular region [GO:0005576],IPR029058;IPR000675;IPR043580;IPR043579;IPR011...,3.40.50.1820;,MRSLAILTTLLAGHAFAYPKPAPQSVNRRDWPSINEFLSELAKVMP...
4,A0A024SH76,3.2.1.91,,"cellulose 1,4-beta-cellobiosidase activity [GO...",cellulose catabolic process [GO:0030245],extracellular region [GO:0005576],IPR016288;IPR036434;IPR035971;IPR000254;IPR001...,3.20.20.40;,MIVGILTTLATLATLAASVPLEERQACSSVWGQCGGQNWSGPTCCA...


In [67]:
def create_dictionary(input):
    output_dict = {}
    for index, entry in enumerate(input, start=1):
        output_dict[index] = entry
    return output_dict


def ec_processing(input_list):
    output_list = []
    for item in tqdm(input_list):
        descriptors = str(item).split(';')
        filtered_descriptors = [d.strip() for d in descriptors if '-' not in d and 'n' not in d]
        if len(filtered_descriptors) > 1:
            output_list.extend(filtered_descriptors)
        elif len(filtered_descriptors) == 1:
            output_list.append(filtered_descriptors[0])
        else:
            continue
    return output_list


def go_processing(input_list):
    output_list = []
    for item in tqdm(input_list):
        descriptors = str(item).split(';')
        filtered_descriptors = [d[d.find('[GO:')+1:d.find(']')].strip() for d in descriptors]
        if len(filtered_descriptors) > 1:
            output_list.extend(filtered_descriptors)
        elif len(filtered_descriptors) == 1:
            output_list.append(filtered_descriptors[0])
        else:
            continue
    return output_list


def cofactor_processing(input_list):
    output_list = []
    for item in tqdm(input_list):
        descriptors = str(item).split(';')
        filtered_descriptors = [d[d.find('Name=')+5:].strip() for d in descriptors if 'Name' in d]
        if len(filtered_descriptors) > 1:
            output_list.extend(filtered_descriptors)
        elif len(filtered_descriptors) == 1:
            output_list.append(filtered_descriptors[0])
        else:
            continue
    return output_list


def domain_processing(input_list):
    output_list = []
    for item in tqdm(input_list):
        descriptors = str(item).split(';')
        filtered_descriptors = [d.strip() for d in descriptors]
        if len(filtered_descriptors) > 1:
            output_list.extend(filtered_descriptors)
        elif len(filtered_descriptors) == 1:
            output_list.append(filtered_descriptors[0])
        else:
            continue
    return output_list


In [33]:
# EC
ecs = df['EC'].tolist()
ecs = ec_processing(ecs)
ecs = sorted(list(set(ecs)))
ec_dict = create_dictionary(ecs)
len(ecs), ecs[0]

  0%|          | 0/570830 [00:00<?, ?it/s]

(5487, '1.1.1.1')

In [52]:
# MF
mfs = df['MF'].tolist()
mfs = go_processing(mfs)
mfs = sorted(list(set(mfs)))
mfs.pop(0)
mf_dict = create_dictionary(mfs)
len(mfs), mfs[0]

  0%|          | 0/570830 [00:00<?, ?it/s]

(8156, 'GO:0000006')

In [56]:
# BP
bps = df['BP'].tolist()
bps = go_processing(bps)
bps = sorted(list(set(bps)))
bps.pop(0)
bp_dict = create_dictionary(bps)
len(bps), bps[0]

  0%|          | 0/570830 [00:00<?, ?it/s]

(17892, 'GO:0000001')

In [55]:
# CC
ccs = df['CC'].tolist()
ccs = go_processing(ccs)
ccs = sorted(list(set(ccs)))
ccs.pop(0)
cc_dict = create_dictionary(ccs)
len(ccs), ccs[0]

  0%|          | 0/570830 [00:00<?, ?it/s]

(2852, 'GO:0000015')

In [60]:
# IP
ips = df['IP'].tolist()
ips = domain_processing(ips)
ips = sorted(list(set(ips)))
ips.pop(0)
ip_dict = create_dictionary(ips)
len(ips), ips[0]

  0%|          | 0/570830 [00:00<?, ?it/s]

(33653, 'IPR000001')

In [80]:
# cofactor
cos = df['Cofactor'].tolist()
cos = cofactor_processing(cos)
cos = sorted(list(set(cos)))
cos.pop(0)
co_dict = create_dictionary(cos)
len(cos), cos[0]

  0%|          | 0/570830 [00:00<?, ?it/s]

(113, '(6R)-L-erythro-5,6,7,8-tetrahydrobiopterin')

In [83]:
all_classes = ecs + mfs + bps + ccs + ips + cos
all_dict = create_dictionary(all_classes)
len(all_dict.keys())

68153

In [92]:
seqs = df['seqs'].tolist()
entry_ids = df['Entry'].tolist()
len(seqs), len(list(set(seqs)))

(570830, 482684)

In [94]:
counts = {}
duplicates = {}

for i, seq in enumerate(seqs):
    if seq in counts:
        counts[seq] += 1
        duplicates[seq].append(entry_ids[i])
    else:
        counts[seq] = 1
        duplicates[seq] = [entry_ids[i]]

# Write duplicates to a text file
with open("swiss_prot_duplicates.txt", "w") as file:
    for seq, ids in duplicates.items():
        if len(ids) > 1:
            count = len(ids)
            file.write(f"{count}\t{' '.join(ids)}\n")
            file.write(f"{seq}\n")