# Initial preprocessing for curated and uncurated protein structures

### Load libraries and files

In [1]:
import os
import pandas as pd
import math 
from collections import defaultdict 
import matplotlib.pyplot as plt

DATA_DIR="../data"

In [2]:
# Access &read the file - Currated
file_path = os.path.join(DATA_DIR, "SPARCLE_IDS_curated_simplified.csv")
df_cur = pd.read_csv(file_path)

#test succ. loading
df_cur.head()

Unnamed: 0,ArchId,CurVer,CurLabel,CurName,CurName_simplified,superfamilyarch,SpecificArch,TitleStrings,Taxid,IsSpec,Status,LabelState,archLen_max,archLen_med,ArchId_string,SuperFamID_string
0,11436966,2,"(2,3-dihydroxybenzoyl)adenylate synthase catal...","(2,3-dihydroxybenzoyl)adenylate synthase","(2,3-dihydroxybenzoyl)adenylate synthase",AMP-binding,COG1021,Non-ribosomal peptide synthetase component E (...,1,1,published,curated,,,6446,0
1,11494111,2,"(2,3-dihydroxybenzoyl)adenylate synthase catal...","(2,3-dihydroxybenzoyl)adenylate synthase","(2,3-dihydroxybenzoyl)adenylate synthase",AMP-binding,TIGR02275,"2,3-dihydroxybenzoate-AMP ligase",1,1,published,curated,,,53958,0
2,11494111,2,"(2,3-dihydroxybenzoyl)adenylate synthase catal...","(2,3-dihydroxybenzoyl)adenylate synthase","(2,3-dihydroxybenzoyl)adenylate synthase",DHB_AMP_lig,TIGR02275,"2,3-dihydroxybenzoate-AMP ligase",1,1,published,curated,,,53958,37086
3,11436966,2,"(2,3-dihydroxybenzoyl)adenylate synthase catal...","(2,3-dihydroxybenzoyl)adenylate synthase","(2,3-dihydroxybenzoyl)adenylate synthase",EntE,COG1021,Non-ribosomal peptide synthetase component E (...,1,1,published,curated,,,6446,34064
4,11485118,2,"(2,3-dihydroxybenzoyl)adenylate synthase catal...","(2,3-dihydroxybenzoyl)adenylate synthase","(2,3-dihydroxybenzoyl)adenylate synthase",entE,PRK10946,"(2,3-dihydroxybenzoyl)adenylate synthase",1,1,published,curated,,,32482,35977


## Filtering by Columns

In [3]:
df_cur = df_cur[['ArchId', 'CurName_simplified', 'superfamilyarch', 'SpecificArch', 'TitleStrings']]

## Further preprocessing

### Lowercase

In [4]:
df_cur.loc[:, ['superfamilyarch', 'SpecificArch', 'TitleStrings']] = df_cur.loc[:, ['CurName_simplified', 'superfamilyarch', 'SpecificArch', 'TitleStrings']].apply(lambda x: x.str.lower())

## Filter specific words in TitleStrings

Integration of data obtained from file "CM_XXX"

In [5]:
words_to_remove = ['domain', 'domains', ' of ', ' and ', ' the ', ' a ', 'found', 'function', 'type', 'protein', 'proteins', 'N/A', 'n/a']

In [6]:
# Creating a regex pattern to match any word in the list of words to remove
pattern = '|'.join(r"\b{}\b".format(word) for word in words_to_remove)

# Replacing the words in the 'Text' column with an empty string
df_cur['TitleStrings'] = df_cur['TitleStrings'].str.replace(pattern, ' ', regex=True)
df_cur['TitleStrings'] = df_cur['TitleStrings'].str.replace('  ', ' ', regex=True)
df_cur['TitleStrings'] = df_cur['TitleStrings'].str.replace('  ', ' ', regex=True)
df_cur['TitleStrings'] = df_cur['TitleStrings'].str.replace(' , ', ', ', regex=True)

In [8]:
df_cur.to_csv('SPARCLE_IDS_curated_simplified_modTitleStrings.csv', index=False)

In [47]:
import collections

In [70]:
strings = list(set(df_cur.loc[~df_cur['TitleStrings'].isna()]['TitleStrings']))
all_strings = []
for string in strings:
    # string = strings[0]
    string = string.replace("| ","")
    string = string.replace(", ","")
    words = string.split()
    all_strings += words


counter = collections.Counter(all_strings)
counter.most_common(20)

[('family', 3714),
 ('in', 3713),
 ('unknown', 2614),
 ('similar', 2383),
 ('the', 2005),
 ('and', 1629),
 ('transport', 1578),
 ('kinase', 1382),
 ('1', 1272),
 ('catalytic', 1109),
 ('subunit', 1036),
 ('metabolism]', 1022),
 ('synthase', 1017),
 ('binding', 975),
 ('2', 968),
 ('uncharacterized', 946),
 ('motif', 945),
 ('superfamily', 941),
 ('rna', 901),
 ('zinc', 896)]