# Initial preprocessing for curated and uncurated protein structures

### Load libraries and files

In [1]:
import os
import pandas as pd
import math 
from collections import defaultdict 
import matplotlib.pyplot as plt

DATA_DIR="../data"

In [2]:
# Access &read the file - Currated
file_path = os.path.join(DATA_DIR, "SPARCLE_IDS_curated_simplified.csv")
df_cur = pd.read_csv(file_path)

#test succ. loading
#df_cur.head()

In [3]:
# Access &read the file - Currated
file_path = os.path.join(DATA_DIR, "SPARCLE_IDS_UNcurated_TITLES.csv.gz")
df_UNcur = pd.read_csv(file_path, compression='gzip')

#test succ. loading
#df_uncur.head()

## Keep essential columns 

In [4]:
df_cur = df_cur[['ArchId', 'CurName_simplified', 'superfamilyarch', 'SpecificArch', 'TitleStrings']]
df_UNcur = df_UNcur[['ArchId', 'CurName', 'superfamilyarch', 'SpecificArch', 'TitleStrings']]

## Further preprocessing

### Lowercase

In [5]:
df_cur.loc[:, ['superfamilyarch', 'SpecificArch', 'TitleStrings']] = df_cur.loc[:, ['CurName_simplified', 'superfamilyarch', 'SpecificArch', 'TitleStrings']].apply(lambda x: x.str.lower())
df_UNcur.loc[:, ['superfamilyarch', 'SpecificArch', 'TitleStrings']] = df_UNcur.loc[:, ['CurName', 'superfamilyarch', 'SpecificArch', 'TitleStrings']].apply(lambda x: x.str.lower())

### Filter specific words in TitleStrings

Integration of data obtained from file "CM_XXX"

In [6]:
words_to_remove = ['domain', 'domains', ' of ', ' and ', ' the ', ' a ', 'found', 'type', 'protein', 'proteins', 'N/A', 'n/a']

In [7]:
# Creating a regex pattern to match any word in the list of words to remove
pattern = '|'.join(r"\b{}\b".format(word) for word in words_to_remove)

# Replacing the words in the 'Text' column with an empty string
df_cur.loc['TitleStrings'] = df_cur['TitleStrings'].str.replace(pattern, ' ', regex=True)
df_cur.loc['TitleStrings'] = df_cur['TitleStrings'].str.replace('  ', ' ', regex=True).str.replace('  ', ' ', regex=True)
df_cur.loc['TitleStrings'] = df_cur['TitleStrings'].str.replace(' , ', ', ', regex=True)

# Replacing the words in the 'Text' column with an empty string
df_UNcur.loc['TitleStrings'] = df_UNcur['TitleStrings'].str.replace(pattern, ' ', regex=True)
df_UNcur.loc['TitleStrings'] = df_UNcur['TitleStrings'].str.replace('  ', ' ', regex=True).str.replace('  ', ' ', regex=True)
df_UNcur.loc['TitleStrings'] = df_UNcur['TitleStrings'].str.replace(' , ', ', ', regex=True)

In [8]:
df_cur.to_csv('../data/SPARCLE_IDS_curated_simplified_modTitleStrings.csv', index=False)
df_UNcur.to_csv('../data/SPARCLE_IDS_UNcurated_TITLES_modTitleStrings.csv', index=False)

#### Test for overrepresented words in TitleStrings

In [9]:
import collections

In [40]:
strings = list(set(df_cur.loc[~df_cur['TitleStrings'].isna()]['TitleStrings']))
all_strings = []
for string in strings:
    # string = strings[0]
    string = string.replace("| ","")
    string = string.replace(", ","")
    words = string.split()
    all_strings += words


counter = collections.Counter(all_strings)
counter.most_common(20)

[('in', 18529),
 ('family', 16551),
 ('repeat', 13178),
 ('function', 12560),
 ('similar', 12249),
 ('the', 10390),
 ('transduction', 9456),
 ('mechanisms]', 8230),
 ('unknown', 8075),
 ('binding', 7314),
 ('1', 6707),
 ('c-terminal', 6687),
 ('[signal', 6609),
 ('and', 6558),
 ('homology', 6418),
 ('2', 5860),
 ('subunit', 5740),
 ('motif', 5740),
 ('transport', 5555),
 ('kinase', 5522)]

### Extract Unique SpecificArch to Dictionary

In [14]:
# Filter rows where SpecArchs has exactly one string & Find unique SpecArchs values
df_filtered = df_cur[df_cur['SpecificArch'].map(lambda x: len(str(x).split()) == 1)]
unique_specarchs = df_filtered['SpecificArch'].drop_duplicates(keep=False)

In [11]:
# Create dictionary with SpecArchs as keys and [superfam, CurLabel] as values
result_dict = {}
for specarch in unique_specarchs:
    row = df_filtered[df_filtered['SpecificArch'] == specarch]
    result_dict[specarch] = [row['superfamilyarch'].iloc[0], row['CurName_simplified'].iloc[0]]

In [12]:
len(result_dict.keys())

17417

In [13]:
import json

# Assuming result_dict is your dictionary
with open('../data/uniqueSpecificArch_dict.json', 'w') as json_file:
    json.dump(result_dict, json_file)

In [16]:
with open('../data/uniqueSpecificArch_dict.json', 'r') as json_file:
    result_dict = json.load(json_file)

### Removal of Unique SpecificArch

In [19]:
df_cur.shape

(42767, 5)

In [20]:
keys_to_remove = result_dict.keys()
# Remove rows from df_cur where SpecArchs is a key in result_dict
df_cur2 = df_cur[~df_cur['SpecificArch'].isin(keys_to_remove)]

In [21]:
df_cur2.shape

(25350, 5)

## Prediction of Uncurated Structures

In [27]:
df_filtered = df_UNcur[df_UNcur['SpecificArch'].map(lambda x: len(str(x).split()) == 1)]
unique_specarchs_UNcur = df_filtered['SpecificArch'].drop_duplicates(keep=False)

In [26]:
df_filtered

Unnamed: 0,ArchId,CurName,superfamilyarch,SpecificArch,TitleStrings
0,48.0,Gag_p24 domain-containing protein,gag_p24,,
1,39.0,Gag_p17 domain-containing protein,gag_p17,,
2,28.0,Ribosomal_S18 domain-containing protein,ribosomal_s18,,
3,52.0,VPR domain-containing protein,vpr,,
4,35.0,TOPRIM domain-containing protein,toprim,,
...,...,...,...,...,...
178201,18342769.0,caspase family protein,ead2,pfam19956,effector-associated domain 2
178206,18343690.0,MarR family transcriptional regulator,duf6432,pfam20024,family of unknown function (duf6432)
178222,18345980.0,,duf6532,pfam20149,domain of unknown function (duf6532)
178226,18346698.0,HI1514 family protein; tape measure protein,tmp_3,pfam20155,tape measure protein


In [28]:
keys_present = [key for key in unique_specarchs_UNcur if key in result_dict]

In [29]:
keys_present

['cog1649',
 'prk00847',
 'prk09509',
 'prk10518',
 'prk10306',
 'tigr01307',
 'tigr02170',
 'cd07268',
 'pfam01710',
 'pfam00487',
 'pfam05636',
 'pfam00872',
 'cd01121']