In [1]:
import os

import pandas as pd
from pypdf import PdfReader

In [2]:
def process_csv(path_csv):
    """Load a headerless three-column CSV and normalise its identifier column.

    The columns are renamed by position to ``identifier``, ``sequence`` and
    ``class_binder``. An identifier that is longer than five characters and
    contains a ``|`` is reduced to its second ``|``-separated field; every
    other identifier is kept unchanged.

    Parameters
    ----------
    path_csv
        Passed straight to ``pandas.read_csv`` (read with ``header=None``).

    Returns
    -------
    pandas.DataFrame
        The loaded frame with renamed columns and normalised identifiers.
    """
    df = pd.read_csv(path_csv, header=None)
    df = df.rename(columns={0: "identifier", 1: "sequence", 2: "class_binder"})

    def process_id(x):
        # The length threshold is the original heuristic for "composite" ids.
        # The extra checks keep long ids without a "|" (and non-string cells)
        # from raising instead of being passed through untouched.
        if isinstance(x, str) and len(x) > 5 and "|" in x:
            return x.split("|")[1]
        return x

    df["identifier"] = df["identifier"].apply(process_id)
    return df

In [3]:
def process_text(data_seq, mode="binders"):
    """Parse FASTA-like text into a frame of identifiers, labels and sequences.

    A line starting with ``>`` opens a new record; the rest of that line is
    the identifier. The following lines are concatenated into the record's
    sequence, except lines whose first character is a digit, which are
    skipped. Any digit left inside the sequence is removed. Blank lines and
    lines that appear before the first header are ignored.

    Parameters
    ----------
    data_seq : str
        Newline-separated text to parse.
    mode : str
        ``"binders"`` labels every record 1; any other value labels it 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``identifier``, ``class_binder`` and ``sequence``. The frame
        is empty when ``data_seq`` contains no header line.

    Raises
    ------
    AssertionError
        If a header is not followed by any sequence characters.
    """

    def strip_digits(chunks):
        # Join the collected lines of one record and drop every digit.
        return "".join(ch for ch in "".join(chunks) if not ch.isdigit())

    identifiers = []
    sequences = []
    # Lines collected for the record currently being read; None until the
    # first header is seen, so leading text can never be paired with an id.
    current = None

    for line in data_seq.split("\n"):
        if not line:
            continue
        if line[0] == ">":
            # Always close the previous record, even when it is empty, so
            # identifiers and sequences stay aligned one-to-one.
            if current is not None:
                sequences.append(strip_digits(current))
            identifiers.append(line[1:])
            current = []
        elif current is not None and not line[0].isdigit():
            current.append(line)

    if current is not None:
        sequences.append(strip_digits(current))

    assert len(identifiers) == len(sequences)
    without_sequence = [ident for ident, seq in zip(identifiers, sequences) if not seq]
    assert not without_sequence, (
        f"headers without sequence data: {without_sequence[:5]}"
    )

    label = 1 if mode == "binders" else 0
    df = pd.DataFrame({"identifier": identifiers,
                       "class_binder": [label] * len(sequences),
                       "sequence": sequences})
    return df

In [4]:
def parse_pdf_v2(path, split_1, split_2):
    """Extract the binder and non-binder text sections from a PDF.

    The text of every page is concatenated and all spaces are removed. The
    result is then cut at two markers built from the arguments:
    ``f"{split_1}DNA-bindingproteins"`` (everything before it is discarded)
    and ``f"(2).{split_2}nonDNA-bindingproteins"`` (which separates the two
    returned sections).

    Parameters
    ----------
    path
        Passed to ``PdfReader``.
    split_1, split_2
        Values interpolated into the first and second marker respectively.

    Returns
    -------
    tuple of (str, str)
        ``(binders, not_binders)``: the text between the two markers and the
        text after the second marker.

    Raises
    ------
    ValueError
        If either marker does not occur exactly once in the extracted text.
    """

    def split_once(haystack, marker):
        # Same strictness as unpacking a two-way split, but the error names
        # the marker instead of reporting a tuple-unpacking failure.
        parts = haystack.split(marker)
        if len(parts) != 2:
            raise ValueError(
                f"expected marker {marker!r} exactly once in {path!r}, "
                f"found {len(parts) - 1} occurrence(s)"
            )
        return parts

    pdf = PdfReader(path)
    text = "".join(page.extract_text() for page in pdf.pages)

    # Markers are matched against space-free text.
    text = text.replace(" ", "")
    _, text = split_once(text, f"{split_1}DNA-bindingproteins")

    binders, not_binders = split_once(text, f"(2).{split_2}nonDNA-bindingproteins")
    return binders, not_binders

In [5]:
def process_col_id(x):
    """Return the second ``|``-separated field of an identifier.

    An identifier that contains no ``|`` is returned unchanged instead of
    raising ``IndexError``.
    """
    fields = x.split("|")
    if len(fields) < 2:
        return x
    return fields[1]

In [6]:
def parse_text_v5(path, mode="binders"):
    """Read one identifier per non-blank line of a text file.

    Each line is stripped of surrounding whitespace and lines that are empty
    after stripping are dropped. Every identifier is labelled 1 when ``mode``
    is ``"binders"`` and 0 otherwise.

    Returns a DataFrame with columns ``identifier`` and ``class_binder``.
    """
    with open(path) as handle:
        stripped = [raw.strip() for raw in handle.readlines()]

    identifiers = [entry for entry in stripped if entry]
    label = 1 if mode == "binders" else 0

    return pd.DataFrame({"identifier": identifiers,
                         "class_binder": [label] * len(identifiers)})

In [7]:
# Load the unbalanced training CSV through process_csv, which names the
# three columns and normalises the identifiers.
pdb67151 = process_csv("../data/DBP_papers_data/train-17151+50000-unbalance.csv")

In [8]:
# Pull the two text sections out of the PDF. "525" and "550" are the
# values interpolated into the two split markers inside parse_pdf_v2.
binders, not_binders = parse_pdf_v2("../data/DBP_papers_data/minf_201400025_sm_s4.pdf", 
                                    "525", "550")

In [9]:
# Parse each PDF section into a frame (binders labelled 1, the other section
# labelled 0) and stack them. pd.concat is called without ignore_index, so
# each part keeps its own index and index labels repeat in the result.
pdb1075p1 = process_text(binders, mode="binders")
pdb1075p2 = process_text(not_binders, mode="not_binders")
pdb1075 = pd.concat([pdb1075p1, pdb1075p2])

In [10]:
# Read the negative and positive 10000-unit test CSVs with header=None, so
# the columns are positional until they are named below.
pdb20000_p1 = pd.read_csv("../data/DBP_papers_data/test-10000unit-negative.csv", header=None)
pdb20000_p2 = pd.read_csv("../data/DBP_papers_data/test-10000unit-positive.csv", header=None)
pdb20000 = pd.concat([pdb20000_p1, pdb20000_p2])
pdb20000.columns = ["identifier", "sequence", "class_binder"]
# Keep only the second "|"-separated field of every identifier.
pdb20000["identifier"] = pdb20000["identifier"].apply(process_col_id)

In [11]:
# Same steps as for pdb20000, applied to the 500-unit positive and negative
# test CSVs (read with header=None, then named by position).
pdb1000_p1 = pd.read_csv("../data/DBP_papers_data/test-500unit-positive.csv", header=None)
pdb1000_p2 = pd.read_csv("../data/DBP_papers_data/test-500unit-negative.csv", header=None)
pdb1000 = pd.concat([pdb1000_p1, pdb1000_p2])
pdb1000.columns = ["identifier", "sequence", "class_binder"]
# Keep only the second "|"-separated field of every identifier.
pdb1000["identifier"] = pdb1000["identifier"].apply(process_col_id)

In [12]:
# Read the two identifier lists (one identifier per non-blank line) and label
# them 1 and 0 respectively. These frames carry no sequence column yet.
pdb14189_p1 = parse_text_v5("../data/DBP_papers_data/Table S1_7131_DNA-binding_proteins.txt",
                        mode="binders")
pdb14189_p2 = parse_text_v5("../data/DBP_papers_data/Table S1_7131_non_DNA-binding_proteins.txt",
                        mode="not_binders")

In [13]:
# Combined number of identifiers in the two lists.
len(pdb14189_p1) + len(pdb14189_p2)

14262

In [15]:
# Load null_seqs_df.csv (first row used as the header, the read_csv default).
new_df = pd.read_csv("../data/DBP_papers_data/null_seqs_df.csv")

In [16]:
# Keep only the two columns used by the merge on "identifier".
new_df = new_df.loc[:, ["identifier", "sequence"]]

In [17]:
# Attach a sequence to each identifier. With how="inner", identifiers absent
# from new_df are dropped, and an identifier that occurs several times in
# new_df yields several rows.
pdb14189_p1 = pdb14189_p1.merge(new_df, on="identifier", how="inner")
pdb14189_p2 = pdb14189_p2.merge(new_df, on="identifier", how="inner")

In [18]:
# Stack the labelled binder and non-binder frames and show the row count.
pdb14189 = pd.concat([pdb14189_p1, pdb14189_p2])
len(pdb14189)

14262

In [19]:
# Keep the first row for each identifier; duplicated() flags every later
# occurrence, and the mask is inverted here.
pdb14189 = pdb14189.loc[~pdb14189["identifier"].duplicated()]

In [20]:
def search_float(item):
    """Return True when ``item`` is a float, including float subclasses.

    ``isinstance`` is used rather than an exact type comparison so that
    subclasses of ``float`` (``numpy.float64`` is one) are recognised too.
    """
    return isinstance(item, float)

In [21]:
# Drop every row whose "sequence" value is a float according to search_float.
# NOTE(review): presumably these are NaN placeholders for missing sequences — confirm.
pdb14189 = pdb14189.loc[~pdb14189["sequence"].apply(search_float)]

In [22]:
def check_duplicates(df):
    """Tell whether any value in the ``identifier`` column of ``df`` repeats."""
    id_column = df["identifier"]
    repeated_mask = id_column.duplicated()
    return repeated_mask.any()

In [23]:
# Preview the first rows of pdb20000.
pdb20000.head()

Unnamed: 0,identifier,sequence,class_binder
0,P98196,MDCSLVRTLVHRYCAGEENWVDSRTIYVGHREPPPGAEAYIPQRYP...,0
1,Q9N0Z4,LGFDPPHQSDTRTIYIANRFPQNGLYTPQKFIDNRIISSKYTVWNF...,0
2,Q9QZW0,MFRRTLNRLCAGEEKRVGTRTVFVGNHPISGTEPYIAQRFCDNRIV...,0
3,Q8VDN2,MGKGVGRDKYEPAAVSEHGDKKGKKAKKERDMDELKKEVSMDDHKL...,0
4,P04074,MGKGVGRDKYEPAAVSEHGDKKKAKKERDMDELKKEVSMDDHKLSL...,0


In [25]:
# Load pdb186.csv with the read_csv defaults, so its first row supplies the
# column names. Later cells read an "identifier" column from this frame.
pdb186 = pd.read_csv("../data/DBP_papers_data/pdb186.csv")

In [26]:
def process_id(x, source="pdb20000"):
    """Return ``x`` with ``"_" + source`` appended, e.g. ``"P1_pdb20000"``."""
    return "_".join((x, source))

In [27]:
# True would mean pdb20000 contains a repeated identifier.
check_duplicates(pdb20000)

False

In [28]:
# True would mean pdb186 contains a repeated identifier.
check_duplicates(pdb186)

False

In [29]:
# True would mean pdb14189 contains a repeated identifier.
check_duplicates(pdb14189)

False

In [30]:
# True would mean pdb1000 contains a repeated identifier.
check_duplicates(pdb1000)

False

In [31]:
# True would mean pdb67151 contains a repeated identifier.
check_duplicates(pdb67151)

False

In [32]:
# True would mean pdb1075 contains a repeated identifier.
check_duplicates(pdb1075)

False

In [36]:
# Append "_pdb20000" to every identifier. The column is overwritten in place,
# so re-running this cell appends the suffix a second time.
pdb20000["identifier"] = pdb20000["identifier"].apply(lambda x: process_id(x, source="pdb20000"))

In [37]:
# Append "_pdb1075" to every identifier. The column is overwritten in place,
# so re-running this cell appends the suffix a second time.
pdb1075["identifier"] = pdb1075["identifier"].apply(lambda x: process_id(x, source="pdb1075"))

In [38]:
# Append "_pdb14189" to every identifier. The column is overwritten in place,
# so re-running this cell appends the suffix a second time.
pdb14189["identifier"] = pdb14189["identifier"].apply(lambda x: process_id(x, source="pdb14189"))

In [39]:
# Append "_pdb1000" to every identifier. The column is overwritten in place,
# so re-running this cell appends the suffix a second time.
pdb1000["identifier"] = pdb1000["identifier"].apply(lambda x: process_id(x, source="pdb1000"))

In [40]:
# Append "_pdb67151" to every identifier. The column is overwritten in place,
# so re-running this cell appends the suffix a second time.
pdb67151["identifier"] = pdb67151["identifier"].apply(lambda x: process_id(x, source="pdb67151"))

In [41]:
# Append "_pdb186" to every identifier. The column is overwritten in place,
# so re-running this cell appends the suffix a second time.
pdb186["identifier"] = pdb186["identifier"].apply(lambda x: process_id(x, source="pdb186"))

In [42]:
# Stack the six source frames into one. Columns are aligned by name, and
# without ignore_index each source keeps its own index labels.
df = pd.concat([pdb20000, pdb14189, pdb1000, pdb67151, pdb1075, pdb186])

In [43]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,identifier,sequence,class_binder
0,P98196_pdb20000,MDCSLVRTLVHRYCAGEENWVDSRTIYVGHREPPPGAEAYIPQRYP...,0
1,Q9N0Z4_pdb20000,LGFDPPHQSDTRTIYIANRFPQNGLYTPQKFIDNRIISSKYTVWNF...,0
2,Q9QZW0_pdb20000,MFRRTLNRLCAGEEKRVGTRTVFVGNHPISGTEPYIAQRFCDNRIV...,0
3,Q8VDN2_pdb20000,MGKGVGRDKYEPAAVSEHGDKKGKKAKKERDMDELKKEVSMDDHKL...,0
4,P04074_pdb20000,MGKGVGRDKYEPAAVSEHGDKKKAKKERDMDELKKEVSMDDHKLSL...,0


In [46]:
def valid_sequence(sequence: str) -> bool:
    """Check that ``sequence`` uses only the 20 accepted one-letter codes.

    Returns True when every character of ``sequence`` is one of
    ``SNYLRQDPMFCEWGTKIVAH`` (uppercase only). An empty string has no
    offending character and therefore also returns True. A value that is not
    a string, such as a NaN cell produced by ``read_csv``, returns False
    instead of raising ``TypeError``.
    """
    valid_amino_acids = frozenset("SNYLRQDPMFCEWGTKIVAH")
    if not isinstance(sequence, str):
        return False
    return valid_amino_acids.issuperset(sequence)

In [52]:
# Keep only the rows whose sequence passes valid_sequence.
df = df.loc[df["sequence"].apply(valid_sequence)]

In [55]:
def save_csv(df, name, directory="../data/DBP_papers_data/"):
    """Write ``df`` to ``<directory>/<name>`` as CSV without the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    name : str
        File name of the CSV inside ``directory``.
    directory : str
        Target directory. It defaults to the location that used to be
        hard-coded, so existing two-argument calls write to the same path.
    """
    # os.path.join accepts the directory with or without a trailing separator.
    path_csv = os.path.join(directory, name)
    df.to_csv(path_csv, index=False)

In [56]:
# Write the filtered, combined frame to benchmark_data.csv via save_csv.
save_csv(df, "benchmark_data.csv")