In [6]:
import os

import pandas as pd
from pypdf import PdfReader

In [7]:
def process_csv(path_csv):
    """Load a header-less CSV and normalise its identifier column.

    Columns 0, 1 and 2 of the file are named ``identifier``, ``sequence``
    and ``class_binder``; the frame is then reordered so that
    ``class_binder`` sits directly after ``identifier``.

    Identifiers longer than five characters that contain ``|`` are replaced
    by their second ``|``-delimited field. Every other identifier is
    returned unchanged.

    Parameters
    ----------
    path_csv : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Frame whose first three columns are ``identifier``,
        ``class_binder`` and ``sequence``.
    """
    def _extract_accession(raw_id):
        # Non-string cells (e.g. NaN for an empty field) cannot be split.
        if not isinstance(raw_id, str):
            return raw_id
        fields = raw_id.split("|")
        # The length check alone is not sufficient: a long identifier with
        # no "|" has a single field and indexing fields[1] would raise.
        if len(raw_id) > 5 and len(fields) > 1:
            return fields[1]
        return raw_id

    df = pd.read_csv(path_csv, header=None)
    df = df.rename(columns={0: "identifier", 1: "sequence", 2: "class_binder"})

    # Move the label column next to the identifier.
    df.insert(1, "class_binder", df.pop("class_binder"))

    df["identifier"] = df["identifier"].apply(_extract_accession)
    return df

In [8]:
def process_text(data_seq, mode="binders"):
    """Parse FASTA-like text into a DataFrame of labelled sequences.

    The text is split on newlines and empty lines are dropped. A line
    starting with ``>`` opens a new record whose identifier is the rest of
    that line. Following lines are appended to the record's sequence unless
    they start with a digit, in which case they are skipped. Any digit
    characters left inside a sequence are removed.

    Each header yields exactly one row, so a header with no sequence lines
    produces an empty-string sequence rather than misaligning the
    identifiers and sequences. Lines that appear before the first header
    are ignored. Empty input yields an empty frame.

    Parameters
    ----------
    data_seq : str
        Raw text containing ``>``-prefixed headers and sequence lines.
    mode : str, default "binders"
        ``"binders"`` labels every row 1; any other value labels every row 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``identifier``, ``class_binder``, ``sequence``.
    """
    identifiers = []
    # One list of raw sequence fragments per header, kept in lockstep with
    # `identifiers` so the two can never drift apart.
    fragments_per_record = []

    for line in data_seq.split("\n"):
        if not line:
            continue
        if line[0] == ">":
            identifiers.append(line[1:])
            fragments_per_record.append([])
        elif fragments_per_record and not line[0].isdigit():
            fragments_per_record[-1].append(line)

    sequences = [
        "".join(char for char in "".join(fragments) if not char.isdigit())
        for fragments in fragments_per_record
    ]

    label = 1 if mode == "binders" else 0
    class_binders = [label] * len(sequences)

    df = pd.DataFrame({"identifier": identifiers, "class_binder": class_binders, "sequence": sequences})
    return df

In [9]:
def parse_pdf_v2(path, split_1, split_2):
    """Extract the binder and non-binder text sections from a PDF.

    The text of every page is concatenated and all space characters are
    removed. Everything up to and including the marker
    ``f"{split_1}DNA-bindingproteins"`` is discarded, and the remainder is
    split at ``f"(2).{split_2}nonDNA-bindingproteins"``.

    Parameters
    ----------
    path : str or path-like
        PDF file passed to ``PdfReader``.
    split_1 : str
        Text that immediately precedes ``DNA-bindingproteins`` in the first
        section marker.
    split_2 : str
        Text that sits between ``(2).`` and ``nonDNA-bindingproteins`` in
        the second section marker.

    Returns
    -------
    tuple of (str, str)
        ``(binders, not_binders)`` raw text sections.

    Raises
    ------
    ValueError
        If either marker does not occur exactly once in the text it is
        applied to.
    """
    def _split_once(haystack, marker):
        # Split on a marker that must occur exactly once, with a readable error.
        pieces = haystack.split(marker)
        if len(pieces) != 2:
            raise ValueError(
                f"expected exactly one occurrence of {marker!r} in the text of {path!r}, "
                f"found {len(pieces) - 1}"
            )
        return pieces

    pdf = PdfReader(path)
    # `or ""` keeps the join safe should a page yield no text object.
    text = "".join(page.extract_text() or "" for page in pdf.pages)
    text = text.replace(" ", "")

    _, text = _split_once(text, f"{split_1}DNA-bindingproteins")
    binders, not_binders = _split_once(text, f"(2).{split_2}nonDNA-bindingproteins")
    return binders, not_binders

In [10]:
def process_col_id(x):
    """Return the second ``|``-delimited field of ``x``.

    Raises ``IndexError`` when ``x`` contains no ``|``.
    """
    # Only field 1 is needed, so there is no reason to split past the
    # second separator.
    fields = x.split("|", 2)
    return fields[1]

In [11]:
def parse_text_v5(path, mode="binders"):
    """Read one identifier per line from a text file and label them.

    Lines are stripped of surrounding whitespace; lines that are empty
    after stripping are skipped.

    Parameters
    ----------
    path : str or path-like
        Text file to read.
    mode : str, default "binders"
        ``"binders"`` labels every row 1; any other value labels every row 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``identifier`` and ``class_binder``.
    """
    identifiers = []
    with open(path) as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                identifiers.append(cleaned)

    label = 1 if mode == "binders" else 0
    return pd.DataFrame(
        {"identifier": identifiers, "class_binder": [label] * len(identifiers)}
    )

In [12]:
# Load the training CSV (unbalanced, judging by the file name); process_csv
# names the columns and shortens pipe-delimited identifiers.
pdb67151 = process_csv("../data/pdb_data_papers/DBP/train-17151+50000-unbalance.csv")

In [13]:
# Pull the two raw text sections out of the PDF. "525" and "550" are the
# strings that parse_pdf_v2 expects inside the two section markers it splits on.
binders, not_binders = parse_pdf_v2("../data/pdb_data_papers/DBP/minf_201400025_sm_s4.pdf", "525", "550")

In [14]:
# Parse each section into a labelled frame (class_binder 1 for binders,
# 0 otherwise) and stack them. pd.concat keeps each part's own row index
# here, so index labels repeat in pdb1075.
pdb1075p1 = process_text(binders, mode="binders")
pdb1075p2 = process_text(not_binders, mode="not_binders")
pdb1075 = pd.concat([pdb1075p1, pdb1075p2])

In [15]:
# Read the negative and positive test CSVs (no header row), stack them,
# name the three columns and reduce each identifier to its second
# "|"-delimited field.
pdb20000_p1, pdb20000_p2 = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        "../data/pdb_data_papers/DBP/test-10000unit-negative.csv",
        "../data/pdb_data_papers/DBP/test-10000unit-positive.csv",
    )
)
pdb20000 = pd.concat([pdb20000_p1, pdb20000_p2])
pdb20000.columns = ["identifier", "sequence", "class_binder"]
pdb20000["identifier"] = pdb20000["identifier"].map(process_col_id)

In [16]:
# Read the positive and negative test CSVs (no header row), stack them,
# name the three columns and reduce each identifier to its second
# "|"-delimited field.
pdb1000_p1, pdb1000_p2 = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        "../data/pdb_data_papers/DBP/test-500unit-positive.csv",
        "../data/pdb_data_papers/DBP/test-500unit-negative.csv",
    )
)
pdb1000 = pd.concat([pdb1000_p1, pdb1000_p2])
pdb1000.columns = ["identifier", "sequence", "class_binder"]
pdb1000["identifier"] = pdb1000["identifier"].map(process_col_id)

In [17]:
# Read the identifier lists for both classes. parse_text_v5 returns only
# identifier and class_binder columns; sequences are merged in further down.
pdb14189_p1 = parse_text_v5("../data/pdb_data_papers/DBP/Table S1_7131_DNA-binding_proteins.txt",
                        mode="binders")
pdb14189_p2 = parse_text_v5("../data/pdb_data_papers/DBP/Table S1_7131_non_DNA-binding_proteins.txt",
                        mode="not_binders")

In [18]:
# Load the table that is used below to attach sequences to identifiers.
new_df = pd.read_csv("../data/pdb_data_papers/DBP/null_seqs_df.csv")

In [19]:
# Keep only the join key and the sequence column.
new_df = new_df.loc[:, ["identifier", "sequence"]]

In [20]:
# Inner merge: identifiers that are absent from new_df are dropped.
# NOTE(review): this cell rebinds its own inputs, so running it a second
# time merges again and yields suffixed sequence_x / sequence_y columns.
pdb14189_p1 = pdb14189_p1.merge(new_df, on="identifier", how="inner")
pdb14189_p2 = pdb14189_p2.merge(new_df, on="identifier", how="inner")

In [21]:
# Stack both classes, then keep only the first row seen for each identifier.
pdb14189 = pd.concat([pdb14189_p1, pdb14189_p2]).drop_duplicates(subset=["identifier"])

In [22]:
def check_duplicates(df):
    """Report whether any value in ``df["identifier"]`` occurs more than once."""
    repeated_mask = df["identifier"].duplicated()
    return repeated_mask.any()

In [23]:
# Sanity check: True here would mean pdb20000 contains a repeated identifier.
check_duplicates(pdb20000)

False

In [24]:
# Sanity check: pdb14189 was de-duplicated on identifier above, so this should be False.
check_duplicates(pdb14189)

False

In [25]:
# Sanity check: True here would mean pdb1000 contains a repeated identifier.
check_duplicates(pdb1000)

False

In [26]:
# Sanity check: True here would mean pdb67151 contains a repeated identifier.
check_duplicates(pdb67151)

False

In [27]:
# Sanity check: True here would mean pdb1075 contains a repeated identifier.
check_duplicates(pdb1075)

False

In [29]:
# Stack all five datasets so identifiers shared between them can be inspected.
df = pd.concat([pdb20000, pdb14189, pdb1000, pdb67151, pdb1075])

In [35]:
# Rows whose identifier already appeared earlier in the stacked frame.
df[df["identifier"].duplicated()]

Unnamed: 0,identifier,sequence,class_binder
1,A6NI15,MDNLRETFLSLEDGLGSSDSPGLLSSWDWKDRAGPFELNQASPSQS...,1
6,Q0V7X4,MEGRVNALSNINDLELHNFLVDPNFDQFINLIRGDHQTIDENPVLD...,1
9,Q60EQ4,MADGPGSPGGGGGSHESGSPRGGGGGGGGGGGGGGVREQDRFLPIA...,1
14,Q8ITI5,MVVTTATKPHPFSIENILKSASPKPQKPLFSYNALIAMAISQSPLK...,1
15,Q8GXM7,MALSPNSSSLDLTISIPSFSPSPSLGDHHGMRDFDINQTPKTEEDR...,1
...,...,...,...
67146,Q9FN69,MATGQNRTTVPENLKKHLAVSVRNIQWSYGIFWSVSASQSGVLEWG...,1
67147,P0ACL5,MKDERRPICEVVAESIERLIIDGVLKVGQPLPSERRLCEKLGFSRS...,1
67148,P48590,MAATNTHYYADSMYNMYHHALPPTYYDNTSSSSSYYQSSQGWQPAS...,1
67149,P02835,MATTNSQSHYSYADNMNMYNMYHPHSLPPTYYDNSGSNAYYQNTSN...,1


In [36]:
# Spot-check a single identifier to see every row that carries it.
df[df["identifier"] == "Q9L9G1"]

Unnamed: 0,identifier,sequence,class_binder
2456,Q9L9G1,MTNSGDEEITPASLKATRKGERVSIGSLLPPSELVRSGESTEHIRV...,1
30,Q9L9G1,MTNSGDEEITPASLKATRKGERVSIGSLLPPSELVRSGESTEHIRV...,1
64125,Q9L9G1,MTNSGDEEITPASLKATRKGERVSIGSLLPPSELVRSGESTEHIRV...,1


In [164]:
def save_csv(df, name, out_dir="../data/pdb_data_papers/DBP/"):
    """Write ``df`` to ``out_dir/name`` as CSV without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    name : str
        File name, including the extension.
    out_dir : str, default "../data/pdb_data_papers/DBP/"
        Directory to write into. The default keeps the destination this
        function always used.
    """
    path_csv = os.path.join(out_dir, name)
    df.to_csv(path_csv, index=False)

In [165]:
# Write pdb20000 to disk as CSV.
save_csv(pdb20000, "pdb20000.csv")

In [166]:
# Write pdb1075 to disk as CSV.
save_csv(pdb1075, "pdb1075.csv")

In [167]:
# Write pdb67151 to disk as CSV.
save_csv(pdb67151, "pdb67151.csv")

In [168]:
# Write pdb1000 to disk as CSV.
save_csv(pdb1000, "pdb1000.csv")

In [169]:
# Write pdb14189 to disk as CSV.
save_csv(pdb14189, "pdb14189.csv")

In [174]:
# Preview the first rows of the merged, de-duplicated frame.
pdb14189.head()

Unnamed: 0,identifier,class_binder,sequence
0,Q6A8L0,1,MSGHSKWATTKHKKAAIDAKRGKLFARLIKNIEVAARLGGGDPSGN...
1,A6NI15,1,MDNLRETFLSLEDGLGSSDSPGLLSSWDWKDRAGPFELNQASPSQS...
2,Q8N1G0,1,MGDMKTPDFDDLLAAFDIPDIDANEAIHSGPEENEGPGGPGKPEPG...
3,Q83NU2,1,MDATVTVVGNLTADPELRYTATGAAVVNMTIASTPRMYDRQSGEWK...
4,P48383,1,MNPSDLPGQIPLSRSDMNVQDQLDPVQRFDTHFMLPQEENFLNRPS...


In [39]:
# Load the BioLiP CSV and discard its "source" column in one expression
# rather than mutating the frame in place.
dbp_biolip = pd.read_csv("../data/biolip/dna_binders_biolip.csv").drop(columns=["source"])

In [40]:
# Keep one row per distinct sequence (the first occurrence).
dbp_biolip = dbp_biolip.drop_duplicates(subset=["sequence"])

In [53]:
# Count the rows in pdb67151 whose sequence repeats an earlier row.
pdb67151.sequence.duplicated().sum()

8418

In [54]:
# NOTE(review): this rebinds `df`, which an earlier cell used for the
# five-dataset concat; consider a distinct name to avoid stale state.
df = pd.concat([dbp_biolip, pdb67151])

In [55]:
# Count the rows in the combined frame whose sequence repeats an earlier row.
df.sequence.duplicated().sum()

8441