In [35]:
from collections import defaultdict
import pandas as pd
import numpy as np
import os, re, time, requests, sys
from retrying import retry
from multiprocessing.dummy import Pool
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
sys.path.append('./')
from Unit import Unit

In [62]:
class MMCIF_unit(Unit):
    """Locate/download mmCIF files and flatten selected mmCIF items into DataFrames.

    ``check_mmcif_file`` fetches files that are not present locally,
    ``get_data_from_mmcif`` builds a table with one row per mmCIF file, and
    ``handle_mmcif_df`` expands that table and merges the pieces on their
    shared columns. ``script_fun`` chains the last two over chunks of paths.
    """

    CONFIG = {
        'MMCIF_OLD_FOLDER': ['C:/Users/Nature/Desktop/LiGroup/Filter_new_20190123/doc_in/'
            ],
        'MMCIF_FOLDER': 'C:/Users/Nature/Desktop/LiGroup/mmcif_files/',
        'COMMON_COL': ['_pdbx_audit_revision_history.revision_date', '_exptl.method', '_em_3d_reconstruction.resolution', '_refine.ls_d_res_high'],
        'ENTITY_COL': ['_entity.pdbx_mutation', '_entity.id'],
        'TYPE_COL':['_entity_poly.entity_id', '_entity_poly.pdbx_strand_id', '_entity_poly.type'],
        'SEQRES_COL':['_pdbx_poly_seq_scheme.pdb_strand_id',
                 '_pdbx_poly_seq_scheme.mon_id','_pdbx_poly_seq_scheme.pdb_mon_id', '_pdbx_poly_seq_scheme.auth_mon_id',
                 '_pdbx_poly_seq_scheme.ndb_seq_num', '_pdbx_poly_seq_scheme.pdb_seq_num',
                 '_pdbx_poly_seq_scheme.auth_seq_num', '_pdbx_poly_seq_scheme.pdb_ins_code'],
        'LIGAND_COL': [
                 '_struct_conn.ptnr2_auth_asym_id','_struct_conn.ptnr2_auth_comp_id',
                 '_struct_conn.ptnr2_auth_seq_id',
                 '_struct_conn.conn_type_id',
                 '_struct_conn.ptnr1_auth_asym_id', '_struct_conn.ptnr1_auth_comp_id',
                 '_struct_conn.ptnr1_auth_seq_id'],
        'LIGAND_LIST': [
                        'ZN', 'MG', 'CA', 'FE', 'NA', 'MN', 'K', 'NI', 'CU', 'CO', 'CD', 'HG', 'PT', 'MO', 'BE', 'AL', 'BA',
                        'RU', 'SR', 'V', 'CS', 'W', 'AU', 'YB', 'LI', 'GD', 'PB', 'Y', 'TL', 'IR', 'RB', 'SM', 'AG',
                        'OS', 'PR', 'PD', 'EU', 'RH', 'RE', 'TB', 'TA', 'LU', 'HO', 'CR', 'GA', 'LA', 'SN', 'SB', 'CE',
                        'ZR', 'ER', 'TH', 'TI', 'IN', 'HF', 'SC', 'DY', 'BI', 'PA', 'PU', 'AM', 'CM', 'CF', 'GE', 'NB', 'TC',
                        'ND', 'PM', 'TM', 'PO', 'FR', 'RA', 'AC', 'NP', 'BK', 'ES', 'FM', 'MD', 'NO', 'LR', 'RF', 'DB', 'SG'],
        'HEADERS': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134'},

        }

    # Paths collected by check_mmcif_file(); class-level, so shared by all instances.
    pdb_path_li = []

    @staticmethod
    def download_cif_file(pdbId, path):
        """Download the mmCIF file of ``pdbId`` from RCSB and save it to ``path``.

        Raises ``requests.HTTPError`` when the server answers with an error
        status, so an error page is never stored as a ``.cif`` file.
        """
        url = 'https://files.rcsb.org/view/%s.cif' % pdbId
        # The original called `request.urlopen`, but only `requests` is imported.
        r = requests.get(url, headers=MMCIF_unit.CONFIG['HEADERS'])
        r.raise_for_status()
        with open(path, 'wb') as fw:
            fw.write(r.content)
        # Pause between downloads (kept from the original implementation).
        time.sleep(2)

    def get_mmcif_file_path(self, pdbId, download=False):
        """Return the path of ``<pdbId>.cif``.

        The folders in ``CONFIG['MMCIF_OLD_FOLDER']`` are searched first, then
        ``CONFIG['MMCIF_FOLDER']``. When the file is found nowhere, the path
        inside ``MMCIF_FOLDER`` is returned; the file is downloaded to it only
        if ``download`` is true, so the returned path may not exist.
        """
        print('get_mmcif_file_path(): Working on [%s]' % pdbId)
        new_path = '%s%s.cif' % (self.CONFIG['MMCIF_FOLDER'], pdbId)

        for path in self.CONFIG['MMCIF_OLD_FOLDER']:
            old_path = '%s%s.cif' % (path, pdbId)
            if os.path.exists(old_path):
                return old_path

        if not os.path.exists(new_path) and download:
            MMCIF_unit.download_cif_file(pdbId, new_path)
        return new_path

    def check_mmcif_file(self, pdb_list):
        """Make sure every id in ``pdb_list`` has a local mmCIF file.

        Paths of files that already exist, and of files downloaded here, are
        appended to the class-level ``pdb_path_li``. Missing files are fetched
        concurrently with a thread pool; each download is attempted up to three
        times, and a download that still fails raises out of this method.
        """
        def find_unDownloaded_file(pdbId):
            for path in self.CONFIG['MMCIF_OLD_FOLDER'] + [self.CONFIG['MMCIF_FOLDER']]:
                old_path = '%s%s.cif' % (path, pdbId)
                if os.path.exists(old_path):
                    MMCIF_unit.pdb_path_li.append(old_path)
                    return False
            return True

        unDownload = list(filter(find_unDownloaded_file, pdb_list))
        if not unDownload:
            # Nothing to fetch: do not spin up worker threads.
            return

        @retry(stop_max_attempt_number=3, wait_fixed=1000)
        def download_mmcif_file(pdbId):
            path = '%s%s.cif' % (MMCIF_unit.CONFIG['MMCIF_FOLDER'], pdbId)
            print('download_mmcif_file(): %s' % path)
            url = 'https://files.rcsb.org/view/%s.cif' % pdbId
            r = requests.get(url, headers=MMCIF_unit.CONFIG['HEADERS'])
            # Without this an HTTP error never raises, so @retry would never
            # re-attempt and the error body would be written as a .cif file.
            r.raise_for_status()
            with open(path, 'wb+') as fw:
                fw.write(r.content)
            time.sleep(2)
            MMCIF_unit.pdb_path_li.append(path)

        pool = Pool(processes=20)
        try:
            pool.map(download_mmcif_file, unDownload)
        finally:
            # Release the worker threads even when a download raised.
            pool.close()
            pool.join()

    @staticmethod
    def get_mmcif_info(info_key, info_key_nli, info_dict, path):
        """Parse the mmCIF file at ``path`` and append its values to ``info_dict``.

        For every key in ``info_key`` exactly one element is appended to
        ``info_dict[key]``. Keys listed in ``info_key_nli`` are stored as
        returned by the parser (``np.nan`` when absent); every other key is
        stored as a list (a lone string is wrapped, an absent key gives ``[]``).
        """
        mmcif_dict = MMCIF2Dict(path)
        for key in info_key:
            if key in info_key_nli:
                info_dict[key].append(mmcif_dict.get(key, np.nan))
            else:
                data = mmcif_dict.get(key, [])
                info_dict[key].append([data] if isinstance(data, str) else data)

    @staticmethod
    def _ligand_group_cols():
        """Names of the two derived columns that describe the ligand grouping."""
        group_col = MMCIF_unit.CONFIG['LIGAND_COL'][0]
        return ['%s_index' % group_col, '%s_li' % group_col]

    @staticmethod
    def _split_runs(labels):
        """Return ``(starts, run_labels)`` for the runs of equal consecutive items."""
        starts, run_labels = [], []
        for pos, label in enumerate(labels):
            if not run_labels or label != run_labels[-1]:
                starts.append(pos)
                run_labels.append(label)
        return starts, run_labels

    @staticmethod
    def _slice_by_starts(starts, values):
        """Cut ``values`` (list or str) into pieces beginning at each index in ``starts``."""
        # The final piece has no end index, i.e. it runs to the end of `values`.
        return [values[begin:end] for begin, end in zip(starts, starts[1:] + [None])]

    def _save_mmcif_df(self, outputPath, df):
        """Write ``df`` through ``self.file_o``; do nothing when ``outputPath`` is falsy.

        An existing file is appended to without a header, otherwise a new file
        is written. ``file_o`` is inherited from ``Unit`` (not defined here).
        """
        if not outputPath:
            # bool is an int, so os.path.exists(False) would test file
            # descriptor 0 instead of meaning "no output requested".
            return
        if os.path.exists(outputPath):
            self.file_o(outputPath, df, mode='a+', header=False)
        else:
            self.file_o(outputPath, df)

    def get_data_from_mmcif(self, path_list, outputPath=False):
        """Build a DataFrame with one row per ``*.cif`` path in ``path_list``.

        Paths not ending in ``cif`` are skipped. The columns are:

        * ``pdb_id``: characters ``[-8:-4]`` of the path.
        * ``ENTITY_COL`` / ``TYPE_COL``: lists as read from the file.
        * ``SEQRES_COL``: split into one element per run of consecutive equal
          strand ids; residue columns become one-letter strings and the
          numbering / insertion-code columns become ``';'``-joined strings.
        * ``LIGAND_COL``: rows sorted, then split per run of equal values in
          the first ligand column; two derived columns hold the run start
          indices and the run labels.
        * ``initial_version_time``, ``newest_version_time``, ``mutation_num``,
          ``resolution`` and ``method``, derived from ``COMMON_COL`` and
          ``_entity.pdbx_mutation``.

        The names of the two derived ligand columns are also stored on
        ``self.new_ligand_col_li``. The frame is written through ``file_o``
        only when ``outputPath`` is truthy. Returns the DataFrame.
        """
        config = MMCIF_unit.CONFIG
        all_cols = (config['COMMON_COL'] + config['ENTITY_COL'] + config['TYPE_COL']
                    + config['SEQRES_COL'] + config['LIGAND_COL'])

        info_dict = defaultdict(list)
        for path in path_list:
            if path[-3:] == 'cif':
                print(path)
                info_dict['pdb_id'].append(path[-8:-4])
                MMCIF_unit.get_mmcif_info(all_cols, config['COMMON_COL'][1:], info_dict, path)

        # SEQRES_COL: convert residues to one-letter codes, then split per strand run.
        seqres_cols = config['SEQRES_COL']
        strand_col, residue_cols, number_cols = seqres_cols[0], seqres_cols[1:4], seqres_cols[4:]
        mtoTool = Unit.MultiToOne()
        for i in range(len(info_dict[strand_col])):
            for col in residue_cols:
                info_dict[col][i] = ''.join(
                    mtoTool.multi_letter_convert_to_one_letter(j) for j in info_dict[col][i])

            # An entry without any _pdbx_poly_seq_scheme rows yields no runs and
            # therefore empty lists, instead of an IndexError on the first item.
            starts, strand_ids = MMCIF_unit._split_runs(info_dict[strand_col][i])
            info_dict[strand_col][i] = strand_ids
            for col in residue_cols:
                info_dict[col][i] = MMCIF_unit._slice_by_starts(starts, info_dict[col][i])
            for col in number_cols:
                info_dict[col][i] = [
                    ';'.join(piece)
                    for piece in MMCIF_unit._slice_by_starts(starts, info_dict[col][i])]

        # LIGAND_COL: sort the connection rows, then split per run of the first column.
        ligand_col_list = config['LIGAND_COL']
        new_ligand_col_li = MMCIF_unit._ligand_group_cols()
        self.new_ligand_col_li = new_ligand_col_li
        for i in range(len(info_dict[ligand_col_list[0]])):
            rows = sorted(zip(*(info_dict[col][i] for col in ligand_col_list)))
            for col_index, col in enumerate(ligand_col_list):
                info_dict[col][i] = [row[col_index] for row in rows]

            starts, group_ids = MMCIF_unit._split_runs(info_dict[ligand_col_list[0]][i])
            info_dict[new_ligand_col_li[0]].append(starts)
            info_dict[new_ligand_col_li[1]].append(group_ids)
            for col in ligand_col_list:
                info_dict[col][i] = MMCIF_unit._slice_by_starts(starts, info_dict[col][i])

        df = pd.DataFrame(info_dict)
        date_col, method_col, em_res_col, xray_res_col = config['COMMON_COL']

        # First and last entry of the revision history.
        df['initial_version_time'] = df[date_col].apply(lambda dates: dates[0])
        df['newest_version_time'] = df[date_col].apply(lambda dates: dates[-1])

        # Number of comma-separated mutations per entity; '?' counts as none.
        def muta_count(mutation):
            return mutation.count(',') + 1 if mutation != '?' else 0
        df['mutation_num'] = df['_entity.pdbx_mutation'].apply(
            lambda mutations: [muta_count(m) for m in mutations])

        # X-ray entries take _refine.ls_d_res_high; every other method takes
        # _em_3d_reconstruction.resolution.
        df['resolution'] = df.apply(
            lambda row: row[xray_res_col] if row[method_col] == 'X-RAY DIFFRACTION' else row[em_res_col],
            axis=1)

        df.rename(columns={method_col: 'method'}, inplace=True)
        df.drop(columns=[date_col, em_res_col, xray_res_col], inplace=True)

        self._save_mmcif_df(outputPath, df)
        return df

    def handle_mmcif_df(self, dfrm, outputPath=False):
        """Expand the frame from ``get_data_from_mmcif`` and merge the pieces.

        Four sub-tables (entity, polymer type, SEQRES, ligand) are built by
        expanding the list-valued cells of each row of ``dfrm`` into rows. The
        polymer-type table is further split to one row per chain id. The tables
        are then left-merged on the columns they share. The merged frame is
        written through ``file_o`` only when ``outputPath`` is truthy, and is
        returned.
        """
        def get_sub_df(df, i, spe_col_li, common_col_li):
            try:
                a = pd.DataFrame({key: df.loc[i, key] for key in spe_col_li})
            except ValueError as e:
                # pandas refuses a dict of bare scalars; wrap them as one row.
                # (The original printed an undefined name `pdb` here.)
                print(df.loc[i, 'pdb_id'], e)
                a = pd.DataFrame({key: [df.loc[i, key]] for key in spe_col_li})

            for common_col in common_col_li:
                a[common_col] = df.loc[i, common_col]
            return a

        def sub_handle_df(df, spe_col_li, common_col_li):
            df_li = [get_sub_df(df, i, spe_col_li, common_col_li) for i in df.index]
            return pd.concat(df_li, ignore_index=True)

        config = MMCIF_unit.CONFIG
        # Fall back to the derived names when get_data_from_mmcif() was not run
        # on this instance (e.g. `dfrm` was produced by another instance).
        new_ligand_col_li = getattr(self, 'new_ligand_col_li', None) or MMCIF_unit._ligand_group_cols()

        entity_poly_df = sub_handle_df(dfrm, config['ENTITY_COL'] + ['mutation_num'], ['pdb_id'])
        type_poly_df = sub_handle_df(dfrm, config['TYPE_COL'], ['pdb_id'])
        basic_df = sub_handle_df(
            dfrm, config['SEQRES_COL'],
            ['pdb_id', 'method', 'initial_version_time', 'newest_version_time', 'resolution'])
        ligand_df = sub_handle_df(dfrm, config['LIGAND_COL'] + new_ligand_col_li, ['pdb_id'])

        # '_entity_poly.pdbx_strand_id' holds comma-separated chain ids:
        # split it so that every chain gets its own row.
        strand_col = config['TYPE_COL'][1]
        chain_ids = (type_poly_df[strand_col].str.split(',', expand=True)
                     .stack()
                     .reset_index(level=1, drop=True)
                     .rename('chain_id'))
        new_type_poly_df = type_poly_df.drop(strand_col, axis=1).join(chain_ids)

        entity_poly_df.rename(columns={'_entity.pdbx_mutation': 'mutation_content', '_entity.id': 'entity_id'}, inplace=True)
        new_type_poly_df.rename(columns={'_entity_poly.entity_id': 'entity_id', '_entity_poly.type': 'protein_type'}, inplace=True)
        basic_df.rename(columns={'_pdbx_poly_seq_scheme.pdb_strand_id': 'chain_id'}, inplace=True)
        ligand_df.rename(columns={new_ligand_col_li[1]: 'chain_id'}, inplace=True)

        # With no `on=` argument, pd.merge joins on all shared column names.
        df_1 = pd.merge(basic_df, ligand_df, how='left')
        df_2 = pd.merge(new_type_poly_df, df_1, how='left')
        df_3 = pd.merge(df_2, entity_poly_df, how='left')

        self._save_mmcif_df(outputPath, df_3)
        return df_3

    def script_fun(self, pdb_list, outputPath_li, chunksize=100):
        """Process ``pdb_list`` (mmCIF file paths) in chunks of ``chunksize``.

        For each chunk the per-file table is written to ``outputPath_li[0]``
        and the merged table to ``outputPath_li[1]``.
        """
        for i in range(0, len(pdb_list), chunksize):
            chunk_li = pdb_list[i:i+chunksize]
            print(chunk_li)
            chunk_df = self.get_data_from_mmcif(chunk_li, outputPath=outputPath_li[0])
            self.handle_mmcif_df(chunk_df, outputPath=outputPath_li[1])

In [13]:
# Folder holding the local mmCIF files to parse.
route = 'C:\\Users\\Nature\\Desktop\\LiGroup\\Filter_new_20190123\\doc_in\\'
file_list = os.listdir(route)
# `route` already ends with a separator, so concatenation gives full paths.
file_p_list = [route + file_name for file_name in file_list]

# Parse every file into the per-entry table, then expand and merge it.
mmcif_demo = MMCIF_unit()
df = mmcif_demo.get_data_from_mmcif(file_p_list)
df_new = mmcif_demo.handle_mmcif_df(df)

C:\Users\Nature\Desktop\LiGroup\Filter_new_20190123\doc_in\1a5r.cif
C:\Users\Nature\Desktop\LiGroup\Filter_new_20190123\doc_in\1abn.cif
C:\Users\Nature\Desktop\LiGroup\Filter_new_20190123\doc_in\1aii.cif
C:\Users\Nature\Desktop\LiGroup\Filter_new_20190123\doc_in\1aye.cif
C:\Users\Nature\Desktop\LiGroup\Filter_new_20190123\doc_in\1dfv.cif
C:\Users\Nature\Desktop\LiGroup\Filter_new_20190123\doc_in\2xyn.cif
C:\Users\Nature\Desktop\LiGroup\Filter_new_20190123\doc_in\3azm.cif
C:\Users\Nature\Desktop\LiGroup\Filter_new_20190123\doc_in\3g8t.cif
C:\Users\Nature\Desktop\LiGroup\Filter_new_20190123\doc_in\3g96.cif
C:\Users\Nature\Desktop\LiGroup\Filter_new_20190123\doc_in\6d7s.cif
C:\Users\Nature\Desktop\LiGroup\Filter_new_20190123\doc_in\6iwg.cif
C:\Users\Nature\Desktop\LiGroup\Filter_new_20190123\doc_in\6oon.cif


In [15]:
# Unpack `df_new` into the four intermediate tables.
# NOTE(review): handle_mmcif_df() as defined in this notebook returns a single
# merged DataFrame, and iterating a DataFrame yields its column labels, so this
# unpacking presumably matches an earlier version that returned the four
# frames as a tuple -- confirm before relying on this cell.
entity_poly_df,new_type_poly_df,basic_df,ligand_df = df_new

In [29]:
# Merge the four tables step by step. The first two merges were commented out,
# which left `df_2` undefined whenever this cell ran on a fresh kernel.
# With no `on=` argument, pd.merge joins on all column names the frames share.
df_1 = pd.merge(basic_df, ligand_df, how='left')
df_2 = pd.merge(new_type_poly_df, df_1, how='left')
df_3 = pd.merge(df_2, entity_poly_df, how='left')
df_3

Unnamed: 0,entity_id,protein_type,pdb_id,chain_id,_pdbx_poly_seq_scheme.mon_id,_pdbx_poly_seq_scheme.pdb_mon_id,_pdbx_poly_seq_scheme.auth_mon_id,_pdbx_poly_seq_scheme.ndb_seq_num,_pdbx_poly_seq_scheme.pdb_seq_num,_pdbx_poly_seq_scheme.auth_seq_num,...,_struct_conn.ptnr2_auth_asym_id,_struct_conn.ptnr2_auth_comp_id,_struct_conn.ptnr2_auth_seq_id,_struct_conn.conn_type_id,_struct_conn.ptnr1_auth_asym_id,_struct_conn.ptnr1_auth_comp_id,_struct_conn.ptnr1_auth_seq_id,_struct_conn.ptnr2_auth_asym_id_index,mutation_content,mutation_num
0,1,polypeptide(L),1a5r,A,GSMSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHL...,GSMSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHL...,GSMSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHL...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,-1;0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17...,-1;0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17...,...,,,,,,,,,?,0
1,1,polypeptide(L),1abn,A,ASRLLLNNGAKMPILGLGTWKSPPGQVTEAVKVAIDVGYRHIDCAH...,?SRLLLNNGAKMPILGLGTWKSPPGQVTEAVKVAIDVGYRHIDCAH...,?SRLLLNNGAKMPILGLGTWKSPPGQVTEAVKVAIDVGYRHIDCAH...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,?;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,...,,,,,,,,,?,0
2,1,polypeptide(L),1aii,A,MASIWVGHRGTVRDYPDFSPSVDAEAIQKAIRGIGTDEKMLISILT...,?ASIWVGHRGTVRDYPDFSPSVDAEAIQKAIRGIGTDEKMLISILT...,?ASIWVGHRGTVRDYPDFSPSVDAEAIQKAIRGIGTDEKMLISILT...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;...,?;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;...,...,"[A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, ...","[ARG, ASP, ASP, ETA, GLU, GLU, GLU, GLU, GLU, ...","[190, 76, 76, 360, 148, 195, 195, 232, 232, 23...","[metalc, metalc, metalc, metalc, metalc, metal...","[A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, ...","[CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, C...","[353, 351, 351, 355, 357, 355, 355, 353, 353, ...",0.0,?,0
3,1,polypeptide(L),1aye,A,LETFVGDQVLEIVPSNEEQIKNLLQLEAQEHLQLDFWKSPTTPGET...,LETFVGDQVLEIVPSNEEQIKNLLQLEAQEHLQLDFWKSPTTPGET...,LETFVGDQVLEIVPSNEEQIKNLLQLEAQEHLQLDFWKSPTTPGET...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;2...,4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;2...,...,"[A, A, A, A, A, A, A]","[CYS, CYS, GLU, GLU, HIS, HIS, HOH]","[161, 244, 72, 72, 196, 69, 401]","[disulf, disulf, metalc, metalc, metalc, metal...","[A, A, A, A, A, A, A]","[CYS, CYS, ZN, ZN, ZN, ZN, ZN]","[138, 210, 400, 400, 400, 400, 400]",0.0,?,0
4,1,polypeptide(L),1dfv,A,QDSTSDLIPAPPLSKVPLQQNFQDNQFQGKWYVVGLAGNAILREDK...,????SDLIPAPPLSKVPLQQNFQDNQFQGKWYVVGLAGNAILREDK...,????SDLIPAPPLSKVPLQQNFQDNQFQGKWYVVGLAGNAILREDK...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,?;?;?;?;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,...,"[A, A, A]","[CYS, NAG, NAG]","[175, 179, 180]","[disulf, covale, covale]","[A, A, A]","[CYS, ASN, NAG]","[76, 65, 179]",0.0,?,0
5,1,polypeptide(L),1dfv,B,QDSTSDLIPAPPLSKVPLQQNFQDNQFQGKWYVVGLAGNAILREDK...,???TSDLIPAPPLSKVPLQQNFQDNQFQGKWYVVGLAGNAILREDK...,???TSDLIPAPPLSKVPLQQNFQDNQFQGKWYVVGLAGNAILREDK...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,?;?;?;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,...,"[B, B]","[CYS, NDG]","[175, 178]","[disulf, covale]","[B, B]","[CYS, ASN]","[76, 65]",3.0,?,0
6,1,polypeptide(L),2xyn,A,MGHHHHHHSSGVDLGTENLYFQSMDKWEMERTDITMKHKLGGGQYG...,?????????????????????????KWEMERTDITMKHKLGGGQYG...,?????????????????????????KWEMERTDITMKHKLGGGQYG...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,255;256;257;258;259;260;261;262;263;264;265;26...,?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;...,...,"[A, A, A, A, A]","[ARG, HOH, HOH, ILE, THR]","[519, 2004, 2006, 449, 452]","[metalc, metalc, metalc, metalc, metalc]","[A, A, A, A, A]","[NA, NA, NA, NA, NA]","[549, 1, 549, 1, 1]",0.0,?,0
7,1,polypeptide(L),2xyn,B,MGHHHHHHSSGVDLGTENLYFQSMDKWEMERTDITMKHKLGGGQYG...,?????????????????????????KWEMERTDITMKHKLGGGQYG...,?????????????????????????KWEMERTDITMKHKLGGGQYG...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,255;256;257;258;259;260;261;262;263;264;265;26...,?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;...,...,,,,,,,,,?,0
8,1,polypeptide(L),2xyn,C,MGHHHHHHSSGVDLGTENLYFQSMDKWEMERTDITMKHKLGGGQYG...,??????????????????????????WEMERTDITMKHKLGGGQYG...,??????????????????????????WEMERTDITMKHKLGGGQYG...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,255;256;257;258;259;260;261;262;263;264;265;26...,?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;...,...,,,,,,,,,?,0
9,1,polypeptide(L),3azm,A,GSHMARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYR...,?????????????????????????????????????????PHRYR...,?????????????????????????????????????????PHRYR...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,-3;-2;-1;0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15...,?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;...,...,,,,,,,,,?,0


In [30]:
# Count the missing values in each column of the merged table.
df_3.isnull().sum()

entity_id                                 0
protein_type                              0
pdb_id                                    0
chain_id                                  0
_pdbx_poly_seq_scheme.mon_id              0
_pdbx_poly_seq_scheme.pdb_mon_id          0
_pdbx_poly_seq_scheme.auth_mon_id         0
_pdbx_poly_seq_scheme.ndb_seq_num         0
_pdbx_poly_seq_scheme.pdb_seq_num         0
_pdbx_poly_seq_scheme.auth_seq_num        0
_pdbx_poly_seq_scheme.pdb_ins_code        0
method                                    0
initial_version_time                      0
newest_version_time                       0
resolution                                1
_struct_conn.ptnr2_auth_asym_id          25
_struct_conn.ptnr2_auth_comp_id          25
_struct_conn.ptnr2_auth_seq_id           25
_struct_conn.conn_type_id                25
_struct_conn.ptnr1_auth_asym_id          25
_struct_conn.ptnr1_auth_comp_id          25
_struct_conn.ptnr1_auth_seq_id           25
_struct_conn.ptnr2_auth_asym_id_

In [36]:
# Inspect the strand-id column of the per-entry table (kept as a one-column frame).
df[['_pdbx_poly_seq_scheme.pdb_strand_id']]

Unnamed: 0,_pdbx_poly_seq_scheme.pdb_strand_id
0,[A]
1,[A]
2,[A]
3,[A]
4,"[A, B]"
5,"[A, B, C]"
6,"[A, B, C, D, E, F, G, H, I, J]"
7,"[A, E, P, B, F, Q, C, G, R, D, H, S]"
8,"[A, E, P, B, F, Q, C, G, R, D, H, S]"
9,"[A, B, C, D]"


In [8]:
# Inspect the `_entity_poly.type` cell of the row labelled 6.
df.loc[6,'_entity_poly.type']

['polypeptide(L)',
 'polypeptide(L)',
 'polypeptide(L)',
 'polypeptide(L)',
 'polydeoxyribonucleotide']

In [9]:
# Inspect the `_entity_poly.pdbx_strand_id` cell of the row labelled 6;
# each element is a comma-separated group of chain ids.
df.loc[6,'_entity_poly.pdbx_strand_id']

['A,E', 'B,F', 'C,G', 'D,H', 'I,J']

In [69]:
# Inspect the `_entity_poly.entity_id` cell of the row labelled 0.
df.loc[0,'_entity_poly.entity_id']

['1']

In [17]:
# Inspect the `_pdbx_poly_seq_scheme.pdb_ins_code` cell of the row labelled 6.
df.loc[6,'_pdbx_poly_seq_scheme.pdb_ins_code']

['A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;B;C;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.']

In [104]:
def getEntityDf(df, i):
    """Expand the entity columns of row ``i`` of ``df`` into their own frame.

    The cells of '_entity.pdbx_mutation', 'mutation_num' and '_entity.id' are
    used as columns of a new DataFrame. When pandas cannot build a frame from
    them directly, the error is printed with the pdb id and each cell is
    wrapped in a list, giving a single row. A 'pdb_id' column is added last.
    """
    columns = ('_entity.pdbx_mutation', 'mutation_num', '_entity.id')
    pdb = df.loc[i, 'pdb_id']

    try:
        cells = {column: df.loc[i, column] for column in columns}
        frame = pd.DataFrame(cells)
    except Exception as e:
        print(pdb, e)
        wrapped = {column: [df.loc[i, column]] for column in columns}
        frame = pd.DataFrame(wrapped)
    return frame.assign(pdb_id=pdb)

# One entity frame per row of `df`, stacked into a single table.
entity_df_li = [getEntityDf(df, row_label) for row_label in df.index]
entity_poly_df = pd.concat(entity_df_li, ignore_index=True)

entity_poly_df

Unnamed: 0,_entity.pdbx_mutation,mutation_num,_entity.id,pdb_id
0,?,0,1,1a5r
1,?,0,1,1abn
2,?,0,2,1abn
3,?,0,1,1aii
4,?,0,2,1aii
5,?,0,3,1aii
6,?,0,4,1aii
7,?,0,5,1aii
8,?,0,1,1aye
9,?,0,2,1aye


In [102]:
def getTypeDf(df, i):
    """Expand the `_entity_poly` columns of row ``i`` of ``df`` into their own frame.

    The cells of '_entity_poly.entity_id', '_entity_poly.pdbx_strand_id' and
    '_entity_poly.type' are used as columns of a new DataFrame. When pandas
    cannot build a frame from them directly, the error is printed with the pdb
    id and each cell is wrapped in a list, giving a single row. A 'pdb_id'
    column is added last.
    """
    columns = ('_entity_poly.entity_id', '_entity_poly.pdbx_strand_id', '_entity_poly.type')
    pdb = df.loc[i, 'pdb_id']

    try:
        cells = {column: df.loc[i, column] for column in columns}
        frame = pd.DataFrame(cells)
    except Exception as e:
        print(pdb, e)
        wrapped = {column: [df.loc[i, column]] for column in columns}
        frame = pd.DataFrame(wrapped)
    return frame.assign(pdb_id=pdb)

# One polymer-type frame per row of `df`, stacked into a single table.
type_df_li = [getTypeDf(df, row_label) for row_label in df.index]
type_poly_df = pd.concat(type_df_li, ignore_index=True)

# '_entity_poly.pdbx_strand_id' holds comma-separated chain ids: split them,
# stack to one value per row, and join back on the original row index so that
# every chain gets its own row.
chain_id_ser = (
    type_poly_df['_entity_poly.pdbx_strand_id']
    .str.split(',', expand=True)
    .stack()
    .reset_index(level=1, drop=True)
    .rename('chain_id')
)
new_type_poly_df = type_poly_df.drop('_entity_poly.pdbx_strand_id', axis=1).join(chain_id_ser)

In [103]:
# Display the polymer-type table after splitting to one row per chain id.
new_type_poly_df

Unnamed: 0,_entity_poly.entity_id,_entity_poly.type,pdb_id,chain_id
0,1,polypeptide(L),1a5r,A
1,1,polypeptide(L),1abn,A
2,1,polypeptide(L),1aii,A
3,1,polypeptide(L),1aye,A
4,1,polypeptide(L),1dfv,A
4,1,polypeptide(L),1dfv,B
5,1,polypeptide(L),2xyn,A
5,1,polypeptide(L),2xyn,B
5,1,polypeptide(L),2xyn,C
6,1,polypeptide(L),3azm,A


In [94]:
def getBasicDfrm(df, i):
    """Expand the `_pdbx_poly_seq_scheme` columns of row ``i`` of ``df``.

    The eight sequence-scheme cells of the row become the columns of a new
    DataFrame; the entry-level values ('pdb_id', 'method', the two version
    dates and 'resolution') are then added as further columns.
    """
    per_chain_cols = (
        '_pdbx_poly_seq_scheme.mon_id', '_pdbx_poly_seq_scheme.ndb_seq_num',
        '_pdbx_poly_seq_scheme.pdb_seq_num', '_pdbx_poly_seq_scheme.auth_seq_num',
        '_pdbx_poly_seq_scheme.pdb_mon_id', '_pdbx_poly_seq_scheme.auth_mon_id',
        '_pdbx_poly_seq_scheme.pdb_strand_id', '_pdbx_poly_seq_scheme.pdb_ins_code',
    )
    per_entry_cols = ('pdb_id', 'method', 'initial_version_time', 'newest_version_time', 'resolution')

    frame = pd.DataFrame({name: df.loc[i, name] for name in per_chain_cols})
    # assign() adds the entry-level columns in the listed order.
    return frame.assign(**{name: df.loc[i, name] for name in per_entry_cols})

# One sequence-scheme frame per row of `df`, stacked into a single table.
df_li = [getBasicDfrm(df, row_label) for row_label in df.index]

basic_dfrm = pd.concat(df_li, ignore_index=True)

In [95]:
# Display the stacked sequence-scheme table.
basic_dfrm

Unnamed: 0,_pdbx_poly_seq_scheme.mon_id,_pdbx_poly_seq_scheme.ndb_seq_num,_pdbx_poly_seq_scheme.pdb_seq_num,_pdbx_poly_seq_scheme.auth_seq_num,_pdbx_poly_seq_scheme.pdb_mon_id,_pdbx_poly_seq_scheme.auth_mon_id,_pdbx_poly_seq_scheme.pdb_strand_id,_pdbx_poly_seq_scheme.pdb_ins_code,pdb_id,method,initial_version_time,newest_version_time,resolution
0,GSMSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHL...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,-1;0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17...,-1;0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17...,GSMSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHL...,GSMSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHL...,A,.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;...,1a5r,SOLUTION NMR,1998-10-14,2011-07-13,
1,ASRLLLNNGAKMPILGLGTWKSPPGQVTEAVKVAIDVGYRHIDCAH...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,?;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,?SRLLLNNGAKMPILGLGTWKSPPGQVTEAVKVAIDVGYRHIDCAH...,?SRLLLNNGAKMPILGLGTWKSPPGQVTEAVKVAIDVGYRHIDCAH...,A,.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;...,1abn,X-RAY DIFFRACTION,1994-01-31,2017-11-29,2.4
2,MASIWVGHRGTVRDYPDFSPSVDAEAIQKAIRGIGTDEKMLISILT...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;...,?;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;...,?ASIWVGHRGTVRDYPDFSPSVDAEAIQKAIRGIGTDEKMLISILT...,?ASIWVGHRGTVRDYPDFSPSVDAEAIQKAIRGIGTDEKMLISILT...,A,.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;...,1aii,X-RAY DIFFRACTION,1997-03-12,2011-07-13,1.95
3,LETFVGDQVLEIVPSNEEQIKNLLQLEAQEHLQLDFWKSPTTPGET...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;2...,4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;2...,LETFVGDQVLEIVPSNEEQIKNLLQLEAQEHLQLDFWKSPTTPGET...,LETFVGDQVLEIVPSNEEQIKNLLQLEAQEHLQLDFWKSPTTPGET...,A,A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;...,1aye,X-RAY DIFFRACTION,1999-01-13,2011-07-13,1.8
4,QDSTSDLIPAPPLSKVPLQQNFQDNQFQGKWYVVGLAGNAILREDK...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,?;?;?;?;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,????SDLIPAPPLSKVPLQQNFQDNQFQGKWYVVGLAGNAILREDK...,????SDLIPAPPLSKVPLQQNFQDNQFQGKWYVVGLAGNAILREDK...,A,.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;...,1dfv,X-RAY DIFFRACTION,2000-03-06,2011-11-16,2.6
5,QDSTSDLIPAPPLSKVPLQQNFQDNQFQGKWYVVGLAGNAILREDK...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,?;?;?;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,???TSDLIPAPPLSKVPLQQNFQDNQFQGKWYVVGLAGNAILREDK...,???TSDLIPAPPLSKVPLQQNFQDNQFQGKWYVVGLAGNAILREDK...,B,.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;...,1dfv,X-RAY DIFFRACTION,2000-03-06,2011-11-16,2.6
6,MGHHHHHHSSGVDLGTENLYFQSMDKWEMERTDITMKHKLGGGQYG...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,255;256;257;258;259;260;261;262;263;264;265;26...,?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;...,?????????????????????????KWEMERTDITMKHKLGGGQYG...,?????????????????????????KWEMERTDITMKHKLGGGQYG...,A,.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;...,2xyn,X-RAY DIFFRACTION,2010-12-01,2019-04-03,2.81
7,MGHHHHHHSSGVDLGTENLYFQSMDKWEMERTDITMKHKLGGGQYG...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,255;256;257;258;259;260;261;262;263;264;265;26...,?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;...,?????????????????????????KWEMERTDITMKHKLGGGQYG...,?????????????????????????KWEMERTDITMKHKLGGGQYG...,B,.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;...,2xyn,X-RAY DIFFRACTION,2010-12-01,2019-04-03,2.81
8,MGHHHHHHSSGVDLGTENLYFQSMDKWEMERTDITMKHKLGGGQYG...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,255;256;257;258;259;260;261;262;263;264;265;26...,?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;...,??????????????????????????WEMERTDITMKHKLGGGQYG...,??????????????????????????WEMERTDITMKHKLGGGQYG...,C,.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;...,2xyn,X-RAY DIFFRACTION,2010-12-01,2019-04-03,2.81
9,GSHMARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYR...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,-3;-2;-1;0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15...,?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;?;...,?????????????????????????????????????????PHRYR...,?????????????????????????????????????????PHRYR...,A,.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;.;...,3azm,X-RAY DIFFRACTION,2011-09-21,2012-08-15,2.89


In [107]:
def getLigandDf(df, i):
    """Expand the `_struct_conn` columns of row ``i`` of ``df`` (ptnr1 grouping).

    The seven `_struct_conn` cells plus the two derived
    '_struct_conn.ptnr1_auth_asym_id_index' / '_li' cells become the columns of
    a new DataFrame, and a 'pdb_id' column is added last.

    NOTE(review): a later cell in this notebook defines `getLigandDf` again
    with the ptnr2-based derived columns; that definition replaces this one
    once it is executed.
    """
    columns = (
        '_struct_conn.conn_type_id', '_struct_conn.ptnr1_auth_asym_id',
        '_struct_conn.ptnr1_auth_comp_id', '_struct_conn.ptnr1_auth_seq_id',
        '_struct_conn.ptnr2_auth_asym_id', '_struct_conn.ptnr2_auth_comp_id',
        '_struct_conn.ptnr2_auth_seq_id', '_struct_conn.ptnr1_auth_asym_id_index',
        '_struct_conn.ptnr1_auth_asym_id_li',
    )
    frame = pd.DataFrame({name: df.loc[i, name] for name in columns})
    return frame.assign(pdb_id=df.loc[i, 'pdb_id'])

# One ligand/connection frame per row of `df`, stacked into a single table.
ligand_df_li = [getLigandDf(df, row_label) for row_label in df.index]

ligand_dfrm = pd.concat(ligand_df_li, ignore_index=True)
ligand_dfrm

Unnamed: 0,_struct_conn.conn_type_id,_struct_conn.ptnr1_auth_asym_id,_struct_conn.ptnr1_auth_comp_id,_struct_conn.ptnr1_auth_seq_id,_struct_conn.ptnr2_auth_asym_id,_struct_conn.ptnr2_auth_comp_id,_struct_conn.ptnr2_auth_seq_id,_struct_conn.ptnr1_auth_asym_id_index,_struct_conn.ptnr1_auth_asym_id_li,pdb_id
0,"[metalc, metalc, metalc, metalc, metalc, metal...","[A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, ...","[CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, C...","[355, 351, 351, 351, 351, 351, 353, 353, 353, ...","[A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, ...","[ETA, GLY, ILE, GLY, ASP, ASP, GLU, GLY, GLY, ...","[360, 34, 32, 36, 76, 76, 232, 192, 187, 190, ...",0.0,A,1aii
1,"[disulf, disulf, metalc, metalc, metalc, metal...","[A, A, A, A, A, A, A]","[CYS, CYS, ZN, ZN, ZN, ZN, ZN]","[138, 210, 400, 400, 400, 400, 400]","[A, A, A, A, A, A, A]","[CYS, CYS, HIS, GLU, GLU, HIS, HOH]","[161, 244, 69, 72, 72, 196, 401]",0.0,A,1aye
2,[disulf],[A],[CYS],[76],[A],[CYS],[175],0.0,A,1dfv
3,[disulf],[B],[CYS],[76],[B],[CYS],[175],1.0,B,1dfv
4,"[covale, covale]","[A, A]","[NAG, ASN]","[179, 65]","[A, A]","[NAG, NAG]","[180, 179]",2.0,A,1dfv
5,[covale],[B],[ASN],[65],[B],[NDG],[178],4.0,B,1dfv
6,"[metalc, metalc, metalc, metalc, metalc]","[A, A, A, A, A]","[NA, NA, NA, NA, NA]","[1, 1, 1, 549, 549]","[A, A, A, A, A]","[ILE, THR, HOH, HOH, ARG]","[449, 452, 2004, 2006, 519]",0.0,A,2xyn
7,"[metalc, metalc, metalc]","[I, I, I]","[DG, DA, DG]","[121, 133, 100]","[I, I, I]","[MN, MN, MN]","[1002, 1003, 1001]",0.0,I,3azm
8,[metalc],[E],[ASP],[77],[E],[MN],[1001],3.0,E,3azm
9,"[metalc, metalc]","[J, J]","[DG, DG]","[217, 280]","[J, J]","[MN, MN]","[1003, 1002]",4.0,J,3azm


In [120]:
def getLigandDf(df, i):
    """Turn the ``_struct_conn`` cells of one row of ``df`` into a DataFrame.

    For the row labelled ``i``, the value stored under each of the nine
    ``_struct_conn.*`` columns listed below becomes one column of the new
    frame. A ``pdb_id`` column carrying that row's ``pdb_id`` is then added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the ``_struct_conn.*`` columns below and ``pdb_id``.
        NOTE(review): the cells are presumably equal-length lists (one
        element per ptnr2 chain group) -- confirm against the code that
        builds ``df``.
    i : label
        Row label handed to ``df.loc``.

    Returns
    -------
    pandas.DataFrame
        Columns are the nine ``_struct_conn.*`` names in the order listed,
        followed by ``pdb_id``.
    """
    conn_columns = (
        '_struct_conn.conn_type_id',
        '_struct_conn.ptnr1_auth_asym_id',
        '_struct_conn.ptnr1_auth_comp_id',
        '_struct_conn.ptnr1_auth_seq_id',
        '_struct_conn.ptnr2_auth_asym_id',
        '_struct_conn.ptnr2_auth_comp_id',
        '_struct_conn.ptnr2_auth_seq_id',
        '_struct_conn.ptnr2_auth_asym_id_index',
        '_struct_conn.ptnr2_auth_asym_id_li',
    )

    # Collect the cell value of every requested column for this row.
    per_column = {}
    for col in conn_columns:
        per_column[col] = df.loc[i, col]

    ligand_frame = pd.DataFrame(per_column)
    # The row's pdb_id is written to every row of the new frame.
    ligand_frame['pdb_id'] = df.loc[i, 'pdb_id']
    return ligand_frame

# One frame per row label of `df`, produced by getLigandDf, then concatenated
# into a single frame whose index is renumbered from 0.
ligand_df_li = [getLigandDf(df, i) for i in df.index]

ligand_dfrm = pd.concat(ligand_df_li, ignore_index=True)
ligand_dfrm

Unnamed: 0,_struct_conn.conn_type_id,_struct_conn.ptnr1_auth_asym_id,_struct_conn.ptnr1_auth_comp_id,_struct_conn.ptnr1_auth_seq_id,_struct_conn.ptnr2_auth_asym_id,_struct_conn.ptnr2_auth_comp_id,_struct_conn.ptnr2_auth_seq_id,_struct_conn.ptnr2_auth_asym_id_index,_struct_conn.ptnr2_auth_asym_id_li,pdb_id
0,"[metalc, metalc, metalc, metalc, metalc, metal...","[A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, ...","[CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, C...","[353, 351, 351, 355, 357, 355, 355, 353, 353, ...","[A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, ...","[ARG, ASP, ASP, ETA, GLU, GLU, GLU, GLU, GLU, ...","[190, 76, 76, 360, 148, 195, 195, 232, 232, 23...",0.0,A,1aii
1,"[disulf, disulf, metalc, metalc, metalc, metal...","[A, A, A, A, A, A, A]","[CYS, CYS, ZN, ZN, ZN, ZN, ZN]","[138, 210, 400, 400, 400, 400, 400]","[A, A, A, A, A, A, A]","[CYS, CYS, GLU, GLU, HIS, HIS, HOH]","[161, 244, 72, 72, 196, 69, 401]",0.0,A,1aye
2,"[disulf, covale, covale]","[A, A, A]","[CYS, ASN, NAG]","[76, 65, 179]","[A, A, A]","[CYS, NAG, NAG]","[175, 179, 180]",0.0,A,1dfv
3,"[disulf, covale]","[B, B]","[CYS, ASN]","[76, 65]","[B, B]","[CYS, NDG]","[175, 178]",3.0,B,1dfv
4,"[metalc, metalc, metalc, metalc, metalc]","[A, A, A, A, A]","[NA, NA, NA, NA, NA]","[549, 1, 549, 1, 1]","[A, A, A, A, A]","[ARG, HOH, HOH, ILE, THR]","[519, 2004, 2006, 449, 452]",0.0,A,2xyn
5,[metalc],[E],[ASP],[77],[E],[MN],[1001],0.0,E,3azm
6,"[metalc, metalc, metalc, metalc]","[I, I, I, I]","[DG, DG, DG, DA]","[100, 100, 121, 133]","[I, I, I, I]","[MN, MN, MN, MN]","[1001, 1001, 1002, 1003]",1.0,I,3azm
7,"[hydrog, hydrog, hydrog, hydrog, hydrog, hydro...","[I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ...","[DT, DT, DT, DT, DT, DT, DT, DT, DC, DT, DT, D...","[143, 143, 142, 142, 140, 140, 136, 136, 129, ...","[J, J, J, J, J, J, J, J, J, J, J, J, J, J, J, ...","[DA, DA, DA, DA, DA, DA, DA, DA, DA, DA, DA, D...","[150, 150, 151, 151, 153, 153, 157, 157, 163, ...",5.0,J,3azm
8,"[covale, covale]","[E, E]","[A, A2M]","[-1, 0]","[E, E]","[A2M, G]","[0, 1]",0.0,E,3g8t
9,"[hydrog, hydrog, covale, covale, metalc]","[E, P, F, F, F]","[C, G, A, A2M, C]","[2, 56, -1, 0, 2]","[F, F, F, F, F]","[A, A, A2M, G, MG]","[-1, -1, 0, 1, 13]",2.0,F,3g8t


In [121]:
# Show the single row labelled 25 of the concatenated ligand frame.
ligand_dfrm.loc[25]

_struct_conn.conn_type_id                [disulf, metalc, metalc, metalc, metalc, metal...
_struct_conn.ptnr1_auth_asym_id                                   [B, B, B, B, B, A, B, B]
_struct_conn.ptnr1_auth_comp_id                        [CYS, NA, NA, NA, NA, NA, HIS, LEU]
_struct_conn.ptnr1_auth_seq_id                       [25, 101, 101, 101, 101, 301, 84, 87]
_struct_conn.ptnr2_auth_asym_id                                   [B, B, B, B, B, B, B, B]
_struct_conn.ptnr2_auth_comp_id                     [CYS, HOH, HOH, HOH, HOH, HOH, NA, NA]
_struct_conn.ptnr2_auth_seq_id                     [80, 209, 255, 278, 287, 294, 101, 101]
_struct_conn.ptnr2_auth_asym_id_index                                                   17
_struct_conn.ptnr2_auth_asym_id_li                                                       B
pdb_id                                                                                6iwg
Name: 25, dtype: object

#### Common Info
```pdb_id, method, initial_version_time, newest_version_time, resolution```

#### Mutation (All chains, All Entities)
```_entity.pdbx_mutation``` $\rightarrow$ ```mutation_num``` + ```_entity.id```

#### Type (All chains but have to split, Some Entities)
```_entity_poly.entity_id``` + ```_entity_poly.pdbx_strand_id``` + ```_entity_poly.type```

#### SEQRES (All chains, Some Entities)

```
_pdbx_poly_seq_scheme.mon_id
_pdbx_poly_seq_scheme.ndb_seq_num
_pdbx_poly_seq_scheme.pdb_seq_num
_pdbx_poly_seq_scheme.auth_seq_num
_pdbx_poly_seq_scheme.pdb_mon_id
_pdbx_poly_seq_scheme.auth_mon_id
_pdbx_poly_seq_scheme.pdb_strand_id # (All Chains)
_pdbx_poly_seq_scheme.pdb_ins_code
```

#### Ligand (Some chains)
```
_struct_conn.conn_type_id
_struct_conn.ptnr1_auth_asym_id
_struct_conn.ptnr1_auth_comp_id
_struct_conn.ptnr1_auth_seq_id
_struct_conn.ptnr2_auth_asym_id
_struct_conn.ptnr2_auth_comp_id
_struct_conn.ptnr2_auth_seq_id
_struct_conn.ptnr1_auth_asym_id_index
_struct_conn.ptnr1_auth_asym_id_li
```

In [63]:
# Demo run of MMCIF_unit.script_fun on four PDB entries.
demo_pdb_ids = ['2xyn', '5js2', '6oon', '6d7s']
# Second positional argument of script_fun.
# NOTE(review): presumably a pair of paths -- confirm against script_fun.
demo_paths = ['./', './']

mmcif_demo = MMCIF_unit()
mmcif_demo.script_fun(demo_pdb_ids, demo_paths)

['2xyn', '5js2', '6oon', '6d7s']
Empty DataFrame
Columns: [_pdbx_poly_seq_scheme.mon_id, _pdbx_poly_seq_scheme.pdb_strand_id, _struct_conn.ptnr2_auth_asym_id]
Index: []


ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

download_mmcif_file(): C:/Users/Nature/Desktop/LiGroup/Filter_new_20190123/doc_in/5wts.cif
