# Muta3DMaps(v 1.0) Usage

## Import Packages

In [1]:
import sys
sys.path.append("/data/zzf/Work/SIFTS_Plus_Muta_Maps/src/py_src/")
import pandas as pd
import numpy as np
import os
from RetrievePDB import *
from UniProt_unit import UniProt_unit
from MMCIFplus import *
from SIFTS_unit import SIFTS_unit

## 1 建立数据集

### 1.1 UniProt ID Mapping数据集的创建

在本软件的适用任务中，常常需要将序列上的突变位点信息与三维结构数据中的位点对应起来。当序列的ID标识符不是UniProt数据库的UniProt ID（例如RefSeq的转录本ID标识符）时，就需要先对这类ID标识符进行转换。

#### 1.1.1 输入数据样例

下面使用范例数据进行演示：

In [2]:
# All file I/O in this notebook happens under `folder`.
# The location can be overridden with the MUTA3DMAPS_DEMO_FOLDER environment
# variable; when it is unset the original demo location is used unchanged.
# os.path.join(..., "") guarantees a trailing separator, because other cells
# build file paths by plain string concatenation (folder + "name").
folder = os.path.join(
    os.environ.get("MUTA3DMAPS_DEMO_FOLDER", "/home/zzf/Work/demo_files/Muta3dMaps/"), "")
# Tab-separated demo table holding the mutations to be mapped.
demo_file = folder + "demo_muta.tsv"
demo_df = pd.read_csv(demo_file, sep='\t')
demo_df.head()

Unnamed: 0,mutation_unp,GENE,RefSeq_protein
0,K45E,SAMD11,NP_689699
1,P293A,SAMD11,NP_689699
2,G76S,AGRN,NP_940978
3,N105I,AGRN,NP_940978
4,A375S,AGRN,NP_940978


> 在此范例数据中，位点信息依附于特定RefSeq ID标识符上，且有对应基因

#### 1.1.2 目标列及对应信息

依据范例数据的情况来指定下列参数:
* ```id_col```: RefSeq_protein
* ```id_type```: RefSeq Protein -> P_REFSEQ_AC
* ```muta_col```: mutation_unp
* ```gene_col```: GENE
* ```muta_type```: mutation in UniProt Site

In [3]:
# Names of the relevant columns in the input table, and the type of the
# identifier column as expected by the ID-mapping step.
id_col, muta_col, gene_col = 'RefSeq_protein', 'mutation_unp', 'GENE'
id_type = 'P_REFSEQ_AC'

# Columns requested from the ID-mapping service.
usecols = [
    'id',
    'genes',
    'reviewed',
    'comment(ALTERNATIVE%20PRODUCTS)',
    'organism',
    'protein%20names',
]

# Output locations (all inside `folder`):
#   reportPath        - textual report of the data processing
#   rawOutputPath     - raw ID-mapping result
#   handledOutputPath - final, post-processed ID-mapping result
reportPath = '%s%s' % (folder, 'Report.txt')
rawOutputPath = '%s%s' % (folder, 'rawMapping.tsv')
handledOutputPath = '%s%s' % (folder, 'modifiedMapping.tsv')

# Row filters as column -> (value, operator); 'ne' means !=, 'eq' means ==.
constraint_dict = dict(
    GENE_status=(False, "ne"),
    Status=("reviewed", "eq"),
    unp_map_tage=("Untrusted & No Isoform", "ne"),
)

#### 1.1.3 获取数据

In [4]:
# Build the ID-mapping helper for the demo table.
unp_demo = UniProt_unit(demo_df, id_col, id_type, usecols, reportPath, muta_col=muta_col, gene_col=gene_col)
try:
    # get_raw_ID_Mapping is documented in this notebook as returning True on
    # success; stop here instead of post-processing a missing/partial raw file.
    if not unp_demo.get_raw_ID_Mapping(rawOutputPath):
        raise RuntimeError("Failed to retrieve the raw ID mapping result: %s" % rawOutputPath)
    # Deal with the different mapping situations
    handled_ID_Mapping = unp_demo.handle_ID_Mapping()
    # Add gene status
    unp_demo.getGeneStatus(handled_ID_Mapping)
    # Label mapping status according to the filtering constraints
    unp_demo.label_mapping_status(handled_ID_Mapping, constraint_dict)
finally:
    # Always close the report file handle, even when a step above fails, so
    # the report on disk is complete and the handle is not leaked.
    unp_demo.report.close()
# Output the final result
handled_ID_Mapping.to_csv(handledOutputPath, sep='\t', index=False)

#### 1.1.4 样例输出

In [5]:
# Display the post-processed ID-mapping table produced by the previous cell.
handled_ID_Mapping

Unnamed: 0,Entry,Gene names,Status,Alternative products (isoforms),Organism,Protein names,canonical_isoform,unp_map_tage,yourlist,UniProt,GENE,GENE_status,Mapping_status
0,Q96NU1,SAMD11,reviewed,ALTERNATIVE PRODUCTS: Event=Alternative promo...,Homo sapiens (Human),Sterile alpha motif domain-containing protein ...,Q96NU1-3,Untrusted & No Isoform,NP_689699,Q96NU1,SAMD11,True,No
1,P43489,TNFRSF4 TXGP1L,reviewed,,Homo sapiens (Human),Tumor necrosis factor receptor superfamily mem...,,Trusted & No Isoform,NP_003318,P43489,TNFRSF4,True,Yes
2,A0A024R084,SDF4 hCG_19193,unreviewed,,Homo sapiens (Human),"Stromal cell derived factor 4, isoform CRA_c",,Trusted & No Isoform,NP_057260,A0A024R084,SDF4,True,No
3,Q96L58,B3GALT6,reviewed,,Homo sapiens (Human),"Beta-1,3-galactosyltransferase 6 (Beta-1,3-Gal...",,Trusted & No Isoform,NP_542172,Q96L58,B3GALT6,True,Yes
4,A7WPU8,SRY hCG_1733150,unreviewed,,Homo sapiens (Human),Sex-determining region Y protein,,Trusted & No Isoform,NP_003131,A7WPU8,SRY,True,No
5,Q05066,SRY TDF,reviewed,,Homo sapiens (Human),Sex-determining region Y protein (Testis-deter...,,Trusted & No Isoform,NP_003131,Q05066,SRY,True,Yes
6,A0A024R189,TBL1Y hCG_1997773,unreviewed,,Homo sapiens (Human),"Transducin (Beta)-like 1Y-linked, isoform CRA_a",,Trusted & No Isoform,NP_150600,A0A024R189,TBL1Y,True,No
7,Q9BQ87,TBL1Y TBL1,reviewed,,Homo sapiens (Human),F-box-like/WD repeat-containing protein TBL1Y ...,,Trusted & No Isoform,NP_150600,Q9BQ87,TBL1Y,True,Yes
8,A0A024R9E7,NLGN4Y hCG_1988457,unreviewed,,Homo sapiens (Human),"Neuroligin 4, Y-linked, isoform CRA_d",,Trusted & No Isoform,NP_055708,A0A024R9E7,NLGN4Y,True,No
9,O00468,AGRN AGRIN,reviewed,ALTERNATIVE PRODUCTS: Event=Alternative splic...,Homo sapiens (Human),Agrin [Cleaved into: Agrin N-terminal 110 kDa ...,O00468-1,Trusted & Isoform,NP_940978,O00468-6,AGRN,True,Yes


#### 1.1.5 Report Statistic

可以看到各列数据的缺失值情况、未匹配到UniProt ID的RefSeq ID以及匹配错误数据等

In [6]:
# Show the processing report written during the ID-mapping step.
with open(reportPath, "rt") as reportFile:
    report_text = reportFile.read()
print(report_text)

# RAW ID MAPPING FILE RESULT
# /home/zzf/Work/demo_files/Muta3dMaps/rawMapping.tsv
Entry                              0
Gene names                         0
Status                             0
Alternative products (isoforms)    8
Organism                           0
Protein names                      0
yourlist                           0
isomap                             9
dtype: int64
# All id: 8
# Unmapped id: 0
# Untrusted id: 1
NP_689699
# Error id: 0
Empty DataFrame
Columns: [Entry, Gene names, Status, Alternative products (isoforms), Organism, Protein names, canonical_isoform, unp_map_tage, yourlist, UniProt, GENE, GENE_status, Mapping_status]
Index: []


### 1.2 获取整合好的突变信息

In [7]:
# Print the mutations collected per input ID by the UniProt_unit instance.
print(unp_demo.muta_li)

RefSeq_protein
NP_003131    [I68T, F67L, F67V, A66T, N65H, N65D, M64I, M64...
NP_003318                                               [R65C]
NP_055708                                              [I679V]
NP_057260                                              [D295N]
NP_150600                                        [D69H, R176W]
NP_542172    [R6W, L26P, V61L, S65G, P67L, T79A, D144N, D15...
NP_689699                                        [K45E, P293A]
NP_940978    [G76S, N105I, A375S, T435M, G510S, L1176P, R15...
dtype: object


### 1.3 SIFTS数据集的创建

要将UniProt Sequence(包括isoform)上的突变位点信息映射到PDB结构上，就必须进行序列比对以获得残基水平的一一对应关系，而这项工作已由SIFTS完成。

SIFTS已将PDB的SEQRES序列与各UniProt Isoform进行序列比对，并且经常更新。通过调用SIFTS的[API接口](http://www.ebi.ac.uk/pdbe/api/mappings/all_isoforms/:accession)，解析JSON格式的返回数据，即可获取PDB相关链与UniProt Isoform的残基位置对应关系、覆盖范围以及序列相似性等信息。残基对应关系中的PDB残基号码是从1开始计数的SEQRES残基索引。

#### 1.3.1 获取原始数据集

In [8]:
# File names of the SIFTS data sets written during this section:
#   raw_sifts_file_path           - raw SIFTS data
#   add_rangeInfo_sifts_file_path - SIFTS data plus the merged coverage-range information
#   add_InDe_sifts_file_path      - SIFTS data labelled with whether a PDB chain has an
#                                   insertion/deletion relative to the UniProt isoform
raw_sifts_file_path = 'pdb_uniprot_SIFTS_raw_demo.tsv'
add_rangeInfo_sifts_file_path = 'pdb_uniprot_SIFTS_addRangeInfo_demo.tsv'
add_InDe_sifts_file_path = 'pdb_uniprot_SIFTS_delwithInDe_demo.tsv'
# Folder in which intermediate files are stored
SIFTS_unit.CONFIG['DOWNLOAD_FOLDER'] = folder

# Initialise
sifts_demo = SIFTS_unit()
# Only UniProt entries whose mapping was labelled "Yes" are looked up.
trusted_mask = handled_ID_Mapping["Mapping_status"] == "Yes"
trusted_entries = handled_ID_Mapping.loc[trusted_mask, "Entry"].to_list()
# Collect the UniProt and PDB IDs for which SIFTS currently provides a UniProt-PDB mapping
info_dict = sifts_demo.get_info_from_uniprot_pdb_file(related_unp=trusted_entries)
# PDB IDs whose SIFTS information will be fetched
sifts_demo.pdb_list = sorted(info_dict['pdb_set'])
# Fetch the raw SIFTS data
sifts_demo.get_raw_SIFTS(outputPath=SIFTS_unit.CONFIG['DOWNLOAD_FOLDER'] + raw_sifts_file_path)

getSiftsInfo(): Start to get the pdb info from SIFTS. 1d0a
getSiftsInfo(): End a circle.[ 1d0a ] current: 1 ALL: 12
getSiftsInfo(): Start to get the pdb info from SIFTS. 1hry
getSiftsInfo(): End a circle.[ 1hry ] current: 2 ALL: 12
getSiftsInfo(): Start to get the pdb info from SIFTS. 1hrz
getSiftsInfo(): End a circle.[ 1hrz ] current: 3 ALL: 12
getSiftsInfo(): Start to get the pdb info from SIFTS. 1j46
getSiftsInfo(): End a circle.[ 1j46 ] current: 4 ALL: 12
getSiftsInfo(): Start to get the pdb info from SIFTS. 1j47
getSiftsInfo(): End a circle.[ 1j47 ] current: 5 ALL: 12
getSiftsInfo(): Start to get the pdb info from SIFTS. 2gzk
getSiftsInfo(): End a circle.[ 2gzk ] current: 6 ALL: 12
getSiftsInfo(): Start to get the pdb info from SIFTS. 2hev
getSiftsInfo(): End a circle.[ 2hev ] current: 7 ALL: 12
getSiftsInfo(): Start to get the pdb info from SIFTS. 2hey
getSiftsInfo(): End a circle.[ 2hey ] current: 8 ALL: 12
getSiftsInfo(): Start to get the pdb info from SIFTS. 6edb
getSiftsInfo(

(0, [])

#### 1.3.2 处理原始数据集

解析SIFTS提供的残基对应关系，可以间接判断出SEQRES序列相对于完整蛋白序列(Uniprot Sequence)的差异，包括Insertion，Deletion以及SEQRES序列头尾的差异部分 (SEQRES序列在头或尾具有而Uniprot序列不具有的部分)。

我们将原本SIFTS提供的pdb_start, pdb_end, unp_start, unp_end 残基水平对应关系转换为区间格式，如下表sifts_pdb_range, sifts_unp_range所示。

In [9]:
# Post-process the raw SIFTS data: add the merged coverage-range information.
range_output_path = SIFTS_unit.CONFIG['DOWNLOAD_FOLDER'] + add_rangeInfo_sifts_file_path
handle_sifts_df = sifts_demo.handle_SIFTS(outputPath=range_output_path)
# Preview only the identifying columns and the derived range columns.
range_preview_columns = [
    "pdb_id", "chain_id", "UniProt", "identity",
    "identifier", "is_canonical", "sifts_pdb_range", "sifts_unp_range",
]
handle_sifts_df[range_preview_columns].head()

Unnamed: 0,pdb_id,chain_id,UniProt,identity,identifier,is_canonical,sifts_pdb_range,sifts_unp_range
0,1D0A,A,Q12933,0.994,TRAF2_HUMAN,True,"[[1, 168]]","[[334, 501]]"
1,1D0A,B,Q12933,0.994,TRAF2_HUMAN,True,"[[1, 168]]","[[334, 501]]"
2,1D0A,C,Q12933,0.994,TRAF2_HUMAN,True,"[[1, 168]]","[[334, 501]]"
3,1D0A,D,Q12933,0.994,TRAF2_HUMAN,True,"[[1, 168]]","[[334, 501]]"
4,1D0A,E,Q12933,0.994,TRAF2_HUMAN,True,"[[1, 168]]","[[334, 501]]"


大多数情况下，SIFTS提供的残基水平对应关系转换为区间后只会有一个子区间，pdb与unp子区间对应的残基序列长度是相等的，我们记为Safe。但是，存在特殊情况： 
* 对应子区间的长度并不总是相等 → pdb链相对于uniprot Isoform Sequence发生了Deletion 
* 有时pdb与unp的对应关系会有多个子区间 → pdb链相对于uniprot Isoform Sequence发生了Insertion 
* unp子区间之间并不连续 → pdb链相对于uniprot Isoform Sequence发生了Insertion 以及 Deletion，存在较长差异序列

当发现上述特殊情况中的Insertion以及Insertion & Deletion时，后续可以调用已经封装好的函数进行pair-wise alignment(采用Biopython模块的pairwise2)以确定具体的差异内容，并更正sifts_pdb_range, sifts_unp_range，使得残基对应关系更为精确。

In [10]:
# Post-process the raw SIFTS data: decide whether each PDB chain has an
# insertion/deletion relative to the UniProt isoform sequence and label it.
indel_output_path = SIFTS_unit.CONFIG['DOWNLOAD_FOLDER'] + add_InDe_sifts_file_path
sifts_demo.deal_with_insertionDeletion_SIFTS(sifts_df=handle_sifts_df, outputPath=indel_output_path)
# Bring in the RefSeq IDs; the merge joins on the columns both frames share.
refseq_lookup = handled_ID_Mapping[['UniProt', 'yourlist']].drop_duplicates()
handle_sifts_df = handle_sifts_df.merge(refseq_lookup)
indel_preview_columns = [
    "pdb_id", "chain_id", "UniProt", "yourlist", "identity",
    "sifts_pdb_range", "sifts_unp_range", "delete", "sifts_range_tage",
]
handle_sifts_df[indel_preview_columns].head()

Unnamed: 0,pdb_id,chain_id,UniProt,yourlist,identity,sifts_pdb_range,sifts_unp_range,delete,sifts_range_tage
0,1D0A,G,P43489,NP_003318,0.833,"[[1, 6]]","[[261, 266]]",False,Safe
1,1D0A,H,P43489,NP_003318,0.833,"[[1, 6]]","[[261, 266]]",False,Safe
2,1D0A,I,P43489,NP_003318,0.833,"[[1, 6]]","[[261, 266]]",False,Safe
3,1D0A,J,P43489,NP_003318,0.833,"[[1, 6]]","[[261, 266]]",False,Safe
4,1D0A,K,P43489,NP_003318,0.833,"[[1, 6]]","[[261, 266]]",False,Safe


### 1.4 PDB文件的下载

#### 1.4.1 样例脚本

In [11]:
# PDB entries to download (each ID once)
pdbs = handle_sifts_df.drop_duplicates(subset='pdb_id')['pdb_id']
# Download directory
path = '%s%s' % (folder, "MMCIF/")
# Files are downloaded in mmCIF format by default
mpw = MPWrapper(path)
fail = mpw.ftp_retrieve_batch(pdbs)
# Show the PDB IDs that could not be downloaded
print(fail)

RetrievePDB: {downloadPath: /home/zzf/Work/demo_files/Muta3dMaps/MMCIF/, format: mmCIF, tail: .cif, raw_tail: .cif, prefix: , ftpSite: RCSB, host: ftp.rcsb.org, dividedPath: pub/pdb/data/structures/divided, len(pdbs): 0, len(fail): 0, }
220---------- Welcome to Pure-FTPd [privsep] [TLS] ----------
220-You are user number 2 of 500 allowed.
220-Local time is now 08:20. Server port: 2421.
220-Only anonymous FTP is allowed here
220 You will be disconnected after 15 minutes of inactivity.
Downloading File: /home/zzf/Work/demo_files/Muta3dMaps/MMCIF/1D0A.cif.gz
Downloading File: /home/zzf/Work/demo_files/Muta3dMaps/MMCIF/6EDB.cif.gz
Downloading File: /home/zzf/Work/demo_files/Muta3dMaps/MMCIF/2GZK.cif.gz
Downloading File: /home/zzf/Work/demo_files/Muta3dMaps/MMCIF/2HEV.cif.gz
Downloading File: /home/zzf/Work/demo_files/Muta3dMaps/MMCIF/2HEY.cif.gz
Downloading File: /home/zzf/Work/demo_files/Muta3dMaps/MMCIF/1HRY.cif.gz
Downloading File: /home/zzf/Work/demo_files/Muta3dMaps/MMCIF/1HRZ.cif.gz


In [12]:
# List the contents of the mmCIF download directory.
print(os.listdir(path))

['.ipynb_checkpoints', '1D0A.cif', '6EDB.cif', '2GZK.cif', '2HEV.cif', '2HEY.cif', '1HRY.cif', '1HRZ.cif', '1J46.cif', '1J47.cif', '6OGX.cif', '6OKM.cif', '6OKN.cif']


### 1.5 MMCIF文件信息的获取与整合

#### 1.5.1 建立MMCIF信息数据集

In [13]:
# Output file locations and the folder the mmCIF files are read from
raw_mmcif_path = '%s%s' % (folder, 'rawMMCIF2Dfrm.tsv')
handled_mmcif_path = '%s%s' % (folder, 'handledMMCIF2Dfrm.tsv')
MMCIF_FILE_FOLDER['MMCIF_NEW_FOLDER'] = path

# Initialise
mmcif_demo = MMCIF2Dfrm()
# Check the mmCIF files of `pdbs`; missing files are downloaded automatically
mmcif_demo.check_mmcif_file(pdbs)
# Resume support: PDB IDs already present in a previous result file are
# passed on as finished; without such a file nothing is finished yet.
finished_li = (
    set(pd.read_csv(handled_mmcif_path, sep='\t', usecols=['pdb_id'])['pdb_id'])
    if os.path.exists(handled_mmcif_path)
    else []
)

mmcif_demo.update_mmcif_result(raw_mmcif_path, handled_mmcif_path, finished=finished_li)

RetrievePDB: {downloadPath: /home/zzf/Work/demo_files/Muta3dMaps/MMCIF/, format: mmCIF, tail: .cif, raw_tail: .cif, prefix: , ftpSite: RCSB, host: ftp.rcsb.org, dividedPath: pub/pdb/data/structures/divided, len(pdbs): 0, len(fail): 0, }
data_1D0A
data_2HEV
data_2HEY
data_6OGX
data_6OKM
data_6OKN
data_1HRY
data_1HRZ
data_1J46
data_1J47
data_2GZK
data_6EDB
handle_mmcif_data(): Modified Dict
handle_mmcif_data(): Modified Dfrm


#### 1.5.2 范例输出

In [14]:
# Reload the processed mmCIF table, forcing chain/asym IDs to be read as
# strings and entity IDs as integers.
mmcif_converters = {'chain_id': str, 'asym_id': str, 'entity_id': int}
handled_mmcif_df = pd.read_csv(handled_mmcif_path, sep="\t", converters=mmcif_converters)
handled_mmcif_df.head()

Unnamed: 0,entity_id,protein_type,pdb_id,chain_id,_pdbx_poly_seq_scheme.mon_id,_pdbx_poly_seq_scheme.pdb_mon_id,_pdbx_poly_seq_scheme.auth_mon_id,_pdbx_poly_seq_scheme.ndb_seq_num,_pdbx_poly_seq_scheme.pdb_seq_num,_pdbx_poly_seq_scheme.auth_seq_num,...,mutation_num,metal_ligand_num,Modification_num,seqres_len,coordinates_len,Modification_index,mis_index,mis_range,resolution_score,protein_chain_and_length
0,1,polypeptide(L),1D0A,A,AMADLEQKVLEMEASTYDGVFIWKISDFPRKRQEAVAGRIPAIFSP...,AMADLEQKVLEMEASTYDGVFIWKISDFPRKRQEAVAGRIPAIFSP...,AMADLEQKVLEMEASTYDGVFIWKISDFPRKRQEAVAGRIPAIFSP...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,334;335;336;337;338;339;340;341;342;343;344;34...,334;335;336;337;338;339;340;341;342;343;344;34...,...,0,0,0,168,168,[],[],,2.0,"[(168, ""A""), (168, ""B""), (168, ""C""), (168, ""D""..."
1,1,polypeptide(L),1D0A,B,AMADLEQKVLEMEASTYDGVFIWKISDFPRKRQEAVAGRIPAIFSP...,AMADLEQKVLEMEASTYDGVFIWKISDFPRKRQEAVAGRIPAIFSP...,AMADLEQKVLEMEASTYDGVFIWKISDFPRKRQEAVAGRIPAIFSP...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,334;335;336;337;338;339;340;341;342;343;344;34...,334;335;336;337;338;339;340;341;342;343;344;34...,...,0,0,0,168,168,[],[],,2.0,"[(168, ""A""), (168, ""B""), (168, ""C""), (168, ""D""..."
2,1,polypeptide(L),1D0A,C,AMADLEQKVLEMEASTYDGVFIWKISDFPRKRQEAVAGRIPAIFSP...,AMADLEQKVLEMEASTYDGVFIWKISDFPRKRQEAVAGRIPAIFSP...,AAAALAAAVLAMEASTYDGVFIWKISDFARKRQEAVAGRIPAIFSP...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,334;335;336;337;338;339;340;341;342;343;344;34...,334;335;336;337;338;339;340;341;342;343;344;34...,...,0,0,0,168,168,[],[],,2.0,"[(168, ""A""), (168, ""B""), (168, ""C""), (168, ""D""..."
3,1,polypeptide(L),1D0A,D,AMADLEQKVLEMEASTYDGVFIWKISDFPRKRQEAVAGRIPAIFSP...,AMADLEQKVLEMEASTYDGVFIWKISDFPRKRQEAVAGRIPAIFSP...,AMADLEQKVLEMEASTYDGVFIWKISDFPRKRQEAVAGRIPAIFSP...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,334;335;336;337;338;339;340;341;342;343;344;34...,334;335;336;337;338;339;340;341;342;343;344;34...,...,0,0,0,168,168,[],[],,2.0,"[(168, ""A""), (168, ""B""), (168, ""C""), (168, ""D""..."
4,1,polypeptide(L),1D0A,E,AMADLEQKVLEMEASTYDGVFIWKISDFPRKRQEAVAGRIPAIFSP...,AMADLEQKVLEMEASTYDGVFIWKISDFPRKRQEAVAGRIPAIFSP...,AMADLEQKVLEMEASTYDGVFIWKISDFPRKRQEAVAGRIPAIFSP...,1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;1...,334;335;336;337;338;339;340;341;342;343;344;34...,334;335;336;337;338;339;340;341;342;343;344;34...,...,0,0,0,168,168,[],[],,2.0,"[(168, ""A""), (168, ""B""), (168, ""C""), (168, ""D""..."


## 2 整合SIFTS数据与PDB(MMCIF格式)数据

In [15]:
# Combine the SIFTS table (with RefSeq IDs) and the processed mmCIF table,
# then preview the first rows of the result.
sifts_mmcif_df = sifts_demo.add_mmcif_info_SIFTS(sifts_df=handle_sifts_df, mmcif_df=handled_mmcif_df)
sifts_mmcif_df.head()

Unnamed: 0,pdb_id,chain_id,UniProt,identity,identifier,is_canonical,start,end,entity_id,struct_asym_id,...,mutation_num,metal_ligand_num,Modification_num,seqres_len,coordinates_len,Modification_index,mis_index,mis_range,resolution_score,protein_chain_and_length
0,1D0A,G,P43489,0.833,TNR4_HUMAN,True,"{""author_residue_number"": 261, ""author_inserti...","{""author_residue_number"": 266, ""author_inserti...",2,G,...,0,1,1,6,6,[0],[],,2.0,"[(168, ""A""), (168, ""B""), (168, ""C""), (168, ""D""..."
1,1D0A,H,P43489,0.833,TNR4_HUMAN,True,"{""author_residue_number"": 261, ""author_inserti...","{""author_residue_number"": 266, ""author_inserti...",2,H,...,0,1,1,6,6,[0],[],,2.0,"[(168, ""A""), (168, ""B""), (168, ""C""), (168, ""D""..."
2,1D0A,I,P43489,0.833,TNR4_HUMAN,True,"{""author_residue_number"": 261, ""author_inserti...","{""author_residue_number"": 266, ""author_inserti...",2,I,...,0,1,1,6,6,[0],[],,2.0,"[(168, ""A""), (168, ""B""), (168, ""C""), (168, ""D""..."
3,1D0A,J,P43489,0.833,TNR4_HUMAN,True,"{""author_residue_number"": 261, ""author_inserti...","{""author_residue_number"": 266, ""author_inserti...",2,J,...,0,1,1,6,6,[0],[],,2.0,"[(168, ""A""), (168, ""B""), (168, ""C""), (168, ""D""..."
4,1D0A,K,P43489,0.833,TNR4_HUMAN,True,"{""author_residue_number"": 261, ""author_inserti...","{""author_residue_number"": 266, ""author_inserti...",2,K,...,0,1,1,6,6,[0],[],,2.0,"[(168, ""A""), (168, ""B""), (168, ""C""), (168, ""D""..."


## 3 将突变从UniProt映射至PDB链

### 3.1 对Deletion相关数据进行修正

当发现上述特殊情况中的Insertion以及Insertion & Deletion时，后续可以调用已经封装好的函数进行pair-wise alignment(采用Biopython模块的pairwise2)以确定具体的差异内容，并更正sifts_pdb_range, sifts_unp_range，使得残基对应关系更为精确

In [16]:
# Path template for the FASTA files; '%s' is left as a placeholder
# (presumably filled with a UniProt ID by SIFTS_unit - TODO confirm).
# os.path.join avoids the doubled separator that `folder + '/fasta_files/...'`
# produced, since `folder` already ends with '/'.
unp_fasta_files_path = os.path.join(folder, 'fasta_files', '%s.fasta')
update_sifts_mmcif_df = sifts_demo.update_range_info_SIFTS(unp_fasta_files_path, sifts_df=sifts_mmcif_df)
update_sifts_mmcif_df.head()

Unnamed: 0,pdb_id,chain_id,UniProt,identity,identifier,is_canonical,start,end,entity_id,struct_asym_id,...,Modification_num,seqres_len,coordinates_len,Modification_index,mis_index,mis_range,resolution_score,protein_chain_and_length,new_sifts_unp_range,new_sifts_pdb_range
0,1D0A,G,P43489,0.833,TNR4_HUMAN,True,"{""author_residue_number"": 261, ""author_inserti...","{""author_residue_number"": 266, ""author_inserti...",2,G,...,1,6,6,[0],[],,2.0,"[(168, ""A""), (168, ""B""), (168, ""C""), (168, ""D""...","[[261, 266]]","[[1, 6]]"
1,1D0A,H,P43489,0.833,TNR4_HUMAN,True,"{""author_residue_number"": 261, ""author_inserti...","{""author_residue_number"": 266, ""author_inserti...",2,H,...,1,6,6,[0],[],,2.0,"[(168, ""A""), (168, ""B""), (168, ""C""), (168, ""D""...","[[261, 266]]","[[1, 6]]"
2,1D0A,I,P43489,0.833,TNR4_HUMAN,True,"{""author_residue_number"": 261, ""author_inserti...","{""author_residue_number"": 266, ""author_inserti...",2,I,...,1,6,6,[0],[],,2.0,"[(168, ""A""), (168, ""B""), (168, ""C""), (168, ""D""...","[[261, 266]]","[[1, 6]]"
3,1D0A,J,P43489,0.833,TNR4_HUMAN,True,"{""author_residue_number"": 261, ""author_inserti...","{""author_residue_number"": 266, ""author_inserti...",2,J,...,1,6,6,[0],[],,2.0,"[(168, ""A""), (168, ""B""), (168, ""C""), (168, ""D""...","[[261, 266]]","[[1, 6]]"
4,1D0A,K,P43489,0.833,TNR4_HUMAN,True,"{""author_residue_number"": 261, ""author_inserti...","{""author_residue_number"": 266, ""author_inserti...",2,K,...,1,6,6,[0],[],,2.0,"[(168, ""A""), (168, ""B""), (168, ""C""), (168, ""D""...","[[261, 266]]","[[1, 6]]"


### 3.2 映射

#### 3.2.1 范例脚本

In [17]:
# Attach to each row the UniProt-numbered mutations recorded for its source ID.
# unp_demo.muta_li is a Series indexed by that ID; .map() leaves NaN for an ID
# without recorded mutations instead of raising KeyError, and such rows are
# skipped by the guards below.
update_sifts_mmcif_df['mutation_unp'] = update_sifts_mmcif_df['yourlist'].map(unp_demo.muta_li)

# Rows that can be mapped: both the PDB range and the mutation list are present
# (a missing value shows up as a float NaN).
mappable_rows = [
    not isinstance(pdb_range, float) and not isinstance(mutations, float)
    for pdb_range, mutations in zip(update_sifts_mmcif_df['new_sifts_pdb_range'],
                                    update_sifts_mmcif_df['mutation_unp'])
]

# Filled as a side effect of the mapper with the mapping status of each mapped row
# (assumes one entry is appended per call - verify against SIFTS_unit).
muta_info_li = []


def _map_row_mutations(row):
    """Map one row's mutations from UniProt to PDB positions; NaN when inputs are missing."""
    if isinstance(row['new_sifts_pdb_range'], float) or isinstance(row['mutation_unp'], float):
        return np.nan
    return SIFTS_unit.map_muta_from_unp_to_pdb(
        row, 'mutation_unp', 'new_sifts_unp_range', 'new_sifts_pdb_range',
        muta_info_li, unp_fasta_files_path)


# Mutation sites expressed in PDB numbering
update_sifts_mmcif_df['mutation_pdb'] = update_sifts_mmcif_df.apply(_map_row_mutations, axis=1)
# Mapping status, aligned with exactly the rows that were passed to the mapper
# (rows lacking either a mutation list or a PDB range are excluded).
update_sifts_mmcif_df['muta_map_info'] = pd.Series(
    muta_info_li, index=update_sifts_mmcif_df.index[mappable_rows])

#### 3.2.2 范例输出

对于下面的输出数据，可以看到突变数据从RefSeq到PDB三维结构位点的映射情况，参考```muta_map_info```列可得知具体映射结果：
* Safe: 突变位点成功映射到PDB三维结构上，且氨基酸残基正确对应
* PossibleMutation: 突变位点成功映射到PDB三维结构上，但氨基酸残基错误对应
* EntityMutation:  突变位点成功映射到PDB三维结构上，但氨基酸残基错误对应，且可知对应PDB链上该位点本身存在突变数据
* Missing：突变位点成功映射到PDB三维结构上，但是该位点缺失空间坐标信息
* Unmapped #：突变位点没有映射到PDB三维结构上

In [18]:
# Display the identifying columns together with the mapped ranges, the
# mutations in UniProt and PDB numbering, and the per-mutation mapping status.
update_sifts_mmcif_df[['yourlist', 'UniProt', 'pdb_id', 'chain_id', 'identity', 'new_sifts_pdb_range', 'new_sifts_unp_range', 'sifts_range_tage','mutation_unp', 'mutation_pdb', 'muta_map_info']]

Unnamed: 0,yourlist,UniProt,pdb_id,chain_id,identity,new_sifts_pdb_range,new_sifts_unp_range,sifts_range_tage,mutation_unp,mutation_pdb,muta_map_info
0,NP_003318,P43489,1D0A,G,0.833,"[[1, 6]]","[[261, 266]]",Safe,[R65C],[#],[Unmapped #: R65C]
1,NP_003318,P43489,1D0A,H,0.833,"[[1, 6]]","[[261, 266]]",Safe,[R65C],[#],[Unmapped #: R65C]
2,NP_003318,P43489,1D0A,I,0.833,"[[1, 6]]","[[261, 266]]",Safe,[R65C],[#],[Unmapped #: R65C]
3,NP_003318,P43489,1D0A,J,0.833,"[[1, 6]]","[[261, 266]]",Safe,[R65C],[#],[Unmapped #: R65C]
4,NP_003318,P43489,1D0A,K,0.833,"[[1, 6]]","[[261, 266]]",Safe,[R65C],[#],[Unmapped #: R65C]
5,NP_003318,P43489,1D0A,L,0.833,"[[1, 6]]","[[261, 266]]",Safe,[R65C],[#],[Unmapped #: R65C]
6,NP_003318,P43489,2HEV,R,1.0,"[[5, 146]]","[[29, 170]]",Safe,[R65C],[65],[Safe]
7,NP_003318,P43489,2HEY,R,1.0,"[[5, 146]]","[[29, 170]]",Safe,[R65C],[65],[Safe]
8,NP_003318,P43489,2HEY,T,1.0,"[[5, 146]]","[[29, 170]]",Safe,[R65C],[65],[Safe]
9,NP_003318,P43489,6OGX,G,0.929,"[[22, 163]]","[[29, 170]]",Safe,[R65C],[65],[Safe]


## *4 Interactome3D meta-data的获取

> 该步骤与上述步骤不一定相关

In [19]:
from Interactome3D_unit import Interactome3D_unit

interactDemo = Interactome3D_unit()
# Folder into which the original files are downloaded
interactDemo.CONFIG['DOWNLOAD_FOLDER'] = folder
# Fetch the processed, integrated interactions table and write it next to the downloads
interactions_output_path = '%s%s' % (interactDemo.CONFIG['DOWNLOAD_FOLDER'], 'interactions_modified.tsv')
interact_df = interactDemo.get_interactions_meta(outputPath=interactions_output_path)
interact_df.head()

Unnamed: 0,TYPE,PDB_ID,BIO_UNIT,FILENAME,group_compo,PROT,CHAIN,MODEL,SEQ_IDENT,COVERAGE,SEQ_BEGIN,SEQ_END,DOMAIN,model_len,model_range
0,Structure,2MSE,1,A0A024RAV5-P02647-EXP-2mse.pdb1-B-0-A-0.pdb,A0A024RAV5_P02647,A0A024RAV5,B,0,100.0,98.4,1,185,-,185,"[[1, 185]]"
1,Structure,2MSE,1,A0A024RAV5-P10398-EXP-2mse.pdb1-B-0-D-0.pdb,A0A024RAV5_P10398,A0A024RAV5,B,0,100.0,98.4,1,185,-,185,"[[1, 185]]"
2,Structure,4OV6,1,A0A075B5G3-Q8NBP7-EXP-4ov6.pdb1-F-0-B-0.pdb,A0A075B5G3_Q8NBP7,A0A075B5G3,F,0,100.0,100.0,1,99,-,99,"[[1, 99]]"
3,Structure,4OV6,2,A0A075B5G3-Q8NBP7-EXP-4ov6.pdb2-G-0-E-0.pdb,A0A075B5G3_Q8NBP7,A0A075B5G3,G,0,100.0,97.0,1,96,-,96,"[[1, 96]]"
4,Structure,4OV6,1,A0A075B5G3-Q8NBP7-EXP-4ov6.pdb1-F-0-A-0.pdb,A0A075B5G3_Q8NBP7,A0A075B5G3,F,0,100.0,100.0,1,99,-,99,"[[1, 99]]"
