# Analyze not-found mutations
Analyze the reasons why MicroMiner does not find certain benchmark mutations.

In [None]:
%matplotlib inline

import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
import tempfile
from pathlib import Path

# Project root dir. The path can be overridden with the MICROMINER_EVALUATION_ROOT
# environment variable so the notebook also runs on other machines; if the variable
# is unset, the original location is used.
root_dir = Path(os.environ.get('MICROMINER_EVALUATION_ROOT', '/local/sieg/projekte/microminer_evaluation'))
# Put the project root first on the module search path so the `helper` imports
# that follow can be resolved.
sys.path.insert(0, str(root_dir.resolve()))

import helper
from helper.constants import one_2_three_dict
from helper.tmalign import call_tmalign, parse_stdout
from helper.utils import unpack_gz
from helper.datasets.utils import get_pdb_file_path

In [2]:
# Result directories of the two mutation experiments (search mode and pair mode).
exp_search_dir = root_dir.joinpath('results', 'mutation_experiments', 'search')
exp_pair_dir = root_dir.joinpath('results', 'mutation_experiments', 'pair')

# Names of all mutation data sets that come with wild-type/mutant structure pairs.
dataset_collection = helper.get_dataset_collection()
datasets = [
    mutation_dataset.name
    for mutation_dataset in dataset_collection.get_mutation_datasets_with_structure_pairs()
]
datasets

['protherm', 'platinum', 'thermomutdb', 'shanthirabalan']

In [3]:
# Three-letter -> one-letter codes for the 20 amino acids listed here.
aa3to1 = {
    'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q',
    'LYS': 'K', 'ILE': 'I', 'PRO': 'P', 'THR': 'T',
    'PHE': 'F', 'ASN': 'N', 'GLY': 'G', 'HIS': 'H',
    'LEU': 'L', 'ARG': 'R', 'TRP': 'W', 'ALA': 'A',
    'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M',
}

# Every column of the reviewed-mutation table is handled as a string.
dtypes_dict = dict.fromkeys(
    ['pdb_wild', 'pdb_mutant', 'wild_chain', 'wild_aa', 'seq_num', 'mutation_aa', 'reason'],
    str,
)

In [4]:
# Load the table of manually reviewed mutations. The file is tab-separated
# (despite its .csv extension) and all columns listed in dtypes_dict are read as strings.
df_problematic = pd.read_csv(
    root_dir.joinpath('data', 'problematic_mutations_iteration5.csv'),
    sep='\t',
    header=0,
    dtype=dtypes_dict,
)
df_problematic

Unnamed: 0,pdb_wild,pdb_mutant,wild_chain,wild_aa,seq_num,mutation_aa,reason
0,2LZM,128L,A,A,93,T,both A
1,1POH,1SPH,A,S,46,A,to D not A
2,2LZM,1L68,A,N,68,A,both N
3,1LZ1,1GAY,A,V,74,G,both V
4,1LZ1,1GAZ,A,V,74,I,both V
...,...,...,...,...,...,...,...
184,1CNQ,1YXI,A,A,54,L,PPI. Chain C missing in 1YXI
185,2TDM,1TSY,A,R,179,K,PPI. Chain B missing in 1TSY
186,2TDM,1TSV,A,R,179,A,PPI. Chain B missing in 1TSV
187,1AMK,1QDS,A,E,65,Q,PPI. Chain B missing in 1QDS


In [6]:
# Collect, for every data set, the mutations that MicroMiner did not find and that
# are NOT explained by an entry of the manually reviewed table `df_problematic`.

# Key columns identifying a mutation in the per-dataset report. They are read as
# strings so that they compare equal to the string-typed keys of `df_problematic`.
relevant_cols_dtypes = {helper.WILD_COL: str, helper.MUTANT_COL: str, helper.WILD_AA: str, helper.MUT_AA: str, helper.WILD_SEQ_NUM: str}

df_list = []
for dataset in datasets:

    # read all mutations not found with MM
    # The optional wild-type chain column is forced to str as well: a purely numeric
    # chain id would otherwise be parsed as an integer and could not be merged with
    # the string-typed 'wild_chain' key. read_csv ignores dtype entries for columns
    # that are absent from the file, so data sets without chain annotation are unaffected.
    df_all_notfound = pd.read_csv(exp_search_dir / dataset / 'report' / f'{dataset}_not_found_eval.tsv', sep='\t', header=0,
                                  dtype={**relevant_cols_dtypes, helper.WILD_CHAIN: str})

    # use wild-type chain id if mutation data set has annotation for it
    # (both lists must stay aligned position by position: they are the merge keys)
    relevant_cols_dataset = list(relevant_cols_dtypes)
    relevant_cols_reviewed_mutations = ['pdb_wild', 'pdb_mutant', 'wild_aa', 'mutation_aa', 'seq_num']
    if helper.WILD_CHAIN in df_all_notfound.columns:
        relevant_cols_dataset.append(helper.WILD_CHAIN)
        relevant_cols_reviewed_mutations.append('wild_chain')
    df_all_notfound = df_all_notfound[relevant_cols_dataset]
    assert not df_all_notfound.duplicated().any(), f'duplicated mutations in not-found report of {dataset}'

    # remove mutations which we checked manually and found they are erroneously annotated
    # or do not fit MMs matching criteria
    df_merged = df_all_notfound.merge(df_problematic, left_on=relevant_cols_dataset,
                                      right_on=relevant_cols_reviewed_mutations,
                                      how='left', indicator=True)
    # Keep only rows without a reviewed counterpart. `.loc` + `.assign` builds a new
    # frame instead of writing into a filtered slice (avoids SettingWithCopyWarning).
    df_merged = df_merged.loc[df_merged['_merge'] == 'left_only', relevant_cols_dataset].assign(dataset=dataset)
    df_list.append(df_merged)

    # drop the loop-local names; the filtered frame stays alive through df_list,
    # only the full not-found report is released
    df_merged = None
    df_all_notfound = None

df_notfound = pd.concat(df_list)
print('These are the cases where MicroMiner failed unexpectedly:')
df_notfound

These are the cases where MicroMiner failed unexpectedly:


Unnamed: 0,wild_pdb,mut_pdb,wild_aa,mut_aa,wild_seq_num,dataset,wild_chain


In [7]:

# # read all mutations not found by MicroMiner
# dataset = datasets[3]
# df_notfound = pd.read_csv(exp_search_dir / dataset / 'report' / f'{dataset}_not_found_eval.tsv', sep='\t', header=0,
#                           usecols=[helper.WILD_COL, helper.MUTANT_COL, helper.WILD_AA, helper.MUT_AA, helper.WILD_SEQ_NUM])
# # df_notfound['wild_aa'] = df_notfound['wild_aa3'].apply(lambda aa: aa3to1[aa])
# # df_notfound['mutant_aa'] = df_notfound['mutant_aa3'].apply(lambda aa: aa3to1[aa])

# assert df_notfound.shape[0] == df_notfound.drop_duplicates().shape[0]

# # sort out the mutations which we checked manually and found to be erroneous.
# left_on = [helper.WILD_COL, helper.MUTANT_COL, helper.WILD_AA, helper.MUT_AA, helper.WILD_SEQ_NUM]
# right_on = ['pdb_wild', 'pdb_mutant', 'wild_aa', 'mutation_aa', 'seq_num']
# 
# df_merged = df_notfound.merge(df_problematic2, left_on=left_on, right_on=right_on,
#                            how='left', indicator=True)
# df_notfound = df_merged[df_merged['_merge'] == 'left_only'][left_on]

# # for the remaining mutations not found by MicroMiner we run TMalign to check if they are at least related
# tmalign_results = []
# for wild_name, mutant_name in zip(df_notfound[helper.WILD_COL], df_notfound[helper.MUTANT_COL]):
#     with tempfile.TemporaryDirectory() as tmpdirname:
#         tmpdir_path = Path(tmpdirname)
#         pdb_query_path = get_pdb_file_path(wild_name, mirror=dataset)
#         pdb_target_path = get_pdb_file_path(mutant_name)
    
#         # tmalign can not read gz files
#         if pdb_query_path.suffixes[-1] == '.gz':
#             unpack_gz(pdb_query_path, tmpdir_path / pdb_query_path.stem)
#             pdb_query_path = tmpdir_path / pdb_query_path.stem
#         if pdb_target_path.suffixes[-1] == '.gz':
#             unpack_gz(pdb_target_path, tmpdir_path / pdb_target_path.stem)
#             pdb_target_path = tmpdir_path / pdb_target_path.stem
            
#         out_prefix = f'tmalign_stdout_{pdb_query_path.name}_{pdb_target_path.name}'
#         stdout = call_tmalign(pdb_query_path, pdb_target_path, tmpdir_path / out_prefix,
#                               out_prefix,
#                               split_flag=2,
#                               ter_flag=0,
#                               write_rotation=False, raise_error=True)

#         if not stdout:
#             raise ValueError('Failed TMalign calculation')

#         info_dicts = parse_stdout(stdout.decode(), read_alignment=False)
#         for info_d in info_dicts:
#             info_d[helper.WILD_COL] = wild_name[3:].upper() if wild_name.startswith('pdb') else wild_name.upper()
#             info_d[helper.MUTANT_COL] = mutant_name[3:].upper() if mutant_name.startswith('pdb') else mutant_name.upper()
#         tmalign_results.extend(info_dicts)
        
# # print(tmalign_results)

# df_tmalign = pd.DataFrame(tmalign_results)
# df_tmalign_filtered = df_tmalign.sort_values(['tm_score1', 'tm_score2']).drop_duplicates(['id1', 'id2'])
# df_notfound_tmalign = df_tmalign_filtered.merge(df_notfound, on=[helper.WILD_COL, helper.MUTANT_COL])
# df_notfound_tmalign = df_notfound_tmalign.drop(['id1', 'id2'], axis=1)
# df_notfound_tmalign

In [8]:
# Hand-curated list of mutations not found by MicroMiner, together with the reason why
# each one is not found. Note that this is not the complete list.
#
# Each record uses the same keys as the columns of the reviewed-mutation table
# (see dtypes_dict): wild-type PDB id, mutant PDB id, wild-type chain, wild-type
# amino acid (one-letter), sequence number, mutant amino acid (one-letter), reason.
# NOTE(review): several entries below are commented out and therefore excluded from
# the table; the reason for disabling them is not recorded here.

prob_dict = [
    # NOTE(review): this first group has no data set label in the original;
    # presumably protherm entries - TODO confirm.
    {'pdb_wild': '1BNI', 'pdb_mutant': '1BGD', 'wild_chain': 'A', 'wild_aa': 'I',	'seq_num': '96', 'mutation_aa': 'V', 'reason': 'different proteins'},
    {'pdb_wild': '2RN2', 'pdb_mutant': '1RBN', 'wild_chain': 'A', 'wild_aa': 'K',	'seq_num': '95', 'mutation_aa': 'N', 'reason': 'different proteins'},
    {'pdb_wild': '1LZ1', 'pdb_mutant': '1YAG', 'wild_chain': 'A', 'wild_aa': 'I',	'seq_num': '89', 'mutation_aa': 'V', 'reason': 'different proteins'},
    {'pdb_wild': '1LZ1', 'pdb_mutant': '1GA2', 'wild_chain': 'A', 'wild_aa': 'V',	'seq_num': '2', 'mutation_aa': 'I', 'reason': 'different proteins'},
    {'pdb_wild': '1LZ1', 'pdb_mutant': '1GA0', 'wild_chain': 'A', 'wild_aa': 'V',	'seq_num': '2', 'mutation_aa': 'L', 'reason': 'different proteins'},
    {'pdb_wild': '1LZ1', 'pdb_mutant': '1GA0', 'wild_chain': 'A', 'wild_aa': 'V',	'seq_num': '74', 'mutation_aa': 'L', 'reason': 'different proteins'},
    {'pdb_wild': '1LZ1', 'pdb_mutant': '1GA0', 'wild_chain': 'A', 'wild_aa': 'V',	'seq_num': '110', 'mutation_aa': 'L', 'reason': 'different proteins'},
    {'pdb_wild': '1ARR', 'pdb_mutant': '1MYK', 'wild_chain': 'A', 'wild_aa': 'P',	'seq_num': '8', 'mutation_aa': 'L', 'reason': 'nearby terminus unresolved in 1MYK'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1QTH', 'wild_chain': 'A', 'wild_aa': 'A',	'seq_num': '98', 'mutation_aa': 'M', 'reason': 'adjacent C 97 A'},
    {'pdb_wild': '1BPI', 'pdb_mutant': '1BPT', 'wild_chain': 'A', 'wild_aa': 'Y',	'seq_num': '23', 'mutation_aa': 'A', 'reason': 'terminal G57,A58 unresolved in 1BPT'},
    {'pdb_wild': '4LYZ', 'pdb_mutant': '1JKB', 'wild_chain': 'A', 'wild_aa': 'E',	'seq_num': '35', 'mutation_aa': 'A', 'reason': 'multiple mutations; adjacent F 34 W'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1QS5', 'wild_chain': 'A', 'wild_aa': 'A',	'seq_num': '98', 'mutation_aa': 'L', 'reason': 'adjacent C 97 A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L95', 'wild_chain': 'A', 'wild_aa': 'F',	'seq_num': '153', 'mutation_aa': 'V', 'reason': 'C 97 A within 6.5A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L86', 'wild_chain': 'A', 'wild_aa': 'F',	'seq_num': '153', 'mutation_aa': 'I', 'reason': 'C 97 A within 6.5A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L85', 'wild_chain': 'A', 'wild_aa': 'F',	'seq_num': '153', 'mutation_aa': 'A', 'reason': 'C 97 A within 6.5A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '244L', 'wild_chain': 'A', 'wild_aa': 'I',	'seq_num': '100', 'mutation_aa': 'A', 'reason': 'near mutation C 97 A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L55', 'wild_chain': 'A', 'wild_aa': 'D',	'seq_num': '92', 'mutation_aa': 'N', 'reason': 'near mutation C 97 A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1QSB', 'wild_chain': 'A', 'wild_aa': 'A',	'seq_num': '98', 'mutation_aa': 'C', 'reason': 'adjacent C 97 A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1QS9', 'wild_chain': 'A', 'wild_aa': 'A',	'seq_num': '98', 'mutation_aa': 'V', 'reason': 'adjacent C 97 A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L88', 'wild_chain': 'A', 'wild_aa': 'F',	'seq_num': '153', 'mutation_aa': 'M', 'reason': 'C 97 A within 6.5A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L77', 'wild_chain': 'A', 'wild_aa': 'M',	'seq_num': '102', 'mutation_aa': 'L', 'reason': 'near mutation C 97 A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L66', 'wild_chain': 'A', 'wild_aa': 'K',	'seq_num': '43', 'mutation_aa': 'A', 'reason': 'C 54 T within 6.5A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L87', 'wild_chain': 'A', 'wild_aa': 'F',	'seq_num': '153', 'mutation_aa': 'L', 'reason': 'C 97 A within 6.5A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '125L', 'wild_chain': 'A', 'wild_aa': 'A',	'seq_num': '98', 'mutation_aa': 'S', 'reason': 'adjacent C 97 A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L90', 'wild_chain': 'A', 'wild_aa': 'L',	'seq_num': '99', 'mutation_aa': 'A', 'reason': 'near mutation C 97 A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '243L', 'wild_chain': 'A', 'wild_aa': 'I',	'seq_num': '58', 'mutation_aa': 'A', 'reason': 'near mutation C 54 T'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L67', 'wild_chain': 'A', 'wild_aa': 'L',	'seq_num': '46', 'mutation_aa': 'A', 'reason': 'near mutation C 54 T'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '126L', 'wild_chain': 'A', 'wild_aa': 'V',	'seq_num': '149', 'mutation_aa': 'T', 'reason': 'C 97 A within 6.5A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L65', 'wild_chain': 'A', 'wild_aa': 'D',	'seq_num': '47', 'mutation_aa': 'A', 'reason': 'near mutation C 54 T'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '224L', 'wild_chain': 'A', 'wild_aa': 'A',	'seq_num': '93', 'mutation_aa': 'S', 'reason': 'near mutation C 97 A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L91', 'wild_chain': 'A', 'wild_aa': 'L',	'seq_num': '99', 'mutation_aa': 'F', 'reason': 'near mutation C 97 A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '129L', 'wild_chain': 'A', 'wild_aa': 'A',	'seq_num': '93', 'mutation_aa': 'T', 'reason': 'near mutation C 97 A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L94', 'wild_chain': 'A', 'wild_aa': 'L',	'seq_num': '99', 'mutation_aa': 'V', 'reason': 'near mutation C 97 A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L92', 'wild_chain': 'A', 'wild_aa': 'L',	'seq_num': '99', 'mutation_aa': 'I', 'reason': 'near mutation C 97 A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L93', 'wild_chain': 'A', 'wild_aa': 'L',	'seq_num': '99', 'mutation_aa': 'M', 'reason': 'near mutation C 97 A'},
    {'pdb_wild': '1LZ1', 'pdb_mutant': '2BQO', 'wild_chain': 'A', 'wild_aa': 'V',	'seq_num': '99', 'mutation_aa': 'A', 'reason': 'near mutation C 95 A'},
    {'pdb_wild': '1LZ1', 'pdb_mutant': '2BQN', 'wild_chain': 'A', 'wild_aa': 'V',	'seq_num': '93', 'mutation_aa': 'A', 'reason': 'near mutation C 95 A'},
    {'pdb_wild': '1LZ1', 'pdb_mutant': '2BQE', 'wild_chain': 'A', 'wild_aa': 'I',	'seq_num': '59', 'mutation_aa': 'V', 'reason': 'near mutation C 95 A'},
    {'pdb_wild': '1LZ1', 'pdb_mutant': '2BQG', 'wild_chain': 'A', 'wild_aa': 'V',	'seq_num': '100', 'mutation_aa': 'A', 'reason': 'C 95 A within 6.5A'},
    {'pdb_wild': '1LZ1', 'pdb_mutant': '2BQM', 'wild_chain': 'A', 'wild_aa': 'V',	'seq_num': '74', 'mutation_aa': 'A', 'reason': 'near mutation C 77 A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '242L', 'wild_chain': 'A', 'wild_aa': 'I',	'seq_num': '50', 'mutation_aa': 'A', 'reason': 'near mutation C 54 T'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '237L', 'wild_chain': 'A', 'wild_aa': 'V',	'seq_num': '149', 'mutation_aa': 'A', 'reason': 'C 97 A within 6.5A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '230L', 'wild_chain': 'A', 'wild_aa': 'M',	'seq_num': '6', 'mutation_aa': 'L', 'reason': 'near mutation C 97 A'},
    {'pdb_wild': '2LZM', 'pdb_mutant': '1L54', 'wild_chain': 'A', 'wild_aa': 'M',	'seq_num': '102', 'mutation_aa': 'K', 'reason': 'near mutation C 97 A'},
    
    # thermomutdb 
    #2LZM 	1QUD 	L 	G 	99
    {'pdb_wild': '2LZM', 'pdb_mutant': '1QUD', 'wild_chain': 'A', 'wild_aa': 'L',	'seq_num': '99', 'mutation_aa': 'G', 'reason': 'near mutation C 97 A'},
    # 2RN5 	1KVB 	D 	N 	134
    {'pdb_wild': '2RN5', 'pdb_mutant': '1KVB', 'wild_chain': 'A', 'wild_aa': 'D',	'seq_num': '134', 'mutation_aa': 'N', 'reason': 'different proteins'},
    # 1PGA 	6HKA 	A 	L 	34
    {'pdb_wild': '1PGA', 'pdb_mutant': '6HKA', 'wild_chain': 'A', 'wild_aa': 'A',	'seq_num': '34', 'mutation_aa': 'L', 'reason': 'different proteins'},
    # 1R2Y 	3GQ8 	Q 	R 	234
    {'pdb_wild': '1R2Y', 'pdb_mutant': '3GQ8', 'wild_chain': 'A', 'wild_aa': 'Q',	'seq_num': '234', 'mutation_aa': 'R', 'reason': 'different proteins'},
    # 1R2Y 	3GQ9 	R 	E 	244
    {'pdb_wild': '1R2Y', 'pdb_mutant': '3GQ9', 'wild_chain': 'A', 'wild_aa': 'R',	'seq_num': '244', 'mutation_aa': 'E', 'reason': 'different proteins'},
    # 2RN4 	1KVB 	K 	G 	95
    {'pdb_wild': '2RN4', 'pdb_mutant': '1KVB', 'wild_chain': 'A', 'wild_aa': 'K',	'seq_num': '95', 'mutation_aa': 'G', 'reason': 'different proteins'},
    # 2EWN 	4WF2 	G 	A 	142
    {'pdb_wild': '2EWN', 'pdb_mutant': '4WF2', 'wild_chain': 'A', 'wild_aa': 'G',	'seq_num': '142', 'mutation_aa': 'A', 'reason': '4WF2 has nearby chain break'},
    # 4EYB 	5JQJ 	M 	V 	154
    {'pdb_wild': '4EYB', 'pdb_mutant': '5JQJ', 'wild_chain': 'A', 'wild_aa': 'M',	'seq_num': '154', 'mutation_aa': 'V', 'reason': 'Q 151 R within 6.5A'},
    # 1OH0 	1DMM 	Y 	F 	57
    {'pdb_wild': '1OH0', 'pdb_mutant': '1DMM', 'wild_chain': 'A', 'wild_aa': 'Y',	'seq_num': '57', 'mutation_aa': 'F', 'reason': 'near chain break in 1DMM'},
    
    # shanthirabalan
    # 2J8C 	3DUQ 	N 	D 	5
    {'pdb_wild': '2J8C', 'pdb_mutant': '3DUQ', 'wild_chain': 'M', 'wild_aa': 'N',	'seq_num': '5', 'mutation_aa': 'D', 'reason': 'nearby terminus unresolved in 3DUQ'},
    # 4BFL 	3TTT 	F 	Y 	413
#     {'pdb_wild': '4BFL', 'pdb_mutant': '3TTT', 'wild_chain': 'A', 'wild_aa': 'F',	'seq_num': '413', 'mutation_aa': 'Y', 'reason': 'PPI: near other mutation chain B F 413 Y'},
    # 2DEK 	2EH2 	V 	M 	18
#     {'pdb_wild': '2DEK', 'pdb_mutant': '2EH2', 'wild_chain': 'A', 'wild_aa': 'V',	'seq_num': '18', 'mutation_aa': 'M', 'reason': 'PPI: near other mutation chain B V 18 M'},
    # 2DEK 	2ELE 	V 	C 	18
#     {'pdb_wild': '2DEK', 'pdb_mutant': '2ELE', 'wild_chain': 'A', 'wild_aa': 'V',	'seq_num': '18', 'mutation_aa': 'C', 'reason': 'PPI: near other mutation chain B V 18 C'},
    # 2DEK 	2ELD 	L 	M 	160
#     {'pdb_wild': '2DEK', 'pdb_mutant': '2ELD', 'wild_chain': 'A', 'wild_aa': 'L',	'seq_num': '160', 'mutation_aa': 'M', 'reason': 'PPI: near other mutation chain B L 160 M'},
    # 2DEK 	2PB4 	K 	M 	132
#     {'pdb_wild': '2DEK', 'pdb_mutant': '2PB4', 'wild_chain': 'A', 'wild_aa': 'K',	'seq_num': '132', 'mutation_aa': 'M', 'reason': 'PPI: near other mutation chain B K 132 M'},
    # 2ILI 	4QK1 	K 	P 	170
    {'pdb_wild': '2ILI', 'pdb_mutant': '4QK1', 'wild_chain': 'A', 'wild_aa': 'K',	'seq_num': '170', 'mutation_aa': 'P', 'reason': 'terminal H3 unresolved in 4QK1'},
    # 2ILI 	3RGE 	W 	H 	5
    {'pdb_wild': '2ILI', 'pdb_mutant': '3RGE', 'wild_chain': 'A', 'wild_aa': 'W',	'seq_num': '5', 'mutation_aa': 'H', 'reason': 'terminal H3,H4 unresolved in 3RGE'},
    #1LW9 	1LLH 	T 	I 	157
    {'pdb_wild': '1LW9', 'pdb_mutant': '1LLH', 'wild_chain': 'A', 'wild_aa': 'T',	'seq_num': '157', 'mutation_aa': 'I', 'reason': 'terminal N163,L164 unresolved in 1LLH'},
    # 5DEI 	1PI3 	E 	Q 	29
#     {'pdb_wild': '5DEI', 'pdb_mutant': '1PI3', 'wild_chain': 'A', 'wild_aa': 'E',	'seq_num': '29', 'mutation_aa': 'Q', 'reason': 'PPI. Chain C missing in 1PI3'},
    # 5DEI 	4K9M 	H 	N 	282
#     {'pdb_wild': '5DEI', 'pdb_mutant': '4K9M', 'wild_chain': 'A', 'wild_aa': 'H',	'seq_num': '282', 'mutation_aa': 'N', 'reason': 'PPI. Chain C missing in 4K9M'},
    # 5DEI 	1PO7 	E 	A 	29
#     {'pdb_wild': '5DEI', 'pdb_mutant': '1PO7', 'wild_chain': 'A', 'wild_aa': 'E',	'seq_num': '29', 'mutation_aa': 'A', 'reason': 'PPI. Chain C missing in 1PO7'},
    # 5DEI 	4MPR 	R 	E 	142
#     {'pdb_wild': '5DEI', 'pdb_mutant': '4MPR', 'wild_chain': 'A', 'wild_aa': 'R',	'seq_num': '142', 'mutation_aa': 'E', 'reason': 'PPI. Chain B,D missing in 4MPR'},
    # 5DEI 	4MQ5 	A 	F 	307
#     {'pdb_wild': '5DEI', 'pdb_mutant': '4MQ5', 'wild_chain': 'A', 'wild_aa': 'A',	'seq_num': '307', 'mutation_aa': 'F', 'reason': 'PPI. Chain D missing in 4MQ5'},
    # 5DEI 	4JUD 	S 	T 	27
#     {'pdb_wild': '5DEI', 'pdb_mutant': '4JUD', 'wild_chain': 'A', 'wild_aa': 'S',	'seq_num': '27', 'mutation_aa': 'T', 'reason': 'PPI. Chain C missing in 4JUD'},
    # 5DEI 	4JU8 	H 	F 	71
#     {'pdb_wild': '5DEI', 'pdb_mutant': '4JU8', 'wild_chain': 'A', 'wild_aa': 'H',	'seq_num': '71', 'mutation_aa': 'F', 'reason': 'PPI. Chain C missing in 4JU8'},
    # 5DEI 	4K9K 	H 	Y 	282
#     {'pdb_wild': '5DEI', 'pdb_mutant': '4K9K', 'wild_chain': 'A', 'wild_aa': 'H',	'seq_num': '282', 'mutation_aa': 'Y', 'reason': 'PPI. Chain C missing in 4K9K'},
    # 5DEI 	5DGT 	H 	A 	71
#     {'pdb_wild': '5DEI', 'pdb_mutant': '5DGT', 'wild_chain': 'A', 'wild_aa': 'H',	'seq_num': '71', 'mutation_aa': 'A', 'reason': 'PPI. Chain C missing in 5DGT'},
    # 5DEI 	4JU9 	H 	L 	71
#     {'pdb_wild': '5DEI', 'pdb_mutant': '4JU9', 'wild_chain': 'A', 'wild_aa': 'H',	'seq_num': '71', 'mutation_aa': 'L', 'reason': 'PPI. Chain C missing in 4JU9'},
    # 5DEI 	4JUA 	H 	S 	71
#     {'pdb_wild': '5DEI', 'pdb_mutant': '4JUA', 'wild_chain': 'A', 'wild_aa': 'H',	'seq_num': '71', 'mutation_aa': 'S', 'reason': 'PPI. Chain C missing in 4JUA'},
    # 5DEI 4GG1 	L 	T 	404 	shanthirabalan 	A
#     {'pdb_wild': '5DEI', 'pdb_mutant': '4GG1', 'wild_chain': 'A', 'wild_aa': 'L',	'seq_num': '404', 'mutation_aa': 'T', 'reason': 'PPI. Chain C missing in 4GG1'},
    # 5DEI 	4GM0 	L 	N 	404 	shanthirabalan 	A
#     {'pdb_wild': '5DEI', 'pdb_mutant': '4GM0', 'wild_chain': 'A', 'wild_aa': 'L',	'seq_num': '404', 'mutation_aa': 'N', 'reason': 'PPI. Chain C missing in 4GM0'},
    # 5DEI 	4GM1 	L 	S 	404 	shanthirabalan 	A
#     {'pdb_wild': '5DEI', 'pdb_mutant': '4GM1', 'wild_chain': 'A', 'wild_aa': 'L',	'seq_num': '404', 'mutation_aa': 'S', 'reason': 'PPI. Chain C missing in 4GM1'},
    # 5DEI 	4GM4 	L 	I 	404 	shanthirabalan 	A
#     {'pdb_wild': '5DEI', 'pdb_mutant': '4GM4', 'wild_chain': 'A', 'wild_aa': 'L',	'seq_num': '404', 'mutation_aa': 'I', 'reason': 'PPI. Chain C missing in 4GM4'},
    # 5DEI 	4GP9 	L 	F 	404 	shanthirabalan 	A
#     {'pdb_wild': '5DEI', 'pdb_mutant': '4GP9', 'wild_chain': 'A', 'wild_aa': 'L',	'seq_num': '404', 'mutation_aa': 'F', 'reason': 'PPI. Chain C missing in 4GP9'},
    # 5DEI 	4GPE 	L 	M 	404 	shanthirabalan 	A
#     {'pdb_wild': '5DEI', 'pdb_mutant': '4GPE', 'wild_chain': 'A', 'wild_aa': 'L',	'seq_num': '404', 'mutation_aa': 'M', 'reason': 'PPI. Chain C missing in 4GPE'},
    # 5DEI 	4JD5 	L 	E 	404 	shanthirabalan 	A
#     {'pdb_wild': '5DEI', 'pdb_mutant': '4JD5', 'wild_chain': 'A', 'wild_aa': 'L',	'seq_num': '404', 'mutation_aa': 'E', 'reason': 'PPI. Chain C missing in 4JD5'},


    # platinum (note wild-type PDB files are custom in this dataset; not plain from PDB)
    # 1U5B 	1X7X 	S 	E 	292
    {'pdb_wild': '1U5B', 'pdb_mutant': '1X7X', 'wild_chain': 'A', 'wild_aa': 'S',	'seq_num': '292', 'mutation_aa': 'E', 'reason': 'Specified position is not resolved in 1X7X'},
    # 1U5B 	1X7Y 	S 	N 	292
    {'pdb_wild': '1U5B', 'pdb_mutant': '1X7Y', 'wild_chain': 'A', 'wild_aa': 'S',	'seq_num': '292', 'mutation_aa': 'N', 'reason': 'Specified position is not resolved in 1X7Y'},
    # 1U5B 	1X7W 	S 	Q 	292
    {'pdb_wild': '1U5B', 'pdb_mutant': '1X7W', 'wild_chain': 'A', 'wild_aa': 'S',	'seq_num': '292', 'mutation_aa': 'Q', 'reason': 'Specified position is not resolved in 1X7W'},
    # 2CA8 	2F8F 	Y 	F 	10
    {'pdb_wild': '2CA8', 'pdb_mutant': '2F8F', 'wild_chain': 'A', 'wild_aa': 'Y',	'seq_num': '10', 'mutation_aa': 'F', 'reason': 'terminal A207,A208,T209,P210,F211 unresolved in 2F8F'},
    # 3DTB 	3MOH 	A 	G 	467
    {'pdb_wild': '3DTB', 'pdb_mutant': '3MOH', 'wild_chain': 'A', 'wild_aa': 'A',	'seq_num': '467', 'mutation_aa': 'G', 'reason': 'Specified position is not resolved in 3MOH'},
    # 3NU3 	3NU5 	I 	V 	50
    {'pdb_wild': '3NU3', 'pdb_mutant': '3NU5', 'wild_chain': 'A', 'wild_aa': 'I',	'seq_num': '50', 'mutation_aa': 'V', 'reason': 'PPI: near other mutation chain B I 150 V'},
    # 3OXC 	3CYX 	I 	V 	50
    {'pdb_wild': '3OXC', 'pdb_mutant': '3CYX', 'wild_chain': 'A', 'wild_aa': 'I',	'seq_num': '50', 'mutation_aa': 'V', 'reason': 'PPI: near other mutation chain B I 150 V'},
    # 2Z4O 	2QD6 	I 	V 	50
    {'pdb_wild': '2Z4O', 'pdb_mutant': '2QD6', 'wild_chain': 'A', 'wild_aa': 'I',	'seq_num': '50', 'mutation_aa': 'V', 'reason': 'PPI: near other mutation chain B I 150 V'},
    # 2JBZ 	2WDY 	D 	A 	111
    {'pdb_wild': '2JBZ', 'pdb_mutant': '2WDY', 'wild_chain': 'A', 'wild_aa': 'D',	'seq_num': '111', 'mutation_aa': 'A', 'reason': 'PPI. Chain C missing in 2WDY'},
    # 3DT7 	3MOE 	A 	G 	467
    {'pdb_wild': '3DT7', 'pdb_mutant': '3MOE', 'wild_chain': 'A', 'wild_aa': 'A',	'seq_num': '467', 'mutation_aa': 'G', 'reason': 'Specified position is not resolved in 3MOE'},
    # 1CNQ 	1YXI 	A 	L 	54
    {'pdb_wild': '1CNQ', 'pdb_mutant': '1YXI', 'wild_chain': 'A', 'wild_aa': 'A',	'seq_num': '54', 'mutation_aa': 'L', 'reason': 'PPI. Chain C missing in 1YXI'},
    # 2TDM 	1TSY 	R 	K 	179
    {'pdb_wild': '2TDM', 'pdb_mutant': '1TSY', 'wild_chain': 'A', 'wild_aa': 'R',	'seq_num': '179', 'mutation_aa': 'K', 'reason': 'PPI. Chain B missing in 1TSY'},
    # 2TDM 	1TSV 	R 	A 	179
    {'pdb_wild': '2TDM', 'pdb_mutant': '1TSV', 'wild_chain': 'A', 'wild_aa': 'R',	'seq_num': '179', 'mutation_aa': 'A', 'reason': 'PPI. Chain B missing in 1TSV'},
    # 1AMK 	1QDS 	E 	Q 	65
    {'pdb_wild': '1AMK', 'pdb_mutant': '1QDS', 'wild_chain': 'A', 'wild_aa': 'E',	'seq_num': '65', 'mutation_aa': 'Q', 'reason': 'PPI. Chain B missing in 1QDS'},
    # 2JBZ 	2WDS 	H 	A 	110 	platinum 	A
    {'pdb_wild': '2JBZ', 'pdb_mutant': '2WDS', 'wild_chain': 'A', 'wild_aa': 'H',	'seq_num': '110', 'mutation_aa': 'A', 'reason': 'PPI. Chain C missing in 2WDS'},


]
# Turn the curated records into a DataFrame and cast its columns to the dtypes
# given in dtypes_dict.
# NOTE(review): despite the 'protherm2' suffix, the table also holds the thermomutdb,
# shanthirabalan and platinum entries listed above.
df_problematic_protherm2 = pd.DataFrame(prob_dict).astype(dtypes_dict)
df_problematic_protherm2

Unnamed: 0,pdb_wild,pdb_mutant,wild_chain,wild_aa,seq_num,mutation_aa,reason
0,1BNI,1BGD,A,I,96,V,different proteins
1,2RN2,1RBN,A,K,95,N,different proteins
2,1LZ1,1YAG,A,I,89,V,different proteins
3,1LZ1,1GA2,A,V,2,I,different proteins
4,1LZ1,1GA0,A,V,2,L,different proteins
...,...,...,...,...,...,...,...
67,1CNQ,1YXI,A,A,54,L,PPI. Chain C missing in 1YXI
68,2TDM,1TSY,A,R,179,K,PPI. Chain B missing in 1TSY
69,2TDM,1TSV,A,R,179,A,PPI. Chain B missing in 1TSV
70,1AMK,1QDS,A,E,65,Q,PPI. Chain B missing in 1QDS


In [9]:
# Combine the previously reviewed mutations with the entries curated in this notebook,
# drop rows that are identical in every column and store the result as a
# tab-separated file. (To export only the entries curated here, concatenate
# [df_problematic_protherm2] alone.)
(
    pd.concat([df_problematic, df_problematic_protherm2])
    .drop_duplicates()
    .to_csv(
        root_dir / 'data' / 'problematic_mutations5.tsv',
        sep='\t',
        header=True,
        index=False,
    )
)