In [1]:
# Execute the helper script inside this notebook session so the names it defines are available below.
%run 'pymol_and_pdb_functions.py'
# start_pymol is not defined in this notebook; presumably it comes from the script above — TODO confirm.
start_pymol()

In [2]:
import itertools

# Run the shared functions notebook. Names used later without a visible import
# (os, pd, Bio, OrderedDict, Counter, FloatProgress, ...) presumably come from it
# or from the script run in the first cell — TODO confirm.
%run 'ks01_Functions_only.ipynb'
# Neither of the two names below is referenced again in the visible cells.
notebook_prefix = 'ks16'
image_counter = Counter()



### Reading 24-mer structure

In [3]:
# Path of the assembly PDB file inside the structure predictions folder.
his3_aligned_to_4lom_assembly_file = os.path.join(
    structure_predictions_folder,
    'his3_24mer_assembly',
    'his3_swiss_aligned_to_4lom_assembly.pdb',
)

# Parse the file and keep its first model for the distance calculations below.
structure = Bio.PDB.PDBParser().get_structure(
    'his3_swiss_assembly',
    his3_aligned_to_4lom_assembly_file,
)
model = structure[0]



### Calculating distances

In [4]:
# For every amino-acid residue of chain A, record the minimum distance (taken over
# all chains of the model) to: every amino-acid position of chain A, each Mn ion
# (302-304), the closer of Mn 302/303 ('Mn_substrate_bound') and the substrate (301).
# The result is distance_dict[row_label][column_label], both levels insertion-ordered.
distance_dict = OrderedDict()
prefix = 'res_'

constant_chain = model['A']
constant_residues = list(constant_chain.get_residues())
positions_in_crystal = [r.id[1] for r in constant_residues if r.get_resname() in aa3]

# Sequence numbers of the hetero residues that distances are measured to.
mn_positions = [302, 303, 304]
substrate_position = 301


def _residue_with_number(chain, position):
    """Return the first residue of `chain` whose sequence number equals `position`.

    Raises IndexError when the chain has no such residue.
    """
    return [r for r in chain if position == r.id[1]][0]


def _closest_copy_distance(residue, partners):
    """Return the smallest get_distance_between_residues(residue, p) over `partners`."""
    return min(get_distance_between_residues(residue, partner) for partner in partners)


# The partner residues do not depend on the residue being measured, so they are
# looked up once here instead of once per residue of chain A. A residue missing
# from any chain therefore fails here, before the main loop starts.
all_chains = list(model.get_chains())
residue_partners = dict(
    (position, [chain[position] for chain in all_chains])
    for position in positions_in_crystal)
hetero_partners = dict(
    (position, [_residue_with_number(chain, position) for chain in all_chains])
    for position in mn_positions + [substrate_position])

f = FloatProgress(min=0, max=len(constant_residues))
display(f)

for residue in constant_residues:
    if residue.get_resname() in aa3:
        row = OrderedDict()
        distance_dict[prefix + '%s' % residue.id[1]] = row

        # distances to other residues
        for position in positions_in_crystal:
            row[prefix + '%s' % position] = _closest_copy_distance(residue, residue_partners[position])

        # distances to Mn ions
        for position in mn_positions:
            row['Mn_' + '%s' % position] = _closest_copy_distance(residue, hetero_partners[position])
        row['Mn_substrate_bound'] = min(row['Mn_' + '302'], row['Mn_' + '303'])

        # distances to substrate
        row['substrate'] = _closest_copy_distance(residue, hetero_partners[substrate_position])
    # The progress bar advances for every residue of chain A, amino acid or not.
    f.value += 1

In [5]:
def _position_from_label(label):
    """Return the integer that follows the first four characters of a row label."""
    return int(label[4:])


# One row per residue label; columns are the distance keys collected above.
structural_data = pd.DataFrame.from_dict(distance_dict, orient='index')

# Order rows numerically by position rather than by the label string.
labels_by_position = sorted(structural_data.index.values, key=_position_from_label)
structural_data = structural_data.reindex(labels_by_position)
structural_data['position'] = structural_data.index.map(_position_from_label)

### Secondary structure

In [21]:
# information from PyMol?
# Each (start, stop) pair is a half-open range of positions, i.e. stop is excluded.
helices = []
for start, stop in [(60, 74), (89, 109), (157, 172), (185, 205)]:
    helices.extend(range(start, stop))

sheets = []
for start, stop in [(5, 12), (15, 23), (52, 55), (76, 84),
                    (117, 125), (127, 135), (138, 145), (173, 182)]:
    sheets.extend(range(start, stop))

# NOTE(review): the range stops at len(Scer_Uniprot) - 1; confirm whether the
# last position is meant to be excluded.
disordered = [position for position in range(1, len(Scer_Uniprot))
              if not (position in helices or position in sheets)]

def get_secondary_structure(position):
    """Classify a position as 'helix', 'sheet' or 'disordered'.

    The position is looked up in the module-level `helices` and `sheets`
    collections; a position found in neither is reported as 'disordered'.

    Raises AssertionError if the position is listed in both collections.
    """
    in_helix = position in helices
    in_sheet = position in sheets
    # Logical `not`, not bitwise `~`: ~True == -2 and ~False == -1 are both
    # truthy, so an assert written with `~` can never fail.
    assert not (in_helix and in_sheet), \
        'position %s is annotated as both helix and sheet' % position
    if in_helix:
        return 'helix'
    elif in_sheet:
        return 'sheet'
    else:
        return 'disordered'

# Label every row with the secondary-structure class of its position.
structural_data['secondary_structure'] = structural_data['position'].map(get_secondary_structure)

### Finding interface residues

In [None]:
ascii_letters_upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# Reset the session and load only the assembly (cmd is presumably PyMOL's command module).
cmd.reinitialize()
cmd.load(his3_aligned_to_4lom_assembly_file, 'his3_swiss_assembly')

# Every unordered pair of the first 24 chain letters, in the same order that
# combinations over the indices 0..23 would produce.
chain_pairs = list(itertools.combinations(ascii_letters_upper[:24], 2))

interfaces = {}
f = FloatProgress(min=0, max=len(chain_pairs))
display(f)
for chain1, chain2 in chain_pairs:
    interfaces[chain1, chain2] = interfaceResidues(
        'his3_swiss_assembly', cA='c. %s' % chain1, cB='c. %s' % chain2)
    f.value += 1

# Pool the second field of every returned entry across all chain pairs,
# convert to int, drop duplicates and sort.
interface_residues = sorted(set(
    int(entry[1]) for hits in interfaces.values() for entry in hits))

# True for rows whose position occurs in at least one pairwise interface.
structural_data['interface'] = structural_data.index.map(
    lambda label: int(label[4:]) in interface_residues)

### Reading data on sign epistasis from Lucas

In [7]:
# Relative >>> absolute position
# Read the translation table and index it by its 'relative_position' column.
position_translation = pd.read_table(
    files_dump_folder + 'position_translation.csv'
).set_index('relative_position')

def get_absolute_position(segment_number, relative_position):
    """Translate a position within a segment into an absolute position.

    Reads the module-level `position_translation` table: the row is selected
    positionally with `iloc[relative_position]`, the column is
    'S' + str(segment_number), and the value is returned as an int.
    """
    # NOTE(review): the lookup is positional (iloc), not by the
    # 'relative_position' index label — confirm the two always coincide.
    segment_column = 'S' + str(segment_number)
    translation_row = position_translation.iloc[relative_position]
    return int(translation_row[segment_column])

In [8]:
lucas_sign_epistasis = pd.read_csv(files_dump_folder + 'sign_epistasis/' + 'lucas_sign_epistasis.csv')
lucas_reciprocal_sign_epistasis = pd.read_csv(files_dump_folder + 'sign_epistasis/' + 'lucas_reciprocal_sign_epistasis.csv')

p_value_threshold = 0.01
filtered = lucas_sign_epistasis[lucas_sign_epistasis['pBon'] < p_value_threshold]
sign_epistasis_positions = set.union(set(filtered['VarPos_absolute'].values), 
                                                set(filtered['SubPos_absolute'].values))
sign_epistasis_positions = sorted([int(s) for s in sign_epistasis_positions])
print len(sign_epistasis_positions), 'positions under sign epistasis'

reciprocal_sign_epistasis_positions = set.union(set(lucas_reciprocal_sign_epistasis['position1'].values), 
                                                set(lucas_reciprocal_sign_epistasis['position1'].values))
reciprocal_sign_epistasis_positions = sorted([int(s) for s in reciprocal_sign_epistasis_positions])
print len(reciprocal_sign_epistasis_positions), 'positions under reciprocal sign epistasis'

structural_data['lucas_sign_epistasis'] = structural_data.index.map(lambda s: int(s[4:]) in sign_epistasis_positions)
structural_data['lucas_reciprocal_sign_epistasis'] = structural_data.index.map(lambda s: int(s[4:]) in reciprocal_sign_epistasis_positions)

94 positions under sign epistasis
59 positions under reciprocal sign epistasis


### Saving files

In [9]:
# Write the table twice: HDF under the key 'data', and CSV with the row labels
# turned into an ordinary column.
hdf_path = files_dump_folder + 'structural_data_for_predicted_24mer.hdf'
csv_path = files_dump_folder + 'structural_data_for_predicted_24mer.csv'

structural_data.to_hdf(hdf_path, 'data')
structural_data.reset_index().to_csv(csv_path, index=False)