In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
def find_folder_upstream(folder_name, max_iterations=50):
    """Walk up from the current working directory to a folder named `folder_name`.

    Parameters
    ----------
    folder_name : str
        Base name (not a full path) of the directory to look for.
    max_iterations : int, optional
        Maximum number of parent-directory steps to take, starting from the
        current working directory.

    Returns
    -------
    str or None
        Path of the closest matching ancestor (the working directory itself
        counts as a candidate), or None when no match is found within
        `max_iterations` steps or before the filesystem root.
    """
    current_folder = os.getcwd()
    for _ in range(max_iterations + 1):
        if os.path.basename(current_folder) == folder_name:
            # Decide on the name match itself rather than on the step counter,
            # so a folder reached on the last allowed step is still reported.
            return current_folder
        parent_folder = os.path.dirname(current_folder)
        if parent_folder == current_folder:
            # dirname() of the filesystem root is the root itself: nothing left to climb.
            return None
        current_folder = parent_folder
    return None

# Locate the repository checkout by walking up from the working directory.
root_folder_name = 'HIS3InterspeciesEpistasis'
root_folder = find_folder_upstream(root_folder_name)
if not root_folder:
    # Stop here: every path below is derived from root_folder, and joining onto
    # None would only fail on the next line with a far less helpful error.
    raise RuntimeError('Did not find root folder for our github repository.\nPlease run "ks01" notebook from a script inside the HIS3InterspeciesEpistasis folder!')

# The trailing '' makes os.path.join end every folder with a path separator,
# so file names can be appended to these variables by plain concatenation.
data_folder = os.path.join(root_folder, 'Data', '')
small_tables_folder = os.path.join(root_folder, 'Data_Small_Tables', '')
analysis_folder = os.path.join(root_folder, 'Analysis', '')
karen_folder = os.path.join(analysis_folder, 'Karen', '')
figures_folder = os.path.join(karen_folder, 'figures', '')
files_dump_folder = os.path.join(karen_folder, 'files_dump', '')
structure_predictions_folder = os.path.join(files_dump_folder, 'structure_predictions', '')
structure_visualizations_folder = os.path.join(files_dump_folder, 'structure_visualizations', '')
# Built on structure_visualizations_folder instead of a hard-coded '/' separator.
pymol_sessions_folder = os.path.join(structure_visualizations_folder, 'pymol_sessions', '')

In [58]:
# Execute the ks37 notebook in this kernel. According to the summary it prints,
# it provides `positions` and `get_segment_of_a_genotype`; the latter is needed
# further down when the `segment` column of predicted_ddG is filled in.
%run 'ks37_Working_with_positions.ipynb'


Variables: 

- positions:			a pandas.DataFrame with information about position numbers, segments etc

- get_segment_of_a_genotype:	a function that return the segment of genotype (the most likely segment
				because sometimes it's not possible to say for sure)



In [59]:
def get_full_mutation(mutation):
    """Prefix a short mutation code with the reference residue it replaces.

    Parameters
    ----------
    mutation : str
        Position followed by a single residue character, e.g. ``'12A'``.
        The position is 1-based into the module-level ``Scer_Uniprot``
        sequence. An empty string or a missing value (None / NaN) is accepted.

    Returns
    -------
    str
        ``<reference residue><position><new residue>``, or ``''`` for empty
        or missing input.

    Raises
    ------
    IndexError
        If the position lies outside the reference sequence.
    AssertionError
        If the new residue equals the reference residue at that position.
    """
    # pd.isnull replaces the old `not mutation > 0` test, which detected NaN/None
    # only through Python 2 mixed-type ordering and raises TypeError on Python 3.
    if mutation == '' or pd.isnull(mutation):
        return ''
    position = int(mutation[:-1])
    if not 1 <= position <= len(Scer_Uniprot):
        # Without this check, position 0 (or a negative one) would silently
        # wrap around to the end of the sequence.
        raise IndexError('position %d is outside the reference sequence' % position)
    reference_residue = Scer_Uniprot[position - 1]
    assert reference_residue != mutation[-1], \
        'mutation %s does not change the reference residue' % mutation
    return reference_residue + mutation

def convert_to_full_mutations(mut_combination):
    """Convert every mutation of a ':'-separated genotype with get_full_mutation.

    Parameters
    ----------
    mut_combination : str
        Short mutation codes joined by ':'. An empty string or a missing
        value (None / NaN) is accepted.

    Returns
    -------
    str
        The converted mutations joined by ':' in their original order, or
        ``''`` for empty or missing input.
    """
    # pd.isnull replaces the old `not mut_combination > 0` test, which detected
    # NaN/None only through Python 2 mixed-type ordering and raises TypeError on Python 3.
    if mut_combination == '' or pd.isnull(mut_combination):
        return ''
    return ':'.join(get_full_mutation(mutation) for mutation in mut_combination.split(':'))

In [60]:
# Reference sequence used by get_full_mutation (presumably S. cerevisiae HIS3,
# Uniprot P06633, judging by the file name). Read inside a context manager so
# the file handle is closed; trailing whitespace/newlines are stripped.
with open(os.path.join(files_dump_folder, 'HIS3_saccharomyces_cerevisiae_from_Uniprot_P06633.txt')) as sequence_file:
    Scer_Uniprot = sequence_file.read().rstrip()


In [63]:
# Folder holding the Rosetta ddG result tables.
rosetta_folder = os.path.join(analysis_folder, 'Sasha', 'rosetta_runs', '')

# Set 0: count mutations per genotype (':'-separated) and keep only genotypes
# carrying more than two of them.
ddG_set0 = pd.read_table(rosetta_folder + 'run-170503-results-with-explicit-ddG.csv')
ddG_set0['mut_number'] = ddG_set0['mut_list_Scer'].apply(lambda genotype: genotype.count(':') + 1)
ddG_set0 = ddG_set0.loc[ddG_set0['mut_number'] > 2]

# Sets 1 and 2 from the 170522 run.
# NOTE(review): ddG_set1 is loaded but is not part of predicted_ddG below —
# confirm that leaving it out is intended.
ddG_set1 = pd.read_table(rosetta_folder + '170522/' + '170522_aws_ddg_talaris__mutations_to_predict_ddG__set1.csv')
ddG_set2 = pd.read_table(rosetta_folder + '170522/' + '170522_aws_ddg_talaris__mutations_to_predict_ddG__set2.csv')

# Stack set 0 and set 2 and discard the sequence column.
predicted_ddG = pd.concat([ddG_set0, ddG_set2]).drop('aa_seq', axis=1)

# Derived columns: mutation count (recomputed for all rows), full mutation
# names including the reference residue, and the segment of each genotype.
predicted_ddG['mut_number'] = predicted_ddG['mut_list_Scer'].apply(lambda genotype: genotype.count(':') + 1)
predicted_ddG['mut_list_Scer_full'] = predicted_ddG['mut_list_Scer'].apply(convert_to_full_mutations)
predicted_ddG['segment'] = predicted_ddG['mut_list_Scer'].apply(get_segment_of_a_genotype)

# Independent copies restricted to double and single mutants.
predicted_doubles = predicted_ddG.loc[predicted_ddG['mut_number'] == 2].copy()
predicted_singles = predicted_ddG.loc[predicted_ddG['mut_number'] == 1].copy()

In [64]:
# Position of each single mutant: the genotype string without its final character.
predicted_singles['position'] = predicted_singles['mut_list_Scer'].apply(lambda m: int(m[:-1]))

# Mean and variance of ddG per position. The aggregations are named by string:
# pandas has always mapped np.mean/np.var to its own 'mean'/'var' (sample
# variance, ddof=1) here, but newer versions deprecate that implicit remapping.
by_position_mean_ddG = predicted_singles.groupby('position')['ddG'].agg('mean').to_frame()
by_position_var_ddG = predicted_singles.groupby('position')['ddG'].agg('var').to_frame()

# Index predicted_ddG by genotype so get_ddG can look rows up with .loc.
# The guard keeps this cell re-runnable: once the column has become the index,
# a second set_index call would raise KeyError.
if predicted_ddG.index.name != 'mut_list_Scer':
    predicted_ddG = predicted_ddG.set_index('mut_list_Scer')

In [None]:
# Keep one row per full genotype; when a genotype occurs more than once,
# the last occurrence is the one retained.
predicted_ddG = predicted_ddG.drop_duplicates(subset='mut_list_Scer_full', keep='last')

In [65]:
def get_ddG(mutation):
    """Look up the predicted ddG of a genotype in the module-level `predicted_ddG`.

    Parameters
    ----------
    mutation : str
        Index label of `predicted_ddG` to look up.

    Returns
    -------
    The value of the 'ddG' column for that label, or None when the label is
    not present in the index.
    """
    try:
        # A single .loc[row, column] lookup instead of chained indexing:
        # no intermediate row object is built just to read one cell.
        return predicted_ddG.loc[mutation, 'ddG']
    except KeyError:
        return None

In [66]:
# Summary of the names this notebook provides. Written as a call with a single
# string argument, which prints the same text on Python 2 and Python 3.
print('''
Variables: 
- predicted_ddG:\t\tPandas DataFrame with all calculated ddG (single, double and triple mutants (?)),
- predicted_doubles:\t\tdoubles only (Pandas DataFrame),
- predicted_singles:\t\tsingles only (Pandas DataFrame),
- get_ddG:\t\t\tfunction that returns predicted ddG for a genotype
- by_position_mean_ddG:\t\tmean ddG value at each position,
- by_position_var_ddG:\t\tvariance of predicted ddG at each position.
''')


Variables: 
- predicted_ddG:		Pandas DataFrame with all calculated ddG (single, double and triple mutants (?)),
- predicted_doubles:		doubles only (Pandas DataFrame),
- predicted_singles:		singles only (Pandas DataFrame),
- get_ddG:			function that returns predicted ddG for a genotype
- by_position_mean_ddG:		mean ddG value at each position,
- by_position_var_ddG:		variance of predicted ddG at each position.

