In [15]:
import ast
import numbers
import os

import pandas as pd
from Bio import SeqIO

In [16]:
def find_folder_upstream(folder_name, max_iterations=50):
    """Walk up from the current working directory looking for a folder by name.

    Starting at ``os.getcwd()``, the current folder and then each parent in
    turn is checked until one whose base name equals ``folder_name`` is found.

    Parameters
    ----------
    folder_name : str
        Base name (not a path) of the folder to look for.
    max_iterations : int
        Maximum number of steps up the directory tree.  The working directory
        itself plus at most ``max_iterations`` ancestors are examined.

    Returns
    -------
    str or None
        Path of the matching folder, or None when no match is found within
        the allowed number of steps or before the filesystem root.
    """
    current_folder = os.getcwd()
    # "+ 1" so that the folder reached by the last permitted step up is still
    # checked; a post-loop counter test would reject a match found on that step.
    for _ in range(max_iterations + 1):
        if os.path.basename(current_folder) == folder_name:
            return current_folder
        parent_folder = os.path.dirname(current_folder)
        if parent_folder == current_folder:
            # dirname() of the filesystem root is the root itself: nothing
            # further up to inspect, so stop instead of spinning.
            break
        current_folder = parent_folder
    return None

In [17]:
# Locate the repository root by walking up from the current working directory.
root_folder_name = 'HIS3InterspeciesEpistasis'
root_folder = find_folder_upstream(root_folder_name)
if not root_folder:
    # Stop here with the explanatory message: carrying on would only fail at
    # the first os.path.join below with an unrelated error about joining None.
    raise RuntimeError('Did not find root folder for our github repository.\nPlease run "ks01" notebook from a script inside the HIS3InterspeciesEpistasis folder!')

# Every folder path is joined with a trailing '' so it ends with a path separator.
data_folder = os.path.join(root_folder, 'Data', '')
small_tables_folder = os.path.join(root_folder, 'Data_Small_Tables', '')
analysis_folder = os.path.join(root_folder, 'Analysis', '')
karen_folder = os.path.join(analysis_folder, 'Karen', '')
figures_folder = os.path.join(karen_folder, 'figures', '')
files_dump_folder = os.path.join(karen_folder, 'files_dump', '')
structure_predictions_folder = os.path.join(files_dump_folder, 'structure_predictions', '')
structure_visualizations_folder = os.path.join(files_dump_folder, 'structure_visualizations', '')
# Built from structure_visualizations_folder rather than a hard-coded
# 'structure_visualizations/pymol_sessions/' fragment, so the two stay in sync.
pymol_sessions_folder = os.path.join(structure_visualizations_folder, 'pymol_sessions', '')

### Working with positions

In [18]:
def remove_gaps(seq):
    """Return ``seq`` joined back into a string with every '-' element dropped."""
    kept = []
    for symbol in seq:
        if symbol == '-':
            continue
        kept.append(symbol)
    return ''.join(kept)

def get_wt_position(position_in_alignment):
    """Convert an index into ``aligned_Scer`` to a position in the ungapped sequence.

    ``position_in_alignment`` is used as a 0-based index into the module-level
    ``aligned_Scer`` string.  The result is the number of non-gap characters
    in ``aligned_Scer`` up to and including that index, i.e. a 1-based
    position in the gap-free sequence.  If the index falls on a gap, the
    count of the residues before it is returned.
    """
    prefix = aligned_Scer[:position_in_alignment + 1]
    # Non-gap characters = everything in the prefix minus the '-' characters.
    return len(prefix) - prefix.count('-')

def relative_to_absolute_numbering(mutation, segment, df_with_data):
    """Look up the ``mut_list_Scer`` value recorded for a ``mut_list`` entry.

    ``df_with_data[segment]`` is expected to be a DataFrame with ``mut_list``
    and ``mut_list_Scer`` columns.  The rows whose ``mut_list`` equals
    ``mutation`` are displayed (both columns) and the ``mut_list_Scer`` value
    of the first such row is returned.

    Raises IndexError when no row matches.

    NOTE(review): judging by the names, ``mut_list`` holds segment-relative
    numbering and ``mut_list_Scer`` the S. cerevisiae numbering -- confirm.
    """
    segment_table = df_with_data[segment]
    matching_rows = segment_table[segment_table.mut_list == mutation]
    display(matching_rows[['mut_list', 'mut_list_Scer']])
    return matching_rows.mut_list_Scer.values[0]

In [19]:
# Reference sequence stored as plain text (Uniprot entry P06633, going by the
# file name).  Read inside a `with` block so the file handle is closed promptly.
with open(os.path.join(files_dump_folder, 'HIS3_saccharomyces_cerevisiae_from_Uniprot_P06633.txt')) as uniprot_file:
    Scer_Uniprot = uniprot_file.read().rstrip()

# Pull the aligned (gapped) 'Scer' record out of the FASTA alignment.
alignment_file = os.path.join(small_tables_folder, 'aa_seq.txt')
for seq_record in SeqIO.parse(alignment_file, 'fasta'):
    if seq_record.id == 'Scer':
        aligned_Scer = str(seq_record.seq)
        break
else:
    # Without this, a missing record would leave aligned_Scer undefined, or
    # silently stale from an earlier run of this cell in the same kernel.
    raise ValueError('No record with id "Scer" found in ' + alignment_file)

# Sanity check: removing the gaps from the aligned sequence must give back
# exactly the reference sequence.
assert remove_gaps(aligned_Scer) == Scer_Uniprot

In [20]:
# Load the segment table.  pd.read_table parses the file as tab-separated by
# default.  The file is transposed so that its original columns become rows,
# labelled by the new 'segment' column.
positions = (
    pd.read_table(os.path.join(small_tables_folder, 'positions.csv'))
    .set_index('Unnamed: 0')
    .T
    .reset_index()
    .rename(columns={'index' : 'segment', 'positions':'positions_alignment'})
    .reset_index(drop=True)
)
# The position lists are stored as text; parse them into real Python objects.
positions['positions_alignment'] = positions['positions_alignment'].apply(ast.literal_eval)
# Convert every alignment index into the corresponding ungapped-sequence position.
positions['positions_Uniprot_P06633'] = positions['positions_alignment'].apply(
    lambda alignment_positions: [get_wt_position(p) for p in alignment_positions]
)
segment_names = positions['segment'].values # not explicitly sorted
positions.to_csv(os.path.join(files_dump_folder, 'information_about_segments.csv'), index=False)

Unnamed: 0,segment,wt1,positions_alignment,len1,len2,start1,end1,wt2,start2,end2,start_Scer,end_Scer,positions_Uniprot_P06633
0,S1,EALGAVRGVK,"[140, 141, 142, 143, 144, 145, 146, 147, 148, ...",10,10,140,149,EALSRAVVDL,160,169,106,135,"[106, 107, 108, 109, 110, 111, 112, 113, 114, ..."
1,S2,SNRPYAVVE,"[170, 171, 172, 173, 174, 175, 176, 177, 178, ...",9,9,170,178,LSCEMIPHF,189,197,136,163,"[136, 137, 138, 139, 140, 141, 142, 143, 144, ..."
2,S3,LGLQREKVGD,"[179, 180, 181, 182, 183, 184, 185, 186, 187, ...",10,7,179,188,LESFAEA,198,204,145,170,"[145, 146, 147, 148, 149, 150, 151, 152, 153, ..."


In [30]:
# Map every position in 'positions_Uniprot_P06633' to the name of its segment.
# A position listed under several segments keeps the segment seen last.
position_to_segment = {}
uniprot_positions_by_segment = positions.set_index('segment')['positions_Uniprot_P06633']
for segment in positions.segment:
    for position in uniprot_positions_by_segment.loc[segment]:
        position_to_segment[position] = segment
        
def get_segment(position_or_mutation):
    """Return the segment that a position or a single mutation belongs to.

    Parameters
    ----------
    position_or_mutation : number or string
        Either a position (any real number; it is truncated with ``int()``),
        or a mutation string made of a position followed by exactly one
        trailing character, which is dropped before the position is parsed.

    Returns
    -------
    The value stored in ``position_to_segment`` for that position, or None
    when the argument is neither a real number nor a string.

    Raises
    ------
    KeyError
        If the position is not a key of ``position_to_segment``.
    ValueError
        If the string without its last character is not a valid integer.
    """
    # (str, unicode) on Python 2, (str, str) on Python 3.
    text_types = (type(''), type(u''))
    if isinstance(position_or_mutation, bool):
        # bool is technically an integer type; it was never accepted as a position.
        return None
    # isinstance against numbers.Real (instead of exact type comparison) also
    # accepts numpy scalars such as numpy.int64, which is what positions read
    # back from pandas/numpy containers usually are.
    if isinstance(position_or_mutation, numbers.Real):
        return position_to_segment[int(position_or_mutation)]
    if isinstance(position_or_mutation, text_types):
        return position_to_segment[int(position_or_mutation[:-1])]
    return None
    
def get_segment_of_a_genotype(mut_comb):
    """Return a segment for a genotype given as ':'-separated mutations.

    Each mutation is a position followed by one trailing character; the
    position is looked up with ``get_segment``.

    * A genotype with a single mutation returns that mutation's segment.
    * Otherwise the mutations are scanned in order, skipping any that are
      textually identical to the first one.  The segment of the previously
      examined mutation is tracked, and as soon as a mutation falls in that
      same segment, the segment is returned.
    * If no mutation ever matches the segment of the one examined before it,
      the function returns None.

    NOTE(review): the None fall-through means a multi-mutation genotype whose
    consecutive mutations all alternate between segments gets no segment at
    all -- confirm that this is intended.
    """
    mutations = mut_comb.split(':')
    first_mutation = mutations[0]
    current_segment = get_segment(int(first_mutation[:-1]))
    if len(mutations) == 1:
        return current_segment
    for mutation in mutations:
        if mutation == first_mutation:
            continue
        candidate_segment = get_segment(int(mutation[:-1]))
        if candidate_segment == current_segment:
            return current_segment
        current_segment = candidate_segment
    return None

In [24]:
# Short note for readers of the notebook listing the main names defined above.
# Written in call form, which prints the same text for a single string argument.
print('''
Variables: 

- positions:\t\t\ta pandas.DataFrame with information about position numbers, segments etc
- get_segment_of_a_genotype:\ta function that return the segment of genotype (the most likely segment\n\t\t\t\tbecause sometimes it's not possible to say for sure)
''')


Variables: 

- positions:			a pandas.DataFrame with information about position numbers, segments etc

- get_segment_of_a_genotype:	a function that return the segment of genotype (the most likely segment
				because sometimes it's not possible to say for sure)

