In [2]:
import pandas as pd
import os
from Bio import SeqIO
import ast

In [3]:
def find_folder_upstream(folder_name, max_iterations=50):
    """Walk up from the current working directory to a folder named ``folder_name``.

    Parameters
    ----------
    folder_name : str
        Base name (not a full path) of the folder to look for.
    max_iterations : int
        Maximum number of parent-directory steps to take from the current
        working directory.

    Returns
    -------
    str or None
        Path of the first ancestor (the working directory itself included)
        whose base name equals ``folder_name``, or ``None`` when no such
        folder is reached within ``max_iterations`` steps.
    """
    current_folder = os.getcwd()
    # max_iterations upward steps means max_iterations + 1 folders to inspect
    # (the starting folder plus one per step). Checking the name before
    # stepping makes a match on the very last allowed step count as found.
    for _ in range(max_iterations + 1):
        if os.path.basename(current_folder) == folder_name:
            return current_folder
        parent_folder = os.path.dirname(current_folder)
        if parent_folder == current_folder:
            # dirname() no longer changes the path: we are at the filesystem
            # root, so there is nothing left to search.
            return None
        current_folder = parent_folder
    return None

In [4]:
# Locate the repository root by walking up from the working directory.
root_folder_name = 'HIS3InterspeciesEpistasis'
root_folder = find_folder_upstream(root_folder_name)
if not root_folder:
    # Every path below is derived from root_folder, so continuing with None
    # would only fail one line later with an unrelated error from os.path.join.
    raise RuntimeError('Did not find root folder for our github repository.\nPlease run "ks01" notebook from a script inside the HIS3InterspeciesEpistasis folder!')

# The trailing '' makes os.path.join end each folder path with a separator,
# so file names can be appended to these strings directly.
data_folder = os.path.join(root_folder, 'Data', '')
small_tables_folder = os.path.join(root_folder, 'Data_Small_Tables', '')
analysis_folder = os.path.join(root_folder, 'Analysis', '')
karen_folder = os.path.join(analysis_folder, 'Karen', '')
figures_folder = os.path.join(karen_folder, 'figures', '')
files_dump_folder = os.path.join(karen_folder, 'files_dump', '')
structure_predictions_folder = os.path.join(files_dump_folder, 'structure_predictions', '')
structure_visualizations_folder = os.path.join(files_dump_folder, 'structure_visualizations', '')
# Built from structure_visualizations_folder instead of a hard-coded '/' so the
# separator is always the platform's own.
pymol_sessions_folder = os.path.join(structure_visualizations_folder, 'pymol_sessions', '')

### Working with positions

In [5]:
def remove_gaps(seq):
    """Return the characters of ``seq`` joined into one string, skipping every '-'."""
    kept = []
    for symbol in seq:
        if symbol != '-':
            kept.append(symbol)
    return ''.join(kept)

def get_wt_position(position_in_alignment):
    """Convert a 0-based index into ``aligned_Scer`` to a gap-free residue count.

    The result is the number of non-gap characters in ``aligned_Scer`` from its
    start up to and including index ``position_in_alignment``.
    """
    alignment_prefix = aligned_Scer[:position_in_alignment + 1]
    ungapped_prefix = remove_gaps(alignment_prefix)
    return len(ungapped_prefix)

def relative_to_absolute_numbering(mutation, segment, df_with_data):
    """Look up the ``mut_list_Scer`` value recorded for ``mutation`` in one segment.

    ``df_with_data[segment]`` must be a table with ``mut_list`` and
    ``mut_list_Scer`` columns. The rows whose ``mut_list`` equals ``mutation``
    are displayed (those two columns only), and the ``mut_list_Scer`` value of
    the first such row is returned. Raises IndexError when no row matches.
    """
    segment_table = df_with_data[segment]
    matching_rows = segment_table[segment_table.mut_list == mutation]
    display(matching_rows[['mut_list', 'mut_list_Scer']])
    return matching_rows.mut_list_Scer.values[0]

In [6]:
# Reference S. cerevisiae HIS3 sequence (Uniprot P06633), read as plain text.
# `with` closes the file handle as soon as the sequence has been read.
with open(os.path.join(files_dump_folder, 'HIS3_saccharomyces_cerevisiae_from_Uniprot_P06633.txt')) as uniprot_file:
    Scer_Uniprot = uniprot_file.read().rstrip()

# Pull the 'Scer' row out of the multiple alignment (FASTA format).
alignment_file = os.path.join(small_tables_folder, 'aa_seq.txt')
# Reset first so a value left over from an earlier run of this cell can never
# be mistaken for a freshly parsed sequence.
aligned_Scer = None
for seq_record in SeqIO.parse(alignment_file, 'fasta'):
    if seq_record.id == 'Scer':
        aligned_Scer = str(seq_record.seq)
        break
if aligned_Scer is None:
    raise ValueError('No record with id "Scer" found in ' + alignment_file)
# The aligned sequence, once gaps are removed, must be exactly the Uniprot
# sequence; get_wt_position relies on this to number positions.
assert remove_gaps(aligned_Scer) == Scer_Uniprot

In [7]:
# Load the segment table and flip it so that every row describes one segment:
# after the transpose the original column headers end up in the 'segment' column.
positions = (
    pd.read_table(os.path.join(small_tables_folder, 'positions.csv'))
    .set_index('Unnamed: 0')
    .transpose()
    .reset_index()
    .rename(columns={'index' : 'segment', 'positions':'positions_alignment'})
    .reset_index(drop=True)
)
# The alignment positions are stored as the text of a Python list; parse them.
positions['positions_alignment'] = positions['positions_alignment'].apply(ast.literal_eval)
# Translate every alignment index into Uniprot P06633 numbering.
positions['positions_Uniprot_P06633'] = positions['positions_alignment'].apply(
    lambda alignment_indices: [get_wt_position(index) for index in alignment_indices]
)
segment_names = positions.segment.values # not explicitly sorted
positions.to_csv(os.path.join(files_dump_folder, 'information_about_segments.csv'), index=False)

In [9]:
# Show the `wt1` / `wt2` columns next to the Uniprot-numbered positions of each segment.
positions.loc[:, ['wt1', 'wt2', 'positions_Uniprot_P06633']]

Unnamed: 0,wt1,wt2,positions_Uniprot_P06633
0,EALGAVRGVK,EALSRAVVDL,"[106, 107, 108, 109, 110, 111, 112, 113, 114, ..."
1,SNRPYAVVE,LSCEMIPHF,"[136, 137, 138, 139, 140, 141, 142, 143, 144, ..."
2,LGLQREKVGD,LESFAEA,"[145, 146, 147, 148, 149, 150, 151, 152, 153, ..."
3,SRITLHVDCL,SAFKALAVAI,"[171, 172, 173, 174, 175, 176, 177, 178, 179, ..."
4,RGKNDHHRSE,REATSPNGTND,"[181, 182, 183, 184, 185, 186, 187, 188, 189, ..."
5,CGIALGQAFK,RFGSGFAPLD,"[96, 97, 98, 99, 100, 101, 102, 103, 104, 105,..."
6,IHALAKHSGW,HIDDHHTTED,"[66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 86, 8..."
7,HTGIGFLDHM,SLIVECIGDL,"[56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 76, 7..."
8,PEKEAE,TQSQVINV,"[36, 37, 38, 39, 40, 41, 48, 49, 50, 51, 52, 5..."
9,IEHSIF,AVAEQA,"[30, 31, 32, 33, 34, 35, 42, 43, 44, 45, 46, 47]"


In [10]:
# Map every Uniprot (P06633) position to the name of the segment that lists it.
# If a position is listed for more than one segment, the segment that comes
# later in `positions` wins.
position_to_segment = {}
# Walk the two columns in parallel instead of re-indexing the whole frame by
# segment on every iteration.
for segment, segment_positions in zip(positions.segment, positions['positions_Uniprot_P06633']):
    for position in segment_positions:
        position_to_segment[position] = segment
        
def get_segment(position_or_mutation):
    """Return the segment that a position or a single mutation belongs to.

    Parameters
    ----------
    position_or_mutation : number or str
        Either a position (any real number, including numpy scalars; it is
        truncated with ``int``) or a mutation string made of the position
        followed by exactly one trailing character, which is ignored.

    Returns
    -------
    The ``position_to_segment`` entry for that position, or ``None`` when the
    argument is neither a number nor a string.

    Raises
    ------
    KeyError
        If the position is not present in ``position_to_segment``.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    import numbers

    # isinstance against the numeric ABC also accepts numpy integer/float
    # scalars, which an exact type(...) == int comparison rejects.
    if isinstance(position_or_mutation, numbers.Real):
        return position_to_segment[int(position_or_mutation)]

    try:
        text_types = (basestring,)  # Python 2: covers both str and unicode
    except NameError:
        text_types = (str,)
    if isinstance(position_or_mutation, text_types):
        # Drop the last character, keep the leading position digits.
        return position_to_segment[int(position_or_mutation[:-1])]

    return None
    
def get_segment_of_a_genotype(mut_comb):
    """Guess the segment of a genotype given as ':'-separated mutations.

    Each mutation is a position followed by one trailing character. A genotype
    with a single mutation gets that mutation's segment. Otherwise the
    mutations are scanned in order (skipping any equal to the first one) and
    the function returns the first segment that matches the segment of the
    previously scanned mutation. If no two consecutive mutations agree, the
    result is ``None``.
    """
    mutations = mut_comb.split(':')
    first_mutation = mutations[0]
    current_segment = get_segment(int(first_mutation[:-1]))
    if len(mutations) == 1:
        return current_segment

    for mutation in mutations:
        if mutation == first_mutation:
            continue
        candidate_segment = get_segment(int(mutation[:-1]))
        if candidate_segment == current_segment:
            return current_segment
        # Segment changed: remember it and compare against the next mutation.
        current_segment = candidate_segment
    return None

In [24]:
# Summary of the names this notebook makes available after it has been run.
print('''
Variables: 

- positions:\t\t\ta pandas.DataFrame with information about position numbers, segments etc
- get_segment_of_a_genotype:\ta function that return the segment of genotype (the most likely segment\n\t\t\t\tbecause sometimes it's not possible to say for sure)
''')


Variables: 

- positions:			a pandas.DataFrame with information about position numbers, segments etc

- get_segment_of_a_genotype:	a function that return the segment of genotype (the most likely segment
				because sometimes it's not possible to say for sure)

