# Searching SQL databases and analyzing results

This notebook searches the created databases and analyses the results. The code here is written to work specifically for the cases presented in the publication. Only databases created with the datasets and code provided together with this notebook will generate the correct result. The code would require modification to work with other datasets and databases.

Rijksuniversiteit Groningen, 2018

c.m.punter@rug.nl - Interfacing with postgreSQL, SQL translator, test, query, stats, conversion of query results, counting unique interactors and pairs, comments.


w.m.smigiel@gmail.com - Counting interactions, checking GeneOntology, detailed statistics per unique interaction, exports, processing with pandas, plotting, comments, markdown, conversion to notebook.

## Imports, connecting to postgreSQL and assigning variables

In [1]:
# importing necessary packages

import psycopg2
import pandas as pd
import os

In [2]:
# username, password and database as defined in the postgreSQL system
# NOTE(review): the credentials are hard-coded; replace them with your own
# and avoid committing a real password together with the notebook.

username = 'postgres'
password = 'password'
database = 'intact_20201208'

# connecting to the database
# the parameters are passed as keyword arguments instead of a hand-formatted
# connection string, so that quotes, backslashes or spaces in any of the
# values cannot corrupt the connection string

conn = psycopg2.connect(user=username, password=password, dbname=database)

In [3]:
# GO ids that annotate a protein as located in the cytoplasm.
# Refer to http://geneontology.org/

cytosol_goid = ['GO:0005737', 'GO:0005829']

# relevant columns of the IntAct dataset

columns = [
    'interactor_a',
    'interactor_b',
    'alt_interactor_a',
    'alt_interactor_b',
    'alias_interactor_a',
    'alias_interactor_b',
    'detection_method',
    'author',
    'publication',
    'taxonomy_a',
    'taxonomy_b',
    'interaction_type',
    'source_database',
    'interaction',
    'confidence_score',
    'xref_interactor_a',
    'xref_interactor_b',
]

# abundance columns as selected from the SQL database
# (without the '_a' / '_b' interactor suffix)

abundance_columns = [
    'sabu_Uniprot',
    'sabu_Description',
    'sabu_Gene',
    'sabu_Peptides',
    'sabu_Confidence_score',
    'weight',
    'sabu_Dataset',
    'sabu_Glycerol_number_of_proteins_per_cell',
    'sabu_Glycerol_fg_protein_per_cell',
    'sabu_Glycerol_coeffcient_of_variance',
    'sabu_Bnumber',
    'sabu_Annotated_functional_COG_groups',
    'sabu_Annotated_functional_COG_group',
    'sabu_Annotated_functional_COG_class',
]

# all columns to be fetched from the SQL database:
# the IntAct columns followed by the abundance columns of interactor a
# and then those of interactor b.
# The list is derived instead of typed out: in the hand-written version a
# missing comma silently fused 'sabu_Annotated_functional_COG_class_a' and
# 'sabu_Uniprot_b' into one bogus column name.

all_columns = (
    columns
    + ['%s_a' % column for column in abundance_columns]
    + ['%s_b' % column for column in abundance_columns]
)

# columns for pandas outputs
# NOTE: 'unique_interactor' appears twice on purpose - the first 18 names
# label one table and the last 3 label another (see the [:-3] / [-3:] slices
# where the DataFrames are built); the shared name is the merge key.
# The misspelt names ('interacions_self', 'sum_abundance_inteactors') are
# referenced as-is elsewhere and end up in the exported files, so they are
# kept unchanged.

columns_output = [
    'unique_interactor',
    'gene_name',
    'interactions_other',
    'interacions_self',
    'interactions_all',
    'abundance',
    'MW',
    'go_cytoplasm',
    'membrane_in_go_all',
    'periplasm_in_go_all',
    'DNA_in_go_all',
    'RNA_in_go_all',
    'ribosome_in_go_all',
    'has_pdb',
    'go_all',
    'sabu_group_letter',
    'sabu_group_description',
    'sabu_COG_class',
    'unique_interactor',
    'all_interactors_abundance',
    'sum_abundance_inteactors',
]

## Functions

In [4]:
# function to assign path to export files

def folder_file(x):
    """Return a path below the current working directory.

    ``x`` is an iterable of path components; they are appended, in order,
    to ``os.getcwd()``.
    """
    base_directory = os.getcwd()
    return os.path.join(base_directory, *x)

# functions for more convenient SQL query syntax

def sql_query(sql_select, sql_where):
    """Build a select statement over the joined interaction tables.

    sql_select -- iterable of column expressions, joined with ', '
    sql_where  -- a single condition string; it is wrapped by
                  ``sql_combine('and', ...)`` before being placed after
                  ``where``

    Returns the statement as a string of the form
    ``select ...\\nfrom ...\\nwhere ...\\n``.
    """
    # interactions -> interaction_identifier -> identifiers, with optional
    # (left outer) database and abundance rows for each identifier
    join_clauses = [
        'interactions inner join interaction_identifier on interactions.id = interaction_identifier.interaction_id ',
        'inner join identifiers on interaction_identifier.identifier_id = identifiers.id ',
        'left outer join databases on identifiers.database_id = databases.id ',
        'left outer join abundance on identifiers.id = abundance.identifier_id',
    ]
    sql_from = '\n\t'.join(join_clauses)

    select_clause = ', '.join(sql_select)
    where_clause = sql_combine('and', sql_where)

    return 'select %s\nfrom %s\nwhere %s\n' % (select_clause, sql_from, where_clause)


def sql_combine(operator, *queries):
    """Join condition strings with an SQL operator inside parentheses.

    operator -- text placed between the conditions (e.g. 'and', 'or')
    queries  -- the condition strings to combine

    Returns ``'True'`` when no conditions are given, so the result is
    always usable as a condition.
    """
    if not queries:
        return 'True'

    # each condition goes on its own tab-indented line
    separator = ' %s\n\t' % operator
    return '(%s)' % separator.join(queries)


def sql_identifier(identifier, condition):
    """Return a condition on the identifier type and its value.

    identifier -- value the ``type`` column must equal
    condition  -- SQL fragment placed after ``identifier``
                  (e.g. "like '%83333%'")
    """
    return "type = '{}' and identifier {}".format(identifier, condition)


def sql_interactions(*conditions):
    """Fetch all identifier rows of the interactions matching every condition.

    Each condition string is turned into a sub-query selecting interaction
    ids; the sub-queries are combined with ``intersect``, so an interaction
    has to satisfy all of them. For the selected interactions every joined
    identifier row is returned, including the abundance columns.

    Returns the list of row tuples from ``cursor.fetchall()``, in the order
    of ``sql_select`` below.
    """
    queries = [sql_query(['interactions.id'], condition) for condition in conditions]

    sql_select = [
        'interactions.id',
        'confidence_score',
        'type',
        'name',
        'identifier',
        'ab',
        'sabu_Uniprot',
        'sabu_Description',
        'sabu_Gene',
        'sabu_Peptides',
        'sabu_Confidence_score',
        'weight',
        'sabu_Dataset',
        'sabu_Glycerol_number_of_proteins_per_cell',
        'sabu_Glycerol_fg_protein_per_cell',
        'sabu_Glycerol_coeffcient_of_variance',
        'sabu_Bnumber',
        'sabu_Annotated_functional_COG_groups',
        'sabu_Annotated_functional_COG_group',
        'sabu_Annotated_functional_COG_class'
    ]

    query = sql_query(sql_select, 'interactions.id in (\n%s)' % 'intersect\n'.join(queries))

    # the context manager closes the cursor even when execute() or
    # fetchall() raises, which the manual cur.close() did not
    with conn.cursor() as cur:
        cur.execute(query)
        interactions = cur.fetchall()

    return interactions

# converting the SQL search results to table format, MITAB 2.7-like

def results_to_table(results):
    """Group the flat SQL rows into one dictionary per interaction.

    results -- rows as returned by ``sql_interactions``: interaction id,
               confidence score, identifier type, database name, identifier,
               the a/b marker and then the abundance values

    Returns ``{interaction id: {column: [values]}}``. Identifier columns
    collect 'database:identifier' strings; when the a/b marker is set the
    column name gets it as a suffix and every non-empty abundance value is
    stored under '<abundance column>_<marker>' as a one-element list.
    The confidence score is not stored by this function.
    """
    table = {}

    for row_id, _score, kind, database, identifier, ab, *abundance_values in results:
        record = table.setdefault(row_id, {})

        if ab:
            kind = '%s_%s' % (kind, ab)

            for column, value in zip(abundance_columns, abundance_values):
                if value:
                    record['%s_%s' % (column, ab)] = [str(value)]

        record.setdefault(kind, []).append('%s:%s' % (database, identifier))

    return table

# converting the SQL search results to a more readable table format for export

def results_to_csv(results, columns=all_columns, delimiter='\t'):
    """Convert the flat SQL rows into a list of rows ready for export.

    results   -- rows as returned by ``sql_interactions``
    columns   -- names and order of the output columns
    delimiter -- accepted but not used by this function

    Returns a list whose first element is ``columns`` itself (the header)
    followed by one list of strings per interaction. Multiple values of a
    column are joined with '|'; missing columns become empty strings.
    """
    records = {}

    for row_id, score, kind, database, identifier, ab, *abundance_values in results:
        record = records.setdefault(row_id, {})

        # identifier columns get the a/b marker as suffix when it is set
        key = '%s_%s' % (kind, ab) if ab else kind
        record.setdefault(key, []).append('%s:%s' % (database, identifier))

        # a falsy score (None or 0) is exported as an empty string
        record['confidence_score'] = ['%f' % score] if score else ['']

        if ab:
            for column, value in zip(abundance_columns, abundance_values):
                if value:
                    record['%s_%s' % (column, ab)] = [str(value)]

    rows = [columns]

    for record in records.values():
        rows.append([
            '|'.join(record[column]) if column in record else ''
            for column in columns
        ])

    return rows

# testing database integrity

def test_integrity():
    """Run two sanity queries against the database and print the outcome.

    Each query is written so that it returns a single boolean; anything
    other than ``True`` prints the failing query text.
    """

    def run_check(query, passed_message):
        # the context manager closes the cursor even when the query fails
        with conn.cursor() as cur:
            cur.execute(query)
            result = cur.fetchone()

        if result[0] is not True:
            print('query was not True : ' + query)
        else:
            print(passed_message)

    # check if each line of abundance data is linked to a single protein
    # the following query should be true
    run_check(
        'select count(distinct identifier_id) = count(identifier_id) '
        'from abundance '
        'where identifier_id is not NULL ',
        'each abundance line is connected to a single protein ID - passed check query',
    )

    # check if all interactions have 2 different interactors
    # and if each interactor has only one unique identifier (should be the case since we only use uniprot data)
    # the max count returned should be 2!
    run_check(
        'select max(count) = 2 '
        'from ( '
        'select count(interaction_id) '
        'from interaction_identifier '
        'left join identifiers on interaction_identifier.identifier_id = identifiers.id '
        "where identifiers.type = 'interactor' "
        'group by interaction_id '
        'order by count desc '
        ') as counts ',
        'passed check query',
    )

# counting unique abundances

def count_uniques(results, *columns):
    """Count how often each combination of values occurs in the SQL rows.

    results -- rows as returned by ``sql_interactions``
    columns -- names of the values forming the key; available names are the
               abundance columns, 'id', 'ab', 'database' and the identifier
               type of the row (which maps to the row's identifier)

    Rows that do not provide every requested name are skipped.
    Returns ``{tuple of values: number of rows}``.
    """
    tallies = {}

    for row_id, _score, kind, database, identifier, ab, *abundance_values in results:
        row = dict(zip(abundance_columns, abundance_values))
        row.update({'id': row_id, 'ab': ab, 'database': database})
        row[kind] = identifier

        # skip rows that lack one of the requested columns
        if any(name not in row for name in columns):
            continue

        key = tuple(row[name] for name in columns)
        tallies[key] = tallies.get(key, 0) + 1

    return tallies

# function to generate short statistics of the query results

def generate_stats(table):
    """Summarise the interactor pairs of a table from ``results_to_table``.

    Only the first name of 'interactor_a' / 'interactor_b' is used; a
    warning is printed for every interaction that lists more than one.

    Pairs are sorted into three groups:
      XX -- both interactors are identical
      XY -- the reversed pair has not been recorded as XY before
      YX -- the reversed pair was already recorded as XY; it is stored in
            that XY orientation, so it does not add a new unique pair

    Returns a list of ``(label, count)`` tuples.
    """
    xx = []
    xy = []
    yx = []
    # mirrors the content of ``xy`` for constant-time membership tests
    # (the list itself was scanned for every interaction before)
    xy_seen = set()
    unique_interactors = set()

    for interaction in table.values():
        interactor_a = interaction['interactor_a'][0]
        interactor_b = interaction['interactor_b'][0]

        # test and warning for interactors with multiple names
        if len(interaction['interactor_a']) > 1 or len(interaction['interactor_b']) > 1:
            print('warning; some interactors have multiple names')

        interactors = (interactor_a, interactor_b)
        reversed_pair = (interactor_b, interactor_a)

        if interactor_a == interactor_b:
            xx.append(interactors)
        elif reversed_pair in xy_seen:
            yx.append(reversed_pair)
        else:
            xy.append(interactors)
            xy_seen.add(interactors)

        unique_interactors.add(interactor_a)
        unique_interactors.add(interactor_b)

    all_pairs = xx + xy + yx

    return [
        ('interactor pairs (interactions)', len(all_pairs)),
        ('unique interactor pairs (interactions)', len(set(all_pairs))),
        ('XX', len(xx)),
        ('unique XX', len(set(xx))),
        ('XY', len(xy)),
        ('unique XY', len(set(xy))),
        ('YX', len(yx)),
        ('unique YX', len(set(yx))),
        ('number of unique interactors', len(unique_interactors))
    ]

# function to count unique interactors and generate a table

def count_unique_interactors(table):
    """Count, for every unique interactor, the interactions it occurs in.

    table -- dictionary from ``results_to_table``; only the first name of
             'interactor_a' / 'interactor_b' is used

    Returns a list of tuples ``(interactor, all, in_x, in_y, in_xx)``:
      all   -- interactions containing the interactor on either side
      in_x  -- interactions with the interactor as interactor a
               (self-interactions are counted here, not in in_y)
      in_y  -- interactions with the interactor as interactor b only
      in_xx -- self-interactions
    The list follows the iteration order of the set of unique interactors.
    """
    all_unique_interactors = set()
    # interactor -> [all, in_x, in_y, in_xx]; filled in a single pass over
    # the table instead of rescanning the table for every interactor
    tallies = {}

    for interaction in table.values():
        interactor_a = interaction['interactor_a'][0]
        interactor_b = interaction['interactor_b'][0]
        all_unique_interactors.add(interactor_a)
        all_unique_interactors.add(interactor_b)

        tally_a = tallies.setdefault(interactor_a, [0, 0, 0, 0])
        tally_a[0] += 1
        tally_a[1] += 1

        if interactor_a == interactor_b:
            tally_a[3] += 1
        else:
            tally_b = tallies.setdefault(interactor_b, [0, 0, 0, 0])
            tally_b[0] += 1
            tally_b[2] += 1

    return [(interactor, *tallies[interactor]) for interactor in all_unique_interactors]

# function to retrieve and count all unique protein pairs

def get_combinations(table):
    """Count the interactions of every unordered pair of interactors.

    table -- dictionary from ``results_to_table``; only the first name of
             'interactor_a' / 'interactor_b' is used

    Returns ``{(interactor, interactor): count}``. A pair is stored in the
    orientation in which it was first seen; the reversed orientation is
    counted towards the same key.
    """
    pair_counts = {}

    for interaction in table.values():
        first = interaction['interactor_a'][0]
        second = interaction['interactor_b'][0]

        key = (first, second)
        # fall back to the reversed orientation only if that one is known
        if key not in pair_counts and (second, first) in pair_counts:
            key = (second, first)

        pair_counts[key] = pair_counts.get(key, 0) + 1

    return pair_counts

# function that counts number of all interactions, self-interactions, particular GO annotations.

def _first_value(interaction, key, default=''):
    """Return the first value stored under ``key``, or ``default`` if absent."""
    values = interaction.get(key)
    return values[0] if values else default


def _side_annotations(interaction, side, cytosol_goid):
    """Collect the annotations of the interactor on one side of an interaction.

    side -- 'a' or 'b'; selects the '<column>_a' / '<column>_b' entries.
    Missing entries leave the corresponding default (0 or '') in place.
    """
    xrefs = interaction.get('xref_interactor_%s' % side, [])

    found = {
        'pdb': int(any('pdb' in xref for xref in xrefs)),
        'go_all': [xref for xref in xrefs if 'go:' in xref],
        'go_cyt': [xref for xref in xrefs
                   if any(goid in xref for goid in cytosol_goid)],
        'abundance': 0,
        'name': '',
        'weight': '',
        'letter': '',
        'group': '',
        'sabuclass': '',
    }

    abundance_key = 'sabu_Glycerol_number_of_proteins_per_cell_%s' % side
    if abundance_key in interaction:
        found['abundance'] = interaction[abundance_key][0]
        found['name'] = _first_value(interaction, 'sabu_Gene_%s' % side)
        found['weight'] = _first_value(interaction, 'weight_%s' % side)

    group_key = 'sabu_Annotated_functional_COG_group_%s' % side
    if group_key in interaction:
        found['letter'] = _first_value(interaction, 'sabu_Annotated_functional_COG_groups_%s' % side)
        found['group'] = interaction[group_key][0]
        found['sabuclass'] = _first_value(interaction, 'sabu_Annotated_functional_COG_class_%s' % side)

    return found


def process_table(table, all_unique_interactors, all_proteins_combinations, cytosol_goid):
    """Build one summary row per unique interactor.

    table                     -- dictionary from ``results_to_table``
    all_unique_interactors    -- sequence of tuples whose first element is
                                 the interactor (``count_unique_interactors``)
    all_proteins_combinations -- dictionary keyed by interactor pairs
                                 (``get_combinations``)
    cytosol_goid              -- GO ids marking a cytoplasmic location

    For every interactor the unique partners (other / self / all) are
    counted from the pair keys. Annotations (xrefs, abundance, gene name,
    weight, COG data) are taken from the FIRST interaction of the table in
    which the interactor occurs, checking side a before side b.

    Returns a list of tuples: (interactor, gene name, interactions with
    others, self-interactions, all interactions, abundance, weight,
    cytoplasm GO terms, membrane, periplasm, DNA, RNA, ribosome, pdb,
    all GO terms, COG letter, COG group, COG class). The keyword flags are
    0/1 and the GO terms are '|'-joined in sorted order, so repeated runs
    give identical strings.
    """
    key_list = list(all_proteins_combinations.keys())
    interactor_unique_interaction_count = []

    for entry in all_unique_interactors:
        unique_interactor = entry[0]

        # unique partners: a pair of identical names is a self-interaction
        counter = 0
        selfint = 0
        for pair in key_list:
            if unique_interactor not in (pair[0], pair[1]):
                continue
            if pair[0] == pair[1]:
                selfint += 1
            else:
                counter += 1
        counter_all = counter + selfint

        # annotations come from the first interaction mentioning the protein
        found = None
        for interaction in table.values():
            if unique_interactor == interaction['interactor_a'][0]:
                found = _side_annotations(interaction, 'a', cytosol_goid)
                break
            if unique_interactor == interaction['interactor_b'][0]:
                found = _side_annotations(interaction, 'b', cytosol_goid)
                break
        if found is None:
            # interactor absent from the table: keep all defaults
            found = _side_annotations({}, 'a', cytosol_goid)

        # keyword search in the (upper-cased) GO cross references
        go_upper = [go.upper() for go in found['go_all']]
        membrane = int(any('MEMBRANE' in go for go in go_upper))
        periplasm = int(any('PERIPLASM' in go for go in go_upper))
        rna = int(any('RNA' in go for go in go_upper))
        dna = int(any('DNA' in go for go in go_upper))
        ribosome = int(any('RIBOSOM' in go for go in go_upper))

        go_str_all = '|'.join(sorted(set(found['go_all'])))
        go_str = '|'.join(sorted(set(found['go_cyt'])))

        interactor_unique_interaction_count.append((
            unique_interactor, found['name'], counter, selfint, counter_all,
            found['abundance'], found['weight'], go_str,
            membrane, periplasm, dna, rna, ribosome, found['pdb'],
            go_str_all, found['letter'], found['group'], found['sabuclass'],
        ))

    return interactor_unique_interaction_count

# this function returns the sum of abundances of all interacting partners

def sum_partners(all_unique_interactors, all_proteins_combinations, interactor_unique_interaction_count):
    """Sum the abundances of the interaction partners of every interactor.

    all_unique_interactors              -- sequence of tuples whose first
                                           element is the interactor
    all_proteins_combinations           -- dictionary keyed by interactor pairs
    interactor_unique_interaction_count -- rows from ``process_table``; the
                                           interactor is read from position 0
                                           and the abundance from position 5

    Returns a list of tuples ``(interactor, partners, sum)`` where
    ``partners`` is a ';'-joined list of 'partner:abundance' strings.
    An abundance equal to '' contributes NaN to the sum; self-interactions
    are not counted as partners.
    """
    pair_keys = list(all_proteins_combinations.keys())
    summary = []

    for record in list(all_unique_interactors):
        interactor = record[0]

        abundances = []
        labels = []

        for pair in pair_keys:
            # the partner is whichever side of the pair is not the interactor
            partner_ids = []
            if interactor == pair[0] and interactor != pair[1]:
                partner_ids.append(pair[1])
            if interactor == pair[1] and interactor != pair[0]:
                partner_ids.append(pair[0])

            for partner in partner_ids:
                for line in interactor_unique_interaction_count:
                    if partner != line[0]:
                        continue

                    abundance = line[5]
                    abundances.append(int(abundance) if abundance != '' else float('nan'))
                    labels.append('%s:%s' % (line[0], abundance))

        summary.append((interactor, ';'.join(labels), sum(abundances)))

    return summary

## Testing the database integrity

In [5]:
test_integrity()

each abundance line is connected to a single protein ID - passed check query
passed check query


## Retrieving proteins with abundance above 1000 copies/cell

Here we filter the data for all _E. coli_ proteins with abundance above 1000 copies per cell. 

The database is taken from Schmidt et al. 2015, supporting information. We manually transferred relevant columns from the provided .xlsx file to a text-based table. 

See the publication text for more details.

In [6]:
# importing the Schmidt et al. database

sabu = pd.read_table('sabu.txt')

# changing column names to fit the output from the SQL queries
# (the two lists are paired by position: sabu_columns[i] -> out_columns[i])

sabu_columns = [
    'sabu_Uniprot Accession',
    'sabu_Gene',
    'sabu_Molecular weight (Da)',
    'sabu_Glycerol_number_of_proteins_per_cell',
    'sabu_Annotated functional COG groups (letter)',
    'sabu_Annotated functional COG group (description)',
    'sabu_Annotated functional COG class'
]

out_columns = [
    'unique_interactor',
    'gene_name',
    'MW',
    'abundance', 
    'sabu_group_letter',
    'sabu_group_description',
    'sabu_COG_class'
]

column_renames = dict(zip(sabu_columns, out_columns))
sabu = sabu.rename(columns=column_renames)

# get all rows of proteins with an abundance of at least 1000 copies/cell

is_abundant = sabu['abundance'] >= 1000
sabu_above_1000 = sabu.loc[is_abundant]

## Query - interactions of proteins above 1000 copies/cell

Here we are interested in retrieving interactome data for all _E. coli_ proteins with abundance above 1000 copies per cell. The list of UniProt IDs of such proteins was retrieved in the section above. We iterate through the list and query the postgreSQL database for all of the interactions of each protein of interest.

The query takes data from entries that:
* have both interactors tagged with _E. coli_ taxonomy ID
* are described by interaction type physical interaction
* have at least one interactor whose UniProt ID matches that of the protein of interest

In [None]:
def individual_protein_query(uid):
    """Fetch the interactions of one protein.

    uid -- identifier pattern of the protein; it is placed unescaped inside
           an SQL ``like '...'`` clause, so it must come from a trusted
           source

    An interaction is selected when all three filters match:
      * an interactor (a or b) has a taxonomy identifier containing 83333
      * the interaction type identifier contains 0914 or 0915
      * an interactor (a or b) has an identifier matching ``uid``

    Returns the rows from ``sql_interactions``.
    """
    sides = ("ab = 'a'", "ab = 'b'")

    taxonomy_filter = sql_combine(
        'or',
        *(sql_combine('and', sql_identifier('taxonomy', "like '%83333%'"), side) for side in sides)
    )

    interaction_type_filter = sql_combine(
        'or',
        sql_identifier('interaction_type', "like '%0914%'"),
        sql_identifier('interaction_type', "like '%0915%'"),
    )

    interactor_filter = sql_combine(
        'or',
        *(sql_combine('and', sql_identifier('interactor', f"like '{uid}'"), side) for side in sides)
    )

    return sql_interactions(taxonomy_filter, interaction_type_filter, interactor_filter)

# Pipeline for all proteins in `sabu_above_1000`: query the database for each
# protein, write the intermediate tables to a per-protein folder and collect
# the summary row of the searched protein in `out` (interactions found) or
# `no_interactions` (none found).

# creating empty dataframes for outputs

out = pd.DataFrame([])
no_interactions = pd.DataFrame([])  

# progress counter, only used for the status line printed in the loop
counter = 0
total_searches = len(sabu_above_1000['unique_interactor'])

for uid, name in list(zip(sabu_above_1000['unique_interactor'], sabu_above_1000['gene_name'])):
    
    counter += 1
    
    # end='\r' rewrites the same console line on every iteration
    print(uid, name, f'{counter} done out of {total_searches}', end='\r')
    
    # create a folder for each protein individually
    # NOTE(review): `name` is used as a folder name as-is; the loop over all
    # proteins further down replaces a missing gene name by the UniProt id,
    # this loop does not - confirm every row here has a gene name
    
    os.makedirs(folder_file(('individual_protein_results', name)), exist_ok=True)
    
    # for a short description of what the functions are doing refer to the section above.
    
    results = individual_protein_query(uid)
    table = results_to_table(results)
    csv_output = results_to_csv(results)
    stats = generate_stats(table)
    all_unique_interactors = count_unique_interactors(table)
    all_proteins_combinations = get_combinations(table)
    interactor_unique_interaction_count = process_table(table, all_unique_interactors, all_proteins_combinations, cytosol_goid)
    interactor_unique_interaction_sums = sum_partners(all_unique_interactors, all_proteins_combinations, interactor_unique_interaction_count)
    
    # outputs of the 'process_table' and 'sum_partners' functions are merged into a complete table
    # (the first 18 names of columns_output label the 'process_table' rows,
    # the last 3 label the 'sum_partners' rows)
    
    df_interactor_unique_interaction_count = pd.DataFrame(interactor_unique_interaction_count,
                 columns=columns_output[:-3]
                )

    df_interactor_unique_interaction_sums = pd.DataFrame(interactor_unique_interaction_sums, 
                 columns=columns_output[-3:]
                )

    merged_interactor_unique_interaction = df_interactor_unique_interaction_count.merge(
        df_interactor_unique_interaction_sums,
        left_on='unique_interactor', right_on='unique_interactor'
    )
    
    # calculating loneliness = abundance of a protein divided by 
    # the sum of abundances of all its interactors

    merged_interactor_unique_interaction['loneliness'] = merged_interactor_unique_interaction['abundance'].astype(float) / merged_interactor_unique_interaction['sum_abundance_inteactors']
    
    # appending line of the search protein to output
    # depending on whether any interactions were found
    # The row of the searched protein is the one whose identifier part after
    # the first ':' equals `uid`.
    # NOTE(review): the KeyError branch is the "no interactions" case -
    # presumably the split of an empty table yields no column 1; confirm
    # with the pandas version in use
    
    try:
        out = pd.concat([out, merged_interactor_unique_interaction.loc[merged_interactor_unique_interaction['unique_interactor'].str.split(':', n=1, expand=True)[1] == uid]])
    except KeyError:
        no_interactions = pd.concat([no_interactions, sabu_above_1000.loc[sabu_above_1000['unique_interactor'] == uid]])
        
        print(uid, name, 'no interactions found.')
    
    # export to csv - intermediate search steps for each protein
    
    # raw query results; the first row of csv_output holds the column names
    pd.DataFrame(csv_output[1:],columns=csv_output[0]).to_csv(folder_file(('individual_protein_results', name, f'{name}_query_results.csv')))
    
    pd.DataFrame(stats).to_csv(folder_file(('individual_protein_results', name, f'{name}_query_results_statistics.csv')))
    
    # one row per unique pair: both interactors followed by the number of interactions
    pd.DataFrame([(*ids, number) for ids, number in zip(all_proteins_combinations.keys(), all_proteins_combinations.values())]).to_csv(
        folder_file(('individual_protein_results', name, f'{name}_unique_protein_pairs.csv')))
    
    merged_interactor_unique_interaction.to_csv(folder_file(('individual_protein_results', name, f'{name}_unique_interactions_per_protein_loneliness_q.csv')))
    
# export to csv - data of proteins with abundance above 1000 without any interactions found

no_interactions.to_csv('no_interactions_found.csv')

# calculating quantiles of selected columns
# this creates easy to assess, sortable columns, where outliers and average proteins
# can be easily selected
# (bins are labelled 1..10 for deciles and 1..5 for quintiles)

out['MW_decile'] = pd.qcut(out['MW'].astype(float), 10, labels=range(1, 11))

out['interactions_other_quintile'] = pd.qcut(out['interactions_other'], 5, labels=range(1,6))

out['abundance_decile'] = pd.qcut(out['abundance'].astype(float), 10, labels=range(1, 11))

out['loneliness_decile'] = pd.qcut(out['loneliness'].astype(float), 10, labels=range(1, 11))

# export to csv - data of proteins with abundance above 1000 with interactions found

out.sort_values('loneliness').to_csv('interactions_found.csv')

### Manually selected data

Based on the abundance, molecular weight and loneliness quantiles we manually picked a set of proteins that represent extreme and average cases. We produced a set of proteins varied in terms of all those parameters. Additionally, we checked for availability of the C-terminus for tagging based on available structures, available knock-outs and the oligomeric state.

Note that _ppc_, _aceB_ and _sucC_ were excluded from the experimental analysis.
* _ppc_ failed to produce an expressing clone.
* _aceB_ C-terminus is used for activity. We attempted to tag the N-terminus of the protein, yet that failed to produce an expressing clone.
* _sucC_ forms a heterotetrameric complex with its interactor, _sucD_. A dual-expression system would be required for this protein to exist close to its native state. That would create a significant difference in the cell state between this protein and others, so we decided not to include it in the study.

We were unable to acquire data for the following proteins due to aggregation:
* _metK_

For more details, see the publication text.

In [10]:
# the third position in each tuple is the oligomeric state of the product of each gene
# as followed from UniProt
# 2.5 for sucC is put in as a placeholder for displaying on a graph
# sucC is heterodimeric with sucD

manually_picked_proteins = [
    ('P0A9G6', 'aceA', 4),
    ('P00934', 'thrC', 1),
    ('P18843', 'nadE', 2),
    ('P0A836', 'sucC', 2.5),
    ('P0A9K9', 'slyD', 1),
    ('P0C0L2', 'osmC', 2),
    ('P0AC62', 'grxC', 1),
    ('P05793', 'ilvC', 4),
    ('P00864', 'ppc', 4),
    ('P08997', 'aceB', 1),
    ('P0A6A8', 'acpP', 1),
    ('P0ACC3', 'erpA', 2),
    ('P0A763', 'ndk', 4),
    ('P0AA25', 'trxA', 1),
    ('P07813', 'leuS', 1),
    ('P30125', 'leuB', 2),
    ('P0A817', 'metK', 4),
    ('P08200', 'icd', 2)
]

# columns of the resulting frame: 0 = UniProt id, 1 = gene name, 2 = oligomeric state

manually_picked_proteins = pd.DataFrame(manually_picked_proteins)

# creating indicators for proteins that weren't measured with microscopy

no_expression = ['sucC', 'ppc', 'aceB']

aggregating = ['metK']

# selecting out the manually picked proteins
# (matched on the identifier part after the first ':' of 'unique_interactor')

picked_out = out.loc[out['unique_interactor'].str.split(':', n=1, expand=True)[1].isin(manually_picked_proteins[0])]

# adding oligomeric state data

picked_out = picked_out.merge( manually_picked_proteins[[1, 2]], left_on='gene_name', right_on=1).rename(columns={2 : 'oligomeric_state'}).drop(1, axis=1)

# creating aggregation/failure to express columns

# has_construct = 1 for every protein that is NOT in the no_expression list

picked_out['has_construct'] = 0
picked_out.loc[~picked_out['gene_name'].isin(no_expression), 'has_construct'] = 1

# aggregates = 1 for every protein that IS in the aggregating list
# (the selection used to be negated, which flagged every protein except
# the aggregating one)

picked_out['aggregates'] = 0
picked_out.loc[picked_out['gene_name'].isin(aggregating), 'aggregates'] = 1


# export to csv - data of manually picked proteins

picked_out.to_csv('manually_selected.csv')

In [None]:
def all_proteins_query(uid_all):
    """Fetch the interactions of one protein (used by the all-proteins loop).

    uid_all -- identifier pattern of the protein; it is placed unescaped
               inside an SQL ``like '...'`` clause, so it must come from a
               trusted source

    The three conditions passed to ``sql_interactions`` all have to match:
    a taxonomy identifier containing 83333 on interactor a or b, an
    interaction type identifier containing 0914 or 0915, and an interactor
    identifier (a or b) matching ``uid_all``.

    Returns the rows from ``sql_interactions``.
    """

    def on_either_side(condition):
        # the condition may hold for interactor a or for interactor b
        return sql_combine(
            'or',
            sql_combine('and', condition, "ab = 'a'"),
            sql_combine('and', condition, "ab = 'b'"),
        )

    is_ecoli = on_either_side(sql_identifier('taxonomy', "like '%83333%'"))

    is_selected_type = sql_combine(
        'or',
        sql_identifier('interaction_type', "like '%0914%'"),
        sql_identifier('interaction_type', "like '%0915%'"),
    )

    involves_protein = on_either_side(sql_identifier('interactor', f"like '{uid_all}'"))

    results_all = sql_interactions(is_ecoli, is_selected_type, involves_protein)
    return results_all

# Pipeline for ALL proteins in `sabu`: query the database for each protein,
# write the intermediate tables to a per-protein folder and collect the
# summary row of the searched protein in `out_all` (interactions found) or
# `no_interactions_all` (none found).

# creating empty dataframes for outputs

out_all = pd.DataFrame([])
no_interactions_all = pd.DataFrame([])

total_searches_all = len(sabu)

for i in sabu.index:

    uid_all, name_all = sabu.loc[i, ['unique_interactor', 'gene_name']]

    # a gene name that is a float (presumably NaN for a missing name) cannot
    # be used in a folder name; fall back to the UniProt id
    if isinstance(name_all, float):
        name_all = uid_all

    # create a folder for each protein individually

    os.makedirs(folder_file(('all_proteins_results', f'{i}_{name_all}')), exist_ok=True)

    # for a short description of what the functions are doing refer to the section above.

    results_all = all_proteins_query(uid_all)
    table_all = results_to_table(results_all)
    csv_output_all = results_to_csv(results_all)
    stats_all = generate_stats(table_all)
    all_unique_interactors_all = count_unique_interactors(table_all)
    all_proteins_combinations_all = get_combinations(table_all)
    interactor_unique_interaction_count_all = process_table(table_all, all_unique_interactors_all, all_proteins_combinations_all, cytosol_goid)
    interactor_unique_interaction_sums_all = sum_partners(all_unique_interactors_all, all_proteins_combinations_all, interactor_unique_interaction_count_all)

    # outputs of the 'process_table' and 'sum_partners' functions are merged into a complete table

    df_interactor_unique_interaction_count_all = pd.DataFrame(
        interactor_unique_interaction_count_all,
        columns=columns_output[:-3]
    )

    df_interactor_unique_interaction_sums_all = pd.DataFrame(
        interactor_unique_interaction_sums_all,
        columns=columns_output[-3:]
    )

    # from here on the name refers to the merged table

    interactor_unique_interaction_count_all = df_interactor_unique_interaction_count_all.merge(
        df_interactor_unique_interaction_sums_all,
        left_on='unique_interactor', right_on='unique_interactor'
    )

    # calculating loneliness = abundance of a protein divided by
    # the sum of abundances of all its interactors

    interactor_unique_interaction_count_all['loneliness'] = interactor_unique_interaction_count_all['abundance'].astype(float) / interactor_unique_interaction_count_all['sum_abundance_inteactors']

    # appending line of the search protein to output
    # depending on whether any interactions were found
    # NOTE(review): the KeyError branch is the "no interactions" case -
    # presumably the split of an empty table yields no column 1; confirm
    # with the pandas version in use

    try:
        searched_accession = interactor_unique_interaction_count_all['unique_interactor'].str.split(':', n=1, expand=True)[1]
    except KeyError:
        # the row is looked up in `sabu`, the table this loop iterates over;
        # looking it up in `sabu_above_1000` silently dropped every protein
        # below 1000 copies/cell from the no-interactions export
        no_interactions_all = pd.concat([no_interactions_all, sabu.loc[sabu['unique_interactor'] == uid_all]])

        print(uid_all, name_all, 'no interactions found.')
    else:
        out_all = pd.concat([out_all, interactor_unique_interaction_count_all.loc[searched_accession == uid_all]])

    # export to csv - intermediate search steps for each protein

    pd.DataFrame(csv_output_all[1:],columns=csv_output_all[0]).to_csv(folder_file(('all_proteins_results', f'{i}_{name_all}', f'{name_all}_query_results.csv')))

    pd.DataFrame(stats_all).to_csv(folder_file(('all_proteins_results', f'{i}_{name_all}', f'{name_all}_query_results_statistics.csv')))

    pd.DataFrame([(*ids, number) for ids, number in zip(all_proteins_combinations_all.keys(), all_proteins_combinations_all.values())]).to_csv(
        folder_file(('all_proteins_results', f'{i}_{name_all}', f'{name_all}_unique_protein_pairs.csv')))

    interactor_unique_interaction_count_all.to_csv(folder_file(('all_proteins_results', f'{i}_{name_all}', f'{name_all}_unique_interactions_per_protein_loneliness_q.csv')))

    # NOTE(review): `i + 1` assumes the index of `sabu` counts up from 0
    print(uid_all, name_all, f'{i + 1} done out of {total_searches_all}', end='\r')

# export to csv - data of all proteins without any interactions found

no_interactions_all.to_csv('no_interactions_found_all.csv')

# calculating quantiles of selected columns
# this creates easy to assess, sortable columns, where outliers and average proteins
# can be easily selected

out_all['MW_decile'] = pd.qcut(out_all['MW'].astype(float), 10, labels=range(1, 11))

out_all['interactions_other_quintile'] = pd.qcut(out_all['interactions_other'], 5, labels=range(1,6))

out_all['abundance_decile'] = pd.qcut(out_all['abundance'].astype(float), 10, labels=range(1, 11))

out_all['loneliness_decile'] = pd.qcut(out_all['loneliness'].astype(float), 10, labels=range(1, 11))

# export to csv - data of all proteins with interactions found

out_all.sort_values('loneliness').to_csv('interactions_found_all.csv')

P0ADB7 ecnB no interactions found.
P0AF96 tabA no interactions found.
P09169 ompT no interactions found.
P0AAX6 mcbA no interactions found.
P63224 gmhA no interactions found.
P16700 cysP 261 done out of 2359