In [1]:
import pandas as pd
import numpy as np

In [2]:
#Read in SIG File for use
sig = pd.read_table("/home/maayanlab/Desktop/Projects/KEA3/Combined Dataset/Combinedsig.txt", header=None)

In [3]:
#Create engine for localhost
from sqlalchemy import create_engine
engine = create_engine('mysql+pymysql://root:systemsbiology@localhost:3306/test')

In [4]:
#Read inn species dataframe for use in program
species_dataframe = pd.read_sql_query('SELECT * FROM species', engine)
species_dataframe.head()

Unnamed: 0,id,species_name
0,1,Homo sapiens
1,2,Mus musculus


In [8]:
def SIG_to_Genes(sig):
    #Add species_fk to all of the interactions
    interaction_dataframe = sig[[0, 5]]
    interaction_dataframe['species'] = [x.split('_')[-1] for x in interaction_dataframe[0]]
    interaction_dataframe.head()
    pairs = interaction_dataframe.merge(species_dataframe, left_on='species', right_on='species_name', how='left')
    pairs.drop('species', axis = 1, inplace = True)
    pairs.drop('species_name', axis =1, inplace = True)
    pairs.columns = ['source', 'target', 'species_id']
    #View Dataframe
    #No need to drop duplicates (no duplicates in this dataframe)
    #Remove indication of species from name of the source
    pairs['source'] = [x.split('_')[:-1] for x in interaction_dataframe[0]]
    pairs['source'] = ['_'.join(x) for x in pairs['source']]
    
    # Need to lowercase and title names of mouse kinases to later
    #prevent id-ing of these kinases to the human equivalent
    for index,rowData in pairs.iterrows():
        if rowData.species_id == 2:
            pairs.source[index] = rowData.source.lower().title()
    pairs.drop_duplicates(inplace=True)
    
    #Create separate dataframes for the source and target genes
    source_genes = pd.DataFrame(dict(gene_symbol = pairs.source, species_fk = pairs.species_id))
    source_genes.drop_duplicates(inplace=True)
    source_genes.reset_index(inplace=True, drop=True)
    target_genes = pd.DataFrame(dict(gene_symbol = pairs.target, species_fk = pairs.species_id))
    target_genes.drop_duplicates(inplace = True)
    target_genes.reset_index(inplace=True, drop=True)
    
    #Concat these genes into a single dataframe
    genes = pd.concat([source_genes, target_genes])
    genes.drop_duplicates(inplace=True)
    
    #Use INSERT_IGNORE to add these genes and targets to the dataframe
    insert_ignore = "INSERT IGNORE INTO genes (gene_symbol, species_fk) VALUES" + ', '.join(['("{gene_symbol}", {species_fk})'.format(**rowData) for index, rowData in genes.iterrows()])
    # Create or add new entries to the genes table using 'insert-ignore' functionality
    engine.execute(insert_ignore)
    genes_df = pd.read_sql_query('SELECT * FROM genes', engine)
    
    #Use genes table to isolate fk of these source and target genes
    #fk will later be used when creating the interaction database
    source_fk = pairs.merge(genes_df, left_on='source', right_on='gene_symbol', how='left')
    source_fk.drop_duplicates(['source', 'target', 'species_id'], inplace = True)
    source_fk.drop(['source', 'target', 'species_id', 'species_fk',
                           'gene_symbol', 'description'], axis=1, inplace=True)
    target_fk = pairs.merge(genes_df, left_on='target', right_on='gene_symbol', how='left')
    target_fk.drop_duplicates(['source','target'], inplace = True)
    target_fk.drop(['source', 'target', 'species_id', 
                           'gene_symbol', 'species_fk', 'description'], axis=1, inplace=True)
       


In [9]:
SIG_to_Genes(sig)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self

In [10]:
resources = pd.read_excel("resources_file.xlsm")

In [11]:
resources.head()

Unnamed: 0,db_name,db_desc,db_url
0,Phospho.ELM,,http://phospho.elm.eu.org/index.html
1,PhosphoSite,,http://www.phosphosite.org/homeAction.action
2,RegPhos,,http://140.138.144.141/~RegPhos/
3,NetworKIN,,http://networkin.info/
4,HPRD,,http://hprd.org/


In [12]:
#ADD SIG_to_GMT conversion for kinases (can be reused for TF-target later)

In [17]:
def SIG_to_GMT(sig):
    #convert a SIG file to a GMT file
    #Create two versions of df 'd1' --> one flipped and one which is in original order
    d1 = {'kinase': sig[0], 'protein': sig[5]}
    df = pd.DataFrame(data = d1)
    df.drop_duplicates(inplace = True)

    df.set_index('kinase', inplace = True)

    gmt = df.groupby('kinase').agg(lambda x: tuple(x))

    # Create column representing counts of protein interactions per protein
    gmt['interactions'] = [int(len(lst)) for protein, lst in gmt['protein'].iteritems()]

    # Sort proteins from max to min according to number of protein iteractions
    gmt.sort_values(by = ['interactions'], ascending= False, inplace=True)

    #Gain indexes of gene sets with 5 or more proteins and less than 2000 protein interactions
    indices = [index for index, rowData in gmt.interactions.iteritems() if rowData >= 5 and rowData < 2000]

    #filter dataframe by these indices
    gmt_2 = gmt.loc[indices]

    #reset index and insert a column for the description
    gmt_2.insert(0, 'Description', 'No Description')
    gmt_2.reset_index(inplace = True)

    #Drop columns not needed in GMT and join all protein interactions by a tab
    gmt_2['merged'] = ['\t'.join(x) for x in gmt_2['protein']]
    gmt_2.drop('protein', axis = 1, inplace = True)
    gmt_2.drop('interactions', axis = 1, inplace = True)

    #create a dictionary and store in it rowData corresponding to each protein
    gmt_d = dict([(key, '') for key in gmt_2.index])

    # loop through rows with iterrows()
    for index, rowData in gmt_2.iterrows():
        line = ('\t'.join(rowData))
        gmt_d[index] = line
        
    #Transfer tab-separated info into a new gmt file
    with open('%s_kinase.gmt' %sig, 'w') as openfile:
        for index in gmt_d:
            openfile.write(str(gmt_d[index]) + '\n')

In [15]:
#ADD SIG_to_GMT conversion for PPI (must be different due to
#non-directional interaction)

In [16]:
def SIG_to_GMT_ppi(sig):
    #convert a SIG file to a GMT file
    #Create two versions of df 'd1' --> one flipped and one which is in original order
    d1 = {'protein_1': sig[0], 'protein_2': sig[5]}
    df = pd.DataFrame(data = d1)
    df.drop_duplicates(inplace = True)

    d2 = {'protein_1': sig[5], 'protein_2': sig[0]}
    df_flipped = pd.DataFrame(data = d2)
    df_flipped.drop_duplicates(inplace = True)

    #Make 'df' and 'df_flipped' into a single frame to account for opposite interactions
    extend = pd.concat([df, df_flipped])

    #Drop any duplicates and set index according to protein 1 so that we can aggregate 
    #all interacting protein 2's by a shared interaction with protein one
    extend.drop_duplicates(inplace = True)
    extend.set_index('protein_1', inplace = True)

    gmt = extend.groupby('protein_1').agg(lambda x: tuple(x))

    # Create column representing counts of protein interactions per protein
    gmt['interactions'] = [int(len(lst)) for protein, lst in gmt['protein_2'].iteritems()]

    # Sort proteins from max to min according to number of protein iteractions
    gmt.sort_values(by = ['interactions'], ascending= False, inplace=True)

    #Gain indexes of gene sets with 5 or more proteins and less than 2000 protein interactions
    indices = [index for index, rowData in gmt.interactions.iteritems() if rowData >= 5 and rowData < 2000]

    #filter dataframe by these indices
    gmt_2 = gmt.loc[indices]

    #reset index and insert a column for the description
    gmt_2.insert(0, 'Description', 'No Description')
    gmt_2.reset_index(inplace = True)

    #Drop columns not needed in GMT and join all protein interactions by a tab
    gmt_2['merged'] = ['\t'.join(x) for x in gmt_2['protein_2']]
    gmt_2.drop('protein_2', axis = 1, inplace = True)
    gmt_2.drop('interactions', axis = 1, inplace = True)

    #create a dictionary and store in it rowData corresponding to each protein
    gmt_d = dict([(key, '') for key in gmt_2.index])

    # loop through rows with iterrows()
    for index, rowData in gmt_2.iterrows():
        line = ('\t'.join(rowData))
        gmt_d[index] = line
        
    #Transfer tab-separated info into a new gmt file
    with open('%s_ppi.gmt' %sig, 'w') as openfile:
        for index in gmt_d:
            openfile.write(str(gmt_d[index]) + '\n')