# SIG to GMT Conversion
This notebook converts .sig files describing protein-protein interactions into the .gmt (gene set) file format. Date processed: July 6, 2017

## Import Scripts Necessary to Run this Notebook

In [1]:
%run /home/maayanlab/Desktop/Projects/KEA3/Scripts/init.ipy

## Read in .SIG files corresponding to Database
View each SIG file to ensure everything was downloaded in the proper file format.

In [26]:
# Load the space-delimited KEGG SIG file; there is no header row, so columns are numbered from 0
KEGG_sig = pd.read_csv("KEGGsig.txt", sep=' ', header=None)

In [27]:
# Preview the first five rows to check that the file was parsed as expected
KEGG_sig.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,SRF,,,,,ELK1,,,,,0,binding,10592173
1,SRF,,,,,ELK4,,,,,0,binding,10592173
2,MKNK2,,,,,ATF4,,,,,0,phosphorylation,10592173
3,MKNK1,,,,,ATF4,,,,,0,phosphorylation,10592173
4,RPS6KA6,,,,,ATF4,,,,,0,phosphorylation,10592173


In [7]:
#Define function 'sig_to_gmt' that converts a SIG file to a GMT file
def sig_to_gmt(sig_df, min_interactions=5, max_interactions=2000):
    """Convert a SIG interaction table into GMT-formatted gene-set lines.

    Every interaction is treated as undirected: for each row, the protein in
    column 5 is added to the set of the protein in column 0 and vice versa.
    Each protein's set lists its distinct partners in order of first
    appearance (original orientation first, then the mirrored one).

    Parameters
    ----------
    sig_df : pandas.DataFrame
        SIG table read without a header, so columns are labelled by integer
        position. Only column 0 and column 5 (the two interacting proteins)
        are used. Rows where either name is missing are ignored, and names
        are converted to ``str``.
    min_interactions : int, optional
        Smallest number of partners a protein must have to be kept
        (inclusive). Defaults to 5.
    max_interactions : int, optional
        Upper bound on the number of partners (exclusive). Defaults to 2000.

    Returns
    -------
    dict
        Maps a 0-based row number to one GMT line (without trailing newline):
        ``<protein>\\tNo Description\\t<partner 1>\\t<partner 2>...``.
        Rows are ordered from most to fewest partners; proteins with the same
        number of partners are ordered alphabetically.
    """
    # Keep only the two protein columns. A missing name cannot be written to
    # the GMT line (joining a NaN with '\t' raises TypeError), so drop such rows.
    pairs = pd.DataFrame({'protein_1': sig_df[0], 'protein_2': sig_df[5]})
    pairs = pairs.dropna().astype(str)

    # Append the mirrored pairs so that each interaction is listed under both
    # of its proteins, then remove repeated (protein, partner) combinations.
    mirrored = pd.DataFrame({'protein_1': pairs['protein_2'],
                             'protein_2': pairs['protein_1']})
    symmetric = pd.concat([pairs, mirrored], ignore_index=True).drop_duplicates()

    # Collect the partners of every protein, preserving row order.
    partners = {}
    for protein, partner in zip(symmetric['protein_1'], symmetric['protein_2']):
        partners.setdefault(protein, []).append(partner)

    # Apply the size filter on an alphabetically sorted list, then sort by set
    # size. list.sort is stable (also with reverse=True), so equally sized
    # sets stay in alphabetical order and the output is deterministic.
    kept = [protein for protein in sorted(partners)
            if min_interactions <= len(partners[protein]) < max_interactions]
    kept.sort(key=lambda protein: len(partners[protein]), reverse=True)

    # One GMT line per protein: set name, description placeholder, members.
    return {row: '\t'.join([protein, 'No Description'] + partners[protein])
            for row, protein in enumerate(kept)}

In [28]:
# Convert the KEGG SIG table into GMT-formatted lines keyed by row number
KEGG_dict = sig_to_gmt(KEGG_sig)

# Write the lines to a new GMT file, one gene set per line
with open('KEGG_ppi.gmt', 'w') as openfile:
    openfile.writelines(str(gmt_line) + '\n' for gmt_line in KEGG_dict.values())