# SIG to GMT Conversion
This notebook converts .sig files describing protein-protein interactions into the .gmt file format. Date processed: July 6, 2017

## Import Scripts Necessary to Run this Notebook

In [26]:
%run /home/maayanlab/Desktop/Projects/KEA3/Scripts/init.ipy

## Read in .SIG files corresponding to Database
View each SIG file to ensure everything was downloaded in the proper file format.

In [27]:
# Load the space-delimited SIG file; it has no header row, so columns are numbered 0..N
humap_sig = pd.read_csv("huMAP.sig", sep=' ', header=None)

In [28]:
# Preview the first 30 rows of the raw SIG table
humap_sig.iloc[:30]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,DBN1,,,,,ACTG1,,,,,0,Unknown,12345678
1,EIF2S1,,,,,EBNA1BP2,,,,,0,Unknown,12345678
2,HNRNPU,,,,,HSP90AB1,,,,,0,Unknown,12345678
3,GIGYF1,,,,,ZNF140,,,,,0,Unknown,12345678
4,TRAC,,,,,AK6,,,,,0,Unknown,12345678
5,TEAD1,,,,,VGLL4,,,,,0,Unknown,12345678
6,CERCAM,,,,,KDELC1,,,,,0,Unknown,12345678
7,TPR,,,,,PRPF40A,,,,,0,Unknown,12345678
8,SYN2,,,,,VSNL1,,,,,0,Unknown,12345678
9,PSMA8,,,,,PSMD14,,,,,0,Unknown,12345678


In [29]:
# Convert the SIG edge list into GMT lines: one gene set per protein, holding
# every protein it is paired with in either direction.

# Gene-set size limits: keep sets with at least MIN_INTERACTIONS partners and
# strictly fewer than MAX_INTERACTIONS partners.
MIN_INTERACTIONS = 5
MAX_INTERACTIONS = 2000

# Pairs in their original orientation (SIG columns 0 and 5)
d1 = {'protein_1': humap_sig[0], 'protein_2': humap_sig[5]}
df = pd.DataFrame(data=d1).drop_duplicates()

# The same pairs with the two proteins swapped
d2 = {'protein_1': humap_sig[5], 'protein_2': humap_sig[0]}
df_flipped = pd.DataFrame(data=d2).drop_duplicates()

# Stack both orientations so every interaction is listed from each side,
# drop repeated pairs, and index by protein 1 so that all protein 2's can be
# aggregated by their shared protein 1.
extend = (
    pd.concat([df, df_flipped])
    .drop_duplicates()
    .set_index('protein_1')
)

# One row per protein 1; 'protein_2' holds a tuple of all of its partners
gmt = extend.groupby('protein_1').agg(lambda partners: tuple(partners))

# Number of interaction partners per protein
gmt['interactions'] = gmt['protein_2'].map(len)

# Sort proteins from max to min according to number of protein interactions
gmt = gmt.sort_values(by=['interactions'], ascending=False)

# Labels of the gene sets whose size falls inside the configured limits
size_ok = (gmt['interactions'] >= MIN_INTERACTIONS) & (gmt['interactions'] < MAX_INTERACTIONS)
indices = gmt.index[size_ok].tolist()

# Filter the frame down to those gene sets
gmt = gmt.loc[indices]

# Work on a copy so `gmt` keeps its 'protein_2' and 'interactions' columns
gmt_2 = gmt.copy()

# Insert the description column, then move protein_1 out of the index so it
# becomes the first column of each row
gmt_2.insert(0, 'Description', 'No Description')
gmt_2 = gmt_2.reset_index()

# Join all partners with tabs and drop the columns not needed in the GMT
gmt_2['merged'] = gmt_2['protein_2'].map('\t'.join)
gmt_2 = gmt_2.drop(['protein_2', 'interactions'], axis=1)

# Map each row position to its GMT line: protein, description, then partners,
# all tab-separated
gmt_d = {position: '\t'.join(row) for position, row in gmt_2.iterrows()}

In [30]:
# Display the generated GMT lines (dict of row position -> tab-separated line)
gmt_d

{0: 'DDX47\tNo Description\tEMG1\tRPL3\tSLC25A4\tRPL21\tRPL27A\tLARP7\tRIOX2\tPAPD5\tSTAU2\tC1orf35\tUTP18\tRSL24D1\tRPS2\tPRDM15\tDHX30\tMRTO4\tGNL2\tGPATCH4\tMTHFD1\tGLYR1\tZC3H8\tRSBN1L\tSRPK1\tMPG\tIGF2BP1\tRRBP1\tPPAN-P2RY11\tHSP90AA1\tRPS13\tRPS9\tDDX27\tNCL\tSRP68\tRPS17\tPA2G4\tAVEN\tDDX24\tRPL19\tRACK1\tNOP58\tRPS27\tH1F0\tSLC25A5\tNOP16\tHNRNPU\tRPL5\tUTP23\tRPL22\tYBX3\tRPL36\tPURA\tDRG1\tRPL22L1\tRPL35A\tRPL14\tRTCB\tPOP1\tSPOUT1\tDDX31\tNPM3\tRPL7A\tZBTB24\tRBM19\tRBMX2\tCEBPZ\tRPS16\tRPS29\tZNF668\tLARP1\tRPL9\tRPS12\tRPL10A\tRBM28\tSERBP1\tUTP20\tCTCF\tZNF622\tUPF1\tRPL39\tHIST1H1B\tRPS24\tBTF3\tZNF574\tEIF3B\tNPM1\tBAZ1B\tRPL12\tZNF22\tZCCHC9\tZNF770\tEIF1AX\tNOP53\tDHX29\tRPL31\tHADHA\tABT1\tTSR1\tXRN2\tUSP10\tEIF3E\tH2AFX\tRPL29\tRPL3L\tNOLC1\tATP5C1\tRPL17\tNAT10\tSURF6\tKRI1\tWDR12\tYTHDC2\tMAK16\tRPL24\tRPS23\tZNF629\tRPS15\tYBX1\tRPL34\tDDX10\tMYEF2\tGZF1\tIFRD2\tSSB\tUBA52\tRPL7L1\tBYSL\tZC3H10\tDDX54\tC7orf50\tRPLP1\tNHP2\tHIST1H1D\tRPL7\tSPTY2D1\tNIFK\tFBL\tLAR

In [31]:
# Write every tab-separated GMT line to the output file, one gene set per line
with open('huMAP_ppi.gmt', 'w') as gmt_file:
    gmt_file.writelines(line + '\n' for line in gmt_d.values())

In [32]:
# Interaction counts, one entry per retained gene set
genes_term = gmt['interactions']

# Number of gene sets
genes_term.shape[0]

3959

In [33]:
# Mean number of interaction partners per gene set
avg_num_terms = genes_term.mean()
avg_num_terms

29.326092447587776

In [34]:
# Interaction rows whose protein_1 passed the gene-set size filter
stat_df = extend.loc[indices]

# Number of distinct protein_2 values among those rows
stat_df['protein_2'].unique().size

5231

In [35]:
#For inclusion on website as a statistic, calculate
#the total number of unique terms for the dataset

# Build stat_df from `extend` with a non-mutating reset_index so that
# re-running this cell always yields the same frame (protein_1 as a regular
# column) instead of stacking extra index columns onto an already-reset frame.
stat_df = extend.loc[indices].reset_index()

# Pool both protein columns and count the distinct names
all_terms = pd.concat([stat_df.protein_1, stat_df.protein_2], axis = 0)
len(all_terms.unique())

5251