# SIG to GMT Conversion
This notebook will process .sig files related to protein-protein interactions into the alternate file format of a .gmt file. Date processed: Jun 28 2017

## Import Scripts Necessary to Run this Notebook

In [1]:
%run /home/maayanlab/Desktop/Projects/KEA3/Scripts/init.ipy

## Read in .SIG files corresponding to Databases
View each SIG file to ensure everything was downloaded in proper file format.

In [2]:
colnames = [0, 1, 2, 3, 4, 5,6,7, 8, 9, 10, 11, 12]


biogrid_sig = pd.read_table("~/Desktop/Projects/KEA3/PPI/BioGrid/biogrid_ppi_2017_06_28.sig", header = None)

dip_sig = pd.read_table("~/Desktop/Projects/KEA3/PPI/DIP/dip_ppi_2017_06_28.sig", header = None)

innatedb_sig = pd.read_table("~/Desktop/Projects/KEA3/PPI/InnateDB/innatedb_ppi_2017_06_28.sig", header = None)

intact_sig = pd.read_table("~/Desktop/Projects/KEA3/PPI/IntAct/intact_ppi_2017_07_05.sig", header = None)

kea_sig = pd.read_table("~/Desktop/Projects/KEA3/PPI/KEA/kea_ppi_2017_06_28.sig", header = None)

mentha_sig = pd.read_table("~/Desktop/Projects/KEA3/PPI/mentha/mentha_ppi_2017_06_28.sig", header = None)

mint_sig = pd.read_table("~/Desktop/Projects/KEA3/PPI/MINT/mint_ppi_2017_06_28.sig", header = None)

savi_sig = pd.read_table("~/Desktop/Projects/KEA3/PPI/SAVI/savi_ppi_2017_06_28.sig", header = None)

humap_sig = pd.read_table("~/Desktop/Projects/KEA3/PPI/humap/huMAP.sig", header = None, names = colnames, sep = ' ')

kegg_sig = pd.read_table("~/Desktop/Projects/KEA3/PPI/KEGG/KEGG.sig", header = None, names = colnames, sep = ' ')

biocarta_sig = pd.read_table("~/Desktop/Projects/KEA3/PPI/BioCarta/Biocarta.sig", header = None, names = colnames, sep = ' ')

ppid_sig = pd.read_table("~/Desktop/Projects/KEA3/PPI/ppid/ppid.sig", header = None, names = colnames, sep = ' ')

bioplex_sig = pd.read_table("~/Desktop/Projects/KEA3/PPI/BioPlex_2017/BioPlexsig.txt", header = None, names = colnames, sep = '\t')

In [5]:
# View Resulting Dataframe
biogrid_sig.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,A1CF,,,,,APOBEC1,,,,,,,10669759|11134005
1,A1CF,,,,,SYNCRIP,,,,,,,11134005|11352648
2,A1CF,,,,,KHSRP,,,,,,,10781591
3,A2M,,,,,APOE,,,,,,,9831625
4,A2M,,,,,IL10,,,,,,,10714547


In [14]:
# View Resulting Dataframe
dip_sig.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,A2M,,,,,APP,,,,,,,9501253
1,ABCA1,,,,,SPTLC1,,,,,,,25170080
2,ABCA1,,,,,KIF11,,,,,,,25170080
3,ABCA1,,,,,FLNA,,,,,,,25170080
4,ABCC8,,,,,KCNJ11,,,,,,,19805355


In [None]:
# View Resulting Dataframe
innatedb_sig.head()

In [34]:
# View Resulting Dataframe
intact_sig.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,A2M,,,,,SPACA3,,,,,,,2467766
1,AANAT,,,,,YWHAZ,,,,,,,15644438
2,AATF,,,,,CHEK2,,,,,,,17157788|17157788
3,AATF,,,,,MAPT,,,,,,,14697667|14697667
4,AATF,,,,,POLR2J,,,,,,,10783144


In [None]:
# View Resulting Dataframe
kea_sig.head()

In [None]:
# View Resulting Dataframe
mentha_sig.head()

In [25]:
# View Resulting Dataframe
mint_sig.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,AANAT,,,,,YWHAZ,,,,,,,15644438
1,AATF,,,,,MAPT,,,,,,,14697667|14697667
2,AATF,,,,,POLR2J,,,,,,,10783144
3,AATF,,,,,RBBP8,,,,,,,10783144|10783144
4,AATF,,,,,SP1,,,,,,,12847090|12847090


In [None]:
# View Resulting Dataframe
savi_sig.head()

## Define function 'sig_to_gmt'

In [58]:
#convert a SIG file to a GMT file
#Create two versions of df 'd1' --> one flipped and one which is in original order
d1 = {'protein_1': ppid_sig[0], 'protein_2': ppid_sig[5]}
df = pd.DataFrame(data = d1)
df.drop_duplicates(inplace = True)

d2 = {'protein_1': ppid_sig[5], 'protein_2': ppid_sig[0]}
df_flipped = pd.DataFrame(data = d2)
df_flipped.drop_duplicates(inplace = True)

#Make 'df' and 'df_flipped' into a single frame to account for opposite interactions
extend = pd.concat([df, df_flipped])

#Drop any duplicates and set index according to protein 1 so that we can aggregate 
#all interacting protein 2's by a shared interaction with protein one
extend.drop_duplicates(inplace = True)
extend.set_index('protein_1', inplace = True)

gmt = extend.groupby('protein_1').agg(lambda x: tuple(x))

# Create column representing counts of protein interactions per protein
gmt['interactions'] = [int(len(lst)) for protein, lst in gmt['protein_2'].iteritems()]

# Sort proteins from max to min according to number of protein iteractions
gmt.sort_values(by = ['interactions'], ascending= False, inplace=True)

#Gain indexes of gene sets with 5 or more proteins and less than 2000 protein interactions
indices = [index for index, rowData in gmt.interactions.iteritems() if rowData >= 5 and rowData < 2000]

#filter dataframe by these indices
gmt = gmt.loc[indices]

gmt_2 = gmt.copy()

#reset index and insert a column for the description
gmt_2.insert(0, 'Description', 'No Description')
gmt_2.reset_index(inplace = True)

#Drop columns not needed in GMT and join all protein interactions by a tab
gmt_2['merged'] = ['\t'.join(x) for x in gmt_2['protein_2']]
gmt_2.drop('protein_2', axis = 1, inplace = True)
gmt_2.drop('interactions', axis = 1, inplace = True)

#create a dictionary and store in it rowData corresponding to each protein
gmt_d = dict([(key, '') for key in gmt_2.index])

# loop through rows with iterrows()
for index, rowData in gmt_2.iterrows():
    line = ('\t'.join(rowData))
    gmt_d[index] = line

In [59]:
genes_term = gmt.interactions

len(genes_term)

421

In [60]:
sum(genes_term)

4672

In [61]:
avg_num_terms = genes_term.mean(axis = 0)
avg_num_terms

11.097387173396674

In [62]:
#For inclusion on website as a statistic, calculate 
#the total number of unique terms for the dataset
stat_df = extend.loc[indices]

stat_df.reset_index(inplace=True)

all_terms = pd.concat([stat_df.protein_1, stat_df.protein_2], axis = 0)
len(all_terms.unique())

1115

# Create GMT Files for each SIG File

In [32]:
#Transfer tab-separated info into a new gmt file
#with open('biogrid_ppi.gmt', 'w') as openfile:
    for index in gmt_d:
        openfile.write(str(gmt_d[index]) + '\n')

IndentationError: unexpected indent (<ipython-input-32-faac3e32c088>, line 3)

In [28]:
#Transfer tab-separated info into a new gmt file
#with open('dip_ppi.gmt', 'w') as openfile:
    for index in gmt_d:
        openfile.write(str(gmt_d[index]) + '\n')

In [29]:
#Transfer tab-separated info into a new gmt file
#with open('mint_ppi.gmt', 'w') as openfile:
    for index in gmt_d:
        openfile.write(str(gmt_d[index]) + '\n')

In [36]:
#Transfer tab-separated info into a new gmt file
#with open('intact_ppi.gmt', 'w') as openfile:
    for index in gmt_d:
        openfile.write(str(gmt_d[index]) + '\n')

In [42]:
#Transfer tab-separated info into a new gmt file
#with open('innatedb_ppi.gmt', 'w') as openfile:
     for index in gmt_d:
        openfile.write(str(gmt_d[index]) + '\n')

In [48]:
#Transfer tab-separated info into a new gmt file
#with open('kea_ppi.gmt', 'w') as openfile:
    for index in gmt_d:
        openfile.write(str(gmt_d[index]) + '\n')

In [54]:
#Transfer tab-separated info into a new gmt file
#with open('mentha_ppi.gmt', 'w') as openfile:
    for index in gmt_d:
        openfile.write(str(gmt_d[index]) + '\n')

In [60]:
#Transfer tab-separated info into a new gmt file
#with open('savi_ppi.gmt', 'w') as openfile:
    for index in gmt_d:
        openfile.write(str(gmt_d[index]) + '\n')