# SIG to GMT Conversion
This notebook converts .sig files describing protein-protein interactions into the .gmt file format. Date processed: July 6, 2017

## Import Scripts Necessary to Run this Notebook

In [2]:
%run /home/maayanlab/Desktop/Projects/KEA3/Scripts/init.ipy

## Read in .SIG files corresponding to Database
Preview each SIG file to confirm that it was downloaded completely and parsed into the expected columns.

In [3]:
# Load the space-delimited BioCarta SIG file; it has no header row, so the
# columns are labelled with integer positions.
BioCarta_sig = pd.read_csv("Biocartasig.txt", sep = ' ', header = None)

In [4]:
# Preview up to the first 30 rows of the parsed SIG table.
BioCarta_sig.iloc[:30]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,MAP2K7,,,,,MAP2K7,,,,,0,Unknown,12345678
1,CORIN,,,,,NPPA,,,,,0,Unknown,12345678
2,AKT1,,,,,NOS3,,,,,0,Unknown,12345678
3,ANXA1,,,,,ANXA1,,,,,0,Unknown,12345678
4,GATA4,,,,,SRF,,,,,0,Unknown,12345678
5,SRF,,,,,HOP,,,,,0,Unknown,12345678
6,MAP2K7,,,,,MAP2K7,,,,,0,Unknown,12345678
7,F2,,,,,F2R,,,,,0,Unknown,12345678
8,F2,,,,,ATBF1,,,,,0,Unknown,12345678
9,WBSCR14,,,,,WBSCR14,,,,,0,Unknown,12345678


In [5]:
# Convert the SIG interaction table into GMT rows: one row per protein,
# followed by a description and every protein it interacts with.

# Column 0 and column 5 of the SIG table hold the two interacting proteins.
# Build the pair table once in the original orientation and once flipped, so
# that every interaction is listed under both of its participants.
d1 = {'protein_1': BioCarta_sig[0], 'protein_2': BioCarta_sig[5]}
df = pd.DataFrame(data = d1).drop_duplicates()

d2 = {'protein_1': BioCarta_sig[5], 'protein_2': BioCarta_sig[0]}
df_flipped = pd.DataFrame(data = d2).drop_duplicates()

# Make 'df' and 'df_flipped' into a single frame to account for opposite interactions
extend = pd.concat([df, df_flipped])

# Discard incomplete pairs: a missing protein name cannot be written to the GMT
# line (str.join rejects NaN) and would otherwise inflate the interaction counts.
extend = extend.dropna()

# Drop any duplicates and index by protein 1 so that all interacting
# protein 2's can be aggregated under the protein they share
extend = extend.drop_duplicates().set_index('protein_1')

gmt = extend.groupby('protein_1').agg(lambda x: tuple(x))

# Create column representing counts of protein interactions per protein.
# (Series.iteritems() was removed in pandas 2.0, so the count is vectorised.)
gmt['interactions'] = gmt['protein_2'].map(len)

# Sort proteins from max to min according to number of protein interactions
gmt.sort_values(by = ['interactions'], ascending= False, inplace=True)

# Keep gene sets with at least 5 and fewer than 2000 protein interactions.
# 'indices' stays a plain list of protein names because later cells reuse it.
in_size_range = (gmt['interactions'] >= 5) & (gmt['interactions'] < 2000)
indices = gmt.index[in_size_range].tolist()

# Filter dataframe by these indices
gmt = gmt.loc[indices]

gmt_2 = gmt.copy()

# Insert the description column, then move the protein name out of the index so
# the column order becomes: protein_1, Description, <interaction data>
gmt_2.insert(0, 'Description', 'BioCarta')
gmt_2.reset_index(inplace = True)

# Join all protein interactions by a tab and drop the columns not needed in GMT
gmt_2['merged'] = ['\t'.join(x) for x in gmt_2['protein_2']]
gmt_2.drop(['protein_2', 'interactions'], axis = 1, inplace = True)

# Map each row position to its finished tab-separated GMT line
gmt_d = {index: '\t'.join(rowData) for index, rowData in gmt_2.iterrows()}

In [6]:
# Write every assembled GMT row to the output file, one row per line
with open('BioCarta_ppi.gmt', 'w') as gmt_file:
    gmt_file.writelines(str(gmt_line) + '\n' for gmt_line in gmt_d.values())

# Exploratory Data Analysis

In [7]:
# Interaction counts of the proteins that passed the size filter;
# the number of entries is the number of gene sets kept.
genes_term = gmt['interactions']

genes_term.shape[0]

9

In [8]:
# Mean number of interactions across the retained proteins
avg_num_terms = genes_term.mean()
avg_num_terms

6.444444444444445

In [9]:
# Restrict the pair table to the retained proteins and count how many
# distinct interaction partners appear among them.
stat_df = extend.loc[indices]
stat_df['protein_2'].nunique(dropna = False)

48

In [10]:
# Website statistic: the total number of unique terms in the dataset,
# counting proteins that appear on either side of a retained interaction.
stat_df = stat_df.reset_index()

all_terms = pd.concat([stat_df['protein_1'], stat_df['protein_2']])
all_terms.nunique(dropna = False)

50