In [15]:
from itertools import combinations

import igraph as ig
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from matplotlib import pylab

# Load the preprocessed table; the path is relative to the notebook's working directory.
df = pd.read_excel("input_preprocessed.xlsx")
# Preview the first rows to check the column layout before building the graph.
df.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,89,90,91,92,93,94,95,96,97,98
0,cellular triglyceride homeostasis [GO:0035356],XBP1,FITM2,SIRT1,DGAT2,NR1H4,FUNDC2,C1QTNF3,,,...,,,,,,,,,,
1,urea transport [GO:0015840],UPK3A,AQP8,SLC14A1,SLC14A2,,,,,,...,,,,,,,,,,
2,response to tetrachloromethane [GO:1904772],IGF2R,NQO1,CPT1A,EZH2,,,,,,...,,,,,,,,,,
3,positive regulation of protein kinase C signal...,PLA2G6,WNT11,VEGFA,CD40,ADRA1A,FLT4,WNT5A,MC1R,ADGRV1,...,,,,,,,,,,
4,regulation of mitotic spindle assembly [GO:190...,CHMP2A,VPS4B,HSPA1A,HSPA1B,TPR,PLK1,HNRNPU,EML3,CCSAP,...,,,,,,,,,,


In [16]:
def construct_graph(df):
    '''
    Constructs the gene-gene interaction graph

    Each row of df is treated as one group of genes: the first cell of the row is
    skipped and every remaining non-missing cell is used as a gene label. Every
    unordered pair of distinct genes in a row is linked by an edge whose 'weight'
    attribute is the number of rows in which the two genes occur together.

    Parameters:
        df (pandas dataframe): The dataframe containing the biological processes with between 4 and 100 genes and their corresponding genes

    Returns:
        G (iGraph graph): The graph representation of the gene-gene interaction network, converted from NetworkX with vertex_attr_hashable="name"
    '''

    # Accumulate edges in NetworkX first; the result is converted to igraph at the end.
    G = nx.Graph()
    for _, row in df.iterrows():
        # NOTE(review): only the first column is skipped. The displayed frame appears to
        # have two leading non-gene columns ('Unnamed: 0.1', 'Unnamed: 0') -- confirm the
        # process-name column is not being added to the graph as a node.
        present = row.iloc[1:].dropna()
        # Drop missing cells explicitly instead of relying on how astype(str) renders
        # them; the literal text 'nan' is still filtered as before. dict.fromkeys
        # removes repeated genes within a row while keeping first-appearance order.
        genes = list(dict.fromkeys(
            label for label in map(str, present) if label != 'nan'
        ))

        # Visit each unordered pair exactly once. The graph is undirected, so visiting
        # both (a, b) and (b, a) would count every shared row twice.
        for gene_a, gene_b in combinations(genes, 2):
            if G.has_edge(gene_a, gene_b):
                G[gene_a][gene_b]['weight'] += 1
            else:
                G.add_edge(gene_a, gene_b, weight=1)

    return ig.Graph.from_networkx(G, vertex_attr_hashable="name")

In [17]:
# Build the interaction network from the loaded table.
G = construct_graph(df)

# Report the vertex count followed by the edge count, one value per line.
print(G.vcount(), G.ecount(), sep="\n")

14567
1140873


In [None]:
# Write the graph to a GraphML file; the path is relative to the notebook's working directory.
G.save("gene_gene_interaction_graph.graphml", format="graphml")