# Creating a large network by combining the small ones

#### done:
Created a network for each window containing the most significant TF–gene connections that vary between branch A and branch B.

In [2]:
# Directory searched for the per-branch files (patterns "A_*" / "B_*.csv").
# The trailing slash is required: file names are appended via f-string concatenation.
PATH_TO_DATAFRAMES='data_files/dataframes/'
# Directory holding one "win{k}_abs.csv" table per window (trailing slash required, see above).
PATH_TO_ABS_MATRICES='data_files/abs_networks/'
# Number of windows to load; windows are numbered 0 .. NUM_WINDOWS-1.
NUM_WINDOWS=5

In [3]:
import pandas as pd
import glob
import os
import networkx as nx
import matplotlib.pyplot as plt
from pyvis import network as net

In [4]:
def make_network(dataframe, window):
    """Build a directed TF -> gene graph from one table and render it with pyvis.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns 'TF', 'GENE' and 'abs'. Every row becomes one
        edge TF -> GENE whose ``weight`` attribute is the row's 'abs' value.
    window : int
        Index of the window the table belongs to. Currently unused: every
        call writes to the same "nx.html" file.

    Returns
    -------
    None
        The interactive graph is written to "nx.html".
    """
    G = nx.DiGraph()
    # Walk the column values positionally instead of looking rows up by label,
    # so the function also works on frames whose index is not 0..n-1
    # (e.g. after sorting or filtering without reset_index).
    for tf, gene, w in zip(dataframe['TF'], dataframe['GENE'], dataframe['abs']):
        G.add_edge(tf, gene, weight=w)

    nt = net.Network(notebook=True)
    nt.from_nx(G)
    nt.show("nx.html")

In [16]:
def determine_cutoff(weights, num_top_values):
    """Return the threshold used to keep only the largest connection weights.

    The weights are ranked from largest to smallest and the value at 0-based
    position ``num_top_values`` is returned. Keeping only the weights that are
    strictly greater than the returned value therefore retains at most
    ``num_top_values`` connections, which makes the network clearer.

    Parameters
    ----------
    weights : iterable of numbers
        Connection weights. The input is not modified.
    num_top_values : int
        Rank that determines the cutoff. If it is at least the number of
        weights, the smallest weight is returned.

    Returns
    -------
    number
        The cutoff value taken from ``weights``.

    Raises
    ------
    IndexError
        If ``weights`` is empty.
    """
    # sorted() builds a new list, so the caller's sequence keeps its order.
    ranked = sorted(weights, reverse=True)
    # Clamp the rank so a request beyond the end falls back to the smallest weight.
    position = min(num_top_values, len(ranked) - 1)
    return ranked[position]

In [7]:
# for all windows

In [8]:
# File names (without directory) of the per-branch CSV files.
# Both branches use the same "*.csv" pattern so that non-CSV files are ignored,
# and the lists are sorted because glob returns matches in arbitrary order.
branchA_csvs = sorted(
    os.path.basename(csv) for csv in glob.glob(f"{PATH_TO_DATAFRAMES}A_*.csv")
)

branchB_csvs = sorted(
    os.path.basename(csv) for csv in glob.glob(f"{PATH_TO_DATAFRAMES}B_*.csv")
)

Read the per-window `abs` tables (one CSV file per window):

In [10]:
# One table per window, stored so that network_dfs[k] belongs to window k.
network_dfs = []

for k in range(NUM_WINDOWS):
    print(f'current window {k}')
    win_dataframe = pd.read_csv(f'{PATH_TO_ABS_MATRICES}win{k}_abs.csv')

    # Rename all TFs by appending "_TF" to the name (presumably so that a TF and
    # a gene sharing a name end up as separate nodes - confirm).
    # A single column assignment replaces the former per-row chained
    # `df['TF'].at[i] = ...`, which does not update the frame when pandas
    # Copy-on-Write is active and is slow on large tables.
    win_dataframe['TF'] = win_dataframe['TF'].astype(str) + '_TF'

    network_dfs.append(win_dataframe)

current window 0
current window 1
current window 2
current window 3
current window 4


In [17]:
# Display the table loaded for window 4 (the last of the NUM_WINDOWS frames).
network_dfs[4]


Unnamed: 0,TF,GENE,abs
0,Irf1_TF,0610030E20Rik,0.000000
1,Irf1_TF,1110004F10Rik,0.000000
2,Irf1_TF,1700020I14Rik,0.000000
3,Irf1_TF,1810037I17Rik,0.529617
4,Irf1_TF,1810058I24Rik,0.000000
...,...,...,...
123691,Ctcf_TF,Zmynd11,0.000000
123692,Ctcf_TF,Zmynd8,0.633632
123693,Ctcf_TF,Zranb2,0.000000
123694,Ctcf_TF,Zrsr2,0.000000


Create the graph:

In [21]:
NUM_CONNS = 50 # choose which connection determines the cutoff

# (window index, TF colour, gene colour) for every window drawn in the combined network.
WINDOW_STYLES = [
    (1, 'blue', 'lightblue'),
    (2, 'green', 'lightgreen'),
    (3, 'yellowgreen', 'yellow'),
    (4, 'orangered', 'orange'),
]
# Colour of a gene that is selected in more than one window.
SHARED_GENE_COLOR = 'grey'


def add_window_to_graph(graph, window_df, window, tf_color, gene_color,
                        gene_first_window, num_conns=NUM_CONNS):
    """Add the strongest TF -> gene connections of one window to ``graph``.

    Parameters
    ----------
    graph : networkx.DiGraph
        Graph that is extended in place.
    window_df : pandas.DataFrame
        Table with the columns 'TF', 'GENE' and 'abs' for this window.
    window : int
        Index of the window; used to tell apart genes that repeat inside one
        window from genes shared between windows.
    tf_color, gene_color : str
        Node colours for the TFs and for the genes first seen in this window.
    gene_first_window : dict
        Maps gene name -> index of the first window that selected it. Updated
        in place; pass the same dict for every window of one graph.
    num_conns : int
        Rank handed to ``determine_cutoff``; only rows whose 'abs' value is
        strictly greater than the resulting cutoff are added.
    """
    cutoff = determine_cutoff(list(window_df['abs']), num_conns)
    # Filter once with a boolean mask instead of testing every row in Python.
    strongest = window_df[window_df['abs'] > cutoff]

    for tf, gene, w in zip(strongest['TF'], strongest['GENE'], strongest['abs']):
        # add_node on an existing node updates its attributes, so a TF that
        # occurs in several windows ends up with the colour of the last one.
        graph.add_node(tf, color=tf_color, group='TF')

        # A gene is marked as shared only if an EARLIER window introduced it;
        # a gene targeted by several TFs within the same window keeps the
        # window colour.
        if gene_first_window.setdefault(gene, window) == window:
            graph.add_node(gene, color=gene_color, group='GENE')
        else:
            graph.add_node(gene, color=SHARED_GENE_COLOR, group='GENE')

        graph.add_edge(tf, gene, weight=w, color='black')


G = nx.DiGraph()

gene_first_window = {}
for window, tf_color, gene_color in WINDOW_STYLES:
    add_window_to_graph(G, network_dfs[window], window, tf_color, gene_color,
                        gene_first_window)

# Edge weights keyed by (tf, gene); not consumed by pyvis below, kept for inspection.
edge_labels = nx.get_edge_attributes(G,'weight')


nt = net.Network(notebook=True)
nt.from_nx(G)

nt.show("nx.html")
    

#### Description:
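blue: window 1 (TFs blue, genes light blue) \
green: window 2 (TFs green, genes light green) \
yellow: window 3 (TFs yellow-green, genes yellow) \
orange: window 4 (TFs orange-red, genes orange) \
grey: gene appears in at least two windows (connection between subnetworks) \
Note: a TF that is selected in several windows takes the colour of the last of those windows.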


#### done:
Created one large network combining windows 1–4 (of the 5 windows), showing the transcription factor–gene interactions that vary the most between branch A and branch B in each window. Because these genes are expressed very differently in the two branches, they might be responsible for the development into two different cell types.