# Imports

In [2]:
import numpy as np
import pandas as pd
from IPython.display import display
import graph_tool.all as gt

# Directories

In [4]:
# Data folders, given relative to the working directory. The trailing slash
# matters: file paths are built from these by plain string concatenation.
rawdata, processeddata = "data/raw/", "data/processed/"

# Pathway Analysis

## Load Data

In [5]:
# Directed signalling edges loaded from omnipath_graph.csv. Whatever the first
# two columns are called in the file, they become "source" and "target".
signalling = (
    pd.read_csv(processeddata + "omnipath_graph.csv")
    # The rename is resolved against the frame that was just loaded, so the
    # CSV is parsed once instead of once per column-name lookup.
    .pipe(lambda df: df.rename(columns={
        df.columns[0]: "source",
        df.columns[1]: "target",
    }))
    # simplify the graph by removing self-loops
    [lambda x: x.source != x.target]
)
# TF -> target gene interactions; downstream cells use its "TF" and "gene" columns.
tf_reg = pd.read_csv(processeddata + "dorothea_graph.csv")
print("signalling network:", len(signalling))
display(signalling.head())
print("TF-regulatory network:", len(tf_reg))
tf_reg.head()

signalling network: 106684


Unnamed: 0,source,target
0,CALM1,TRPC1
1,CALM3,TRPC1
2,CALM2,TRPC1
3,CAV1,TRPC1
4,DRD2,TRPC1


TF-regulatory network: 47306


Unnamed: 0,TF,gene
0,MYC,TERT
1,SMAD3,JUN
2,SMAD4,JUN
3,RELA,FAS
4,SP1,ALDOA


In [6]:
# Driver-neighbour pairs, annotated with whether the driver is itself a TF
# of the neighbour according to the TF-regulatory table.
pairs_raw = pd.read_csv(processeddata + "main_graph.csv")
pairs_with_tf = pairs_raw.merge(
    tf_reg,
    how="left",
    left_on=["driver", "neighbour"],
    right_on=["TF", "gene"],
)
# distance = 1 when the (driver, neighbour) pair matched a (TF, gene) row, else 0
pairs_with_tf["distance"] = pairs_with_tf["TF"].notna().astype(int)
# the merge helper columns are no longer needed once the flag is set
main_graph = pairs_with_tf.drop(columns=["TF", "gene"])

drivers = main_graph["driver"].unique()
neighbours = main_graph["neighbour"].unique()
# TFs that regulate at least one neighbour gene
tf_list = tf_reg.loc[tf_reg["gene"].isin(neighbours), "TF"].unique()
print(len(main_graph))
main_graph.head()

456493


Unnamed: 0,driver,neighbour,distance
0,A1CF,APOB,0
1,A1CF,APOBEC1,0
2,A1CF,APOBEC2,0
3,A1CF,APOBEC3A,0
4,A1CF,APOBEC3B,0


In [7]:
# Overlap statistics between the driver-neighbour graph and the two networks.
tf_degree = tf_reg.groupby("TF").size()  # number of tf_reg rows per TF
n_drivers_in_signalling = len(set(drivers) & set(signalling["source"]))
n_regulated_neighbours = len(set(neighbours) & set(tf_reg["gene"]))
top_tf_degree = tf_degree[tf_degree == tf_degree.max()].to_dict()

print("# drivers:", len(drivers))
print("# Drivers in signalling network (outgoing edges):",
      n_drivers_in_signalling)
print("# neighbours:", len(neighbours))
print("# Neighbours regulated by a TF:", n_regulated_neighbours)
print("# TFs regulating neighbours:", len(tf_list))
print("# TF-gene interactions in main graph:", main_graph["distance"].sum())
print("Median TF degree:", tf_degree.median())
print("Max TF degree:", top_tf_degree)

# drivers: 3081
# Drivers in signalling network (outgoing edges): 1756
# neighbours: 15465
# Neighbours regulated by a TF: 10079
# TFs regulating neighbours: 624
# TF-gene interactions in main graph: 3283
Median TF degree: 10.0
Max TF degree: {'CTCF': 2266}


## Network Construction

In [8]:
# Every gene that appears as an edge endpoint, sorted, so that a node's id is
# its position in the sorted list (ids therefore run 0..N-1 without gaps).
gene_symbols = (
    pd.concat([signalling["source"], signalling["target"]], ignore_index=True)
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)
node_list = pd.DataFrame({"id": gene_symbols.index, "symbol": gene_symbols})
# boolean flags marking which nodes are drivers / TFs of neighbour genes
node_list["driver"] = node_list["symbol"].isin(drivers)
node_list["TF"] = node_list["symbol"].isin(tf_list)

# Attach the integer node id to both endpoints of every signalling edge.
symbol_ids = node_list[["symbol", "id"]]
source_ids = symbol_ids.rename(columns={"symbol": "source", "id": "id_source"})
target_ids = symbol_ids.rename(columns={"symbol": "target", "id": "id_target"})
edge_list = (
    signalling
    .merge(source_ids, on="source")
    .merge(target_ids, on="target")
    [["source", "target", "id_source", "id_target"]]
)
display(node_list.head(2))
display(edge_list.head(2))

Unnamed: 0,id,symbol,driver,TF
0,0,A1BG,False,False
1,1,A2M,False,False


Unnamed: 0,source,target,id_source,id_target
0,CALM1,TRPC1,1068,9628
1,CALM3,TRPC1,1070,9628


In [9]:
# Size of the signalling graph and how many drivers / TFs it covers.
n_driver_nodes = node_list["driver"].sum()
n_tf_nodes = node_list["TF"].sum()

print("# Nodes:", len(node_list))
print("# Edges:", len(edge_list))
print("# Drivers:", n_driver_nodes)
# share of all drivers that are present as nodes in the signalling graph
print("Driver %:", n_driver_nodes / len(drivers) * 100)
print("# TFs regulating neighbours in signalling network:", n_tf_nodes)
# share of all neighbour-regulating TFs that are present as nodes
print("TFs %:", n_tf_nodes / len(tf_list) * 100)

# Nodes: 10408
# Edges: 106684
# Drivers: 2256
Driver %: 73.22297955209348
# TFs regulating neighbours in signalling network: 547
TFs %: 87.66025641025641


## Shortest Path Calculation

In [10]:
# Build the directed graph straight from the integer (source id, target id) pairs.
edge_ids = edge_list[["id_source", "id_target"]].to_numpy()
graph = gt.Graph(g=edge_ids, directed=True)
# All-pairs shortest distances (no edge weights are passed), unpacked from the
# per-vertex property map into one dense 2D NumPy array.
distances = gt.shortest_distance(graph).get_2d_array()
print(distances.shape)

(10408, 10408)


`distances` is a 2D array in which each column corresponds to one node and holds that node's vector of shortest-path distances to every node in the graph, i.e. `distances[i, j]` is the distance from node `j` to node `i`. See the [documentation](https://graph-tool.skewed.de/static/docs/stable/autosummary/graph_tool.VertexPropertyMap.html#graph_tool.VertexPropertyMap.get_2d_array).

In [11]:
# For every driver-neighbour pair, find the TF(s) of the neighbour that the
# driver reaches through the signalling graph in the fewest steps, and report
# that path length plus one (the final TF -> neighbour regulatory step).
column = "driver"
index = "TF"
# graph-tool marks unreachable pairs with the largest value of the distance
# type (inf for floating-point distances). Derive the sentinel from the dtype
# rather than from the observed maximum: if every selected TF happened to be
# reachable from every driver, the observed maximum would be a genuine
# distance and those paths would be discarded by mistake.
unreachable = (
    np.iinfo(distances.dtype).max
    if np.issubdtype(distances.dtype, np.integer)
    else np.inf
)
distances_results = (
    # rows = TF nodes, columns = driver nodes: each entry is the distance
    # from the column's driver to the row's TF
    pd.DataFrame(
        data=distances[np.ix_(node_list.TF, node_list.driver)],
        columns=node_list.loc[node_list.driver, "symbol"],
        index=node_list.loc[node_list.TF, "symbol"]
    )
    .rename_axis(index, axis=0)
    .rename_axis(column, axis=1)
    .reset_index()
    # long format: one row per (TF, driver) with its distance
    .melt(id_vars=index, var_name=column, value_name="distance")
    # the sentinel value means no shortest path found
    .assign(
        distance=lambda x: x.distance.where(x.distance != unreachable)
    )
    # drop rows with no shortest path found
    .dropna()
    # expand each TF to the neighbour genes it regulates
    .merge(tf_reg[tf_reg.gene.isin(neighbours)], on="TF", how="inner")
    .rename(columns={"gene": "neighbour"})
    # keep only paths where the driver connects to a TF of its neighbour
    # this greatly reduces the number of paths
    # (no `on=` given: joins on the shared columns "driver" and "neighbour")
    .merge(main_graph[["driver", "neighbour"]], how="inner")
    # pivot to one row per driver-neighbour pair and one column per TF
    .pivot_table(
        index=["driver", "neighbour"],
        columns="TF",
        values="distance",
    )
    # minimum distance over all TFs for each driver-neighbour pair
    .assign(min_distance=lambda x: x.min(axis=1))
    .reset_index()
    .set_index(["driver", "neighbour", "min_distance"])
    # stack the distances to have a long format
    .stack()
    .reset_index(level="min_distance", name="distance")
    # keep only the TFs that achieve the minimum distance of their
    # driver-neighbour pair
    [lambda x: x.distance == x.min_distance]
    .reset_index()
    .drop(columns=["min_distance"])
    .groupby(["driver", "neighbour", "distance"], as_index=False)
    # concatenate the TFs with shortest path to neighbour for each driver-neighbour pair
    # We will have 1 row per driver-neighbour pair
    .agg(lambda x: x.str.cat(sep=","))
    # a distance of 0 means the driver is itself a TF of its neighbour;
    # adding the TF -> neighbour step makes that shortest path 1
    .assign(distance=lambda x: x.distance + 1)
    # add drivers which are TFs of their neighbours but were
    # not present in the signalling network
    # (no `on=` given: joins on the shared columns, including "distance")
    .merge(main_graph, how="outer")
    # remove interactions without a shortest path: main_graph rows with
    # distance 0 come through the outer merge unmatched
    [lambda x: x.distance > 0]
)
print(len(distances_results))
distances_results.head()

270378


Unnamed: 0,driver,neighbour,distance,TF
166,ABCA7,ADRB2,5.0,"SPI1,TP53"
169,ABCA7,APOA1,5.0,"CEBPA,E2F4,EGR1,ESR1,GATA6,HNF4A,PPARA,PPARG,R..."
173,ABCA7,HNRNPD,5.0,"E2F4,ETS1,MITF,MYC"
176,ABCA7,LLGL2,5.0,"ESR1,SOX9"
179,ABCA7,SNX27,5.0,"CEBPA,HNF4A"


In [12]:
# Rows that came only from main_graph in the outer merge carry no TF string;
# for those pairs the TF column is filled with the driver itself.
tf_missing_before = distances_results["TF"].isna()
print(int(tf_missing_before.sum()))
fill_TF = distances_results.driver
distances_results = distances_results.assign(
    TF=lambda x: x["TF"].fillna(fill_TF)
)
tf_missing_after = distances_results["TF"].isna()
print(int(tf_missing_after.sum()))
distances_results.info()

1
0
<class 'pandas.core.frame.DataFrame'>
Index: 270378 entries, 166 to 723586
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   driver     270378 non-null  object 
 1   neighbour  270378 non-null  object 
 2   distance   270378 non-null  float64
 3   TF         270378 non-null  object 
dtypes: float64(1), object(3)
memory usage: 10.3+ MB


In [13]:
# Neighbour genes that have at least one TF present as a node in the
# signalling graph.
tf_is_node = tf_reg["TF"].isin(node_list["symbol"])
gene_is_neighbour = tf_reg["gene"].isin(neighbours)
neighbours_with_TF = tf_reg.loc[tf_is_node & gene_is_neighbour, "gene"]

# Start from every driver-neighbour pair and record whether each end could be
# evaluated at all, then attach the shortest-path results where they exist.
pair_presence = main_graph[["driver", "neighbour"]].assign(
    driver_present=lambda x: x["driver"].isin(node_list["symbol"]),
    neighbour_present=lambda x: x["neighbour"].isin(neighbours_with_TF),
)
# NOTE: distances_results is overwritten here, so re-running this cell on its
# own would merge the already-merged table a second time.
distances_results = pair_presence.merge(
    distances_results,
    how="left",
    on=["driver", "neighbour"],
)
print(distances_results.shape[0])
distances_results.head()

456493


Unnamed: 0,driver,neighbour,driver_present,neighbour_present,distance,TF
0,A1CF,APOB,False,True,,
1,A1CF,APOBEC1,False,True,,
2,A1CF,APOBEC2,False,False,,
3,A1CF,APOBEC3A,False,False,,
4,A1CF,APOBEC3B,False,False,,


In [14]:
# Add column with the number of shortest paths for each driver-neighbour pair
distances_results["n_shortest_paths"] = distances_results["TF"].str.split(",").str.len()
distances_results.loc[
  (distances_results.driver_present) &
  (distances_results.neighbour_present) &
  (distances_results.distance.isna()), "n_shortest_paths"] = 0
distances_results.dropna().head()

Unnamed: 0,driver,neighbour,driver_present,neighbour_present,distance,TF,n_shortest_paths
165,ABCA7,ADRB2,True,True,5.0,"SPI1,TP53",2.0
167,ABCA7,APOA1,True,True,5.0,"CEBPA,E2F4,EGR1,ESR1,GATA6,HNF4A,PPARA,PPARG,R...",10.0
170,ABCA7,HNRNPD,True,True,5.0,"E2F4,ETS1,MITF,MYC",4.0
172,ABCA7,LLGL2,True,True,5.0,"ESR1,SOX9",2.0
174,ABCA7,SNX27,True,True,5.0,"CEBPA,HNF4A",2.0


In [15]:
# Final tallies: n_shortest_paths is NaN for pairs that were not evaluated,
# 0 for evaluated pairs without a path, and >= 1 otherwise.
n_pairs = len(distances_results)
n_with_path = int(distances_results["distance"].notna().sum())
n_without_path = int((distances_results["n_shortest_paths"] == 0).sum())
n_evaluated = int((distances_results["n_shortest_paths"] >= 0).sum())

print("# driver-neighbour pairs:", n_pairs)
print("# shortest paths:", n_with_path)
print("# pairs in graph without shortest path:",
      n_without_path)
print("# of driver-neighbour pairs evaluated:",
      n_evaluated)

# driver-neighbour pairs: 456493
# shortest paths: 270378
# pairs in graph without shortest path: 54969
# of driver-neighbour pairs evaluated: 325347


In [16]:
# Persist the per-pair results (row index omitted) next to the other processed tables.
output_path = processeddata + "full_pathway_analysis.csv"
distances_results.to_csv(output_path, index=False)