In [None]:
#Correlation Network Analysis

In [None]:
#Gene network analysis is a method designed to identify sub-networks (modules) of correlated genes, which are likely to be co-expressed.
#This can be helpful in identification of sub-networks (modules) of genes that contribute to disease.
#In this example, we will cover how to create a pairwise correlation matrix of genes, as well as how to associate them with disease.

In [None]:
import pandas as pd
import numpy as np
import scanpy as sc
import anndata
import networkx as nx
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
import json
import scipy
from sklearn.decomposition import PCA
from scipy.stats import pearsonr
from scipy.spatial.distance import pdist, squareform
from statsmodels.stats.multitest import multipletests
import random

In [None]:
# Load the two input tables. In both files the first CSV column is used as the row index.
expression_csv = '/data/recode/mic_datExpr_pseudobulk.csv'
metadata_csv = '/data/recode/mic_metadata_pseudobulk.csv'

datExpr = pd.read_csv(expression_csv, index_col=0)
metadata = pd.read_csv(metadata_csv, index_col=0)

In [None]:
# Inspect the expression table; the correlation step below compares its columns with each other.
datExpr

In [None]:
# Inspect the metadata table.
# NOTE(review): `metadata` is not referenced again in the cells visible here.
metadata

In [None]:
#Correlation is a statistical measure that describes the extent to which two variables change together. It indicates the strength and direction of a linear relationship between two variables.
#Correlation analysis for coexpression networks is a method used to study the relationships between genes or proteins by analysing their expression levels across various conditions, tissues, or time points.
#We will be exploring correlation based co-expression networks within this exercise.

In [None]:
#Step 1:
#DataFrame.corr compares every pair of columns (genes); Pearson is requested explicitly here
#so that the choice of measure is visible (it is also the pandas default).
correlation_matrix = datExpr.corr(method='pearson')

In [None]:
#Let's view the correlation matrix. The diagonal is 1 because each gene is perfectly correlated with itself.
correlation_matrix

In [None]:
#Step 2:
#Now that we have the correlation matrix, we need to calculate the distance matrix.
#A distance matrix is a mathematical representation that captures the pairwise distances between a set of objects (here: genes).
#In hierarchical clustering, distance matrices are used to determine which objects to merge or split based on their pairwise distances.
#So, we will be using the distance matrix in order to calculate clusters between genes, which will form our networks. These are commonly also called communities.

# Convert correlation to distance: a correlation of 1 becomes 0, a correlation of -1 becomes 2.
distance_matrix = 1 - correlation_matrix

# sch.linkage() treats only 1-D input as precomputed (condensed) distances. A 2-D input is
# read as raw observation vectors and re-measured with Euclidean distance, which would
# silently cluster on "distance between distance profiles" instead of 1 - correlation.
# squareform() condenses the square matrix; checks=False skips the symmetry / zero-diagonal
# validation, which tiny floating-point differences could otherwise trip.
condensed_distances = squareform(distance_matrix.to_numpy(), checks=False)
linkage_matrix = sch.linkage(condensed_distances, method='average')

# Plot the dendrogram
plt.figure(figsize=(10, 6))
dendrogram = sch.dendrogram(linkage_matrix)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Gene Index')  # the leaves are the rows of the gene-gene matrix, not samples
plt.ylabel('Distance')
plt.show()

In [None]:
#Step 3:
#Every pair of genes has some correlation, so a network built from all of them would be cluttered with edges of widely varying strength.
#Weak correlations are not of interest here, so we keep only the strong ones.

# Cut-off above which a correlation counts as a significant edge (adjust to your requirements)
threshold = 0.7

# Element-wise comparison: True wherever the correlation is strictly greater than the cut-off
significant_edges = correlation_matrix.gt(threshold)

In [None]:
#Let's have a look at the significant_edges dataframe.
#It is a boolean dataframe: each cell states whether the corresponding correlation is greater than the threshold.
significant_edges

In [None]:
#Step 4:
#networkx is a python library designed for network analysis.
# Construct the network from significant edges.
G = nx.Graph()

row_genes = significant_edges.index
col_genes = significant_edges.columns
edge_mask = significant_edges.to_numpy()
corr_values = correlation_matrix.to_numpy()

# Keep only the strict upper triangle (k=1):
#  - the diagonal is dropped, so a gene is no longer linked to itself just because its
#    self-correlation of 1 exceeds the threshold;
#  - each symmetric pair is visited once instead of twice.
# Genes without any significant partner therefore do not appear in the graph.
rows, cols = np.nonzero(np.triu(edge_mask, k=1))

# significant_edges has the same layout as correlation_matrix, so the positions found in
# the mask index the correlation values directly (no per-edge label lookup needed).
G.add_weighted_edges_from(
    (row_genes[i], col_genes[j], corr_values[i, j]) for i, j in zip(rows, cols)
)

# Display the number of nodes and edges in the graph
print("Number of nodes:", G.number_of_nodes())
print("Number of edges:", G.number_of_edges())

In [None]:
#Step 5:
# Analyse the network
# Clusters/modules can be identified with a community detection algorithm; here the
# greedy modularity method from networkx is applied to the thresholded graph.
# NOTE(review): no `weight` argument is passed - confirm whether edge weights should be used.
detect_communities = nx.algorithms.community.greedy_modularity_communities
communities = detect_communities(G)

In [None]:
# Inspect the detected communities.
communities

In [None]:
# Initialise a new, empty undirected graph for community visualisation.
# Re-run this cell before re-running the cell that fills the graph, otherwise old content is kept.
community_graph = nx.Graph()

In [None]:
# Copy into community_graph only the edges whose two endpoints belong to the same community,
# keeping the original edge weight. Edges that cross community boundaries are left out.
for members in communities:
    for gene in members:
        community_graph.add_node(gene)
        community_graph.add_weighted_edges_from(
            (gene, partner, G[gene][partner]['weight'])
            for partner in G.neighbors(gene)
            if partner in members
        )


In [None]:
# Turn each detected community (a set of nodes) into a plain list, giving a list of lists.
separated_communities = [list(members) for members in communities]

In [None]:
# Print the communities as a list of lists of node names.
print(separated_communities)

In [None]:
# Number of communities detected in G.
len(communities)

In [None]:
#Step 6:
# Index of the community to visualise (change this to look at a different community)
community_index = 0

# Node names of the chosen community, as a list
selected_community = [gene for gene in communities[community_index]]

# Restrict G to those nodes; only edges with both endpoints in the community remain
subgraph = G.subgraph(selected_community)

In [None]:
# Report how many nodes the selected community subgraph contains
# (requires `subgraph` from the previous cell).
num_nodes = len(subgraph)
print(f"The number of nodes in the subgraph is: {num_nodes}")

In [None]:
#Step 7:
# Visualise the subgraph
# spring_layout starts from random positions; a fixed seed makes the figure identical on
# every run. You can use different layout algorithms if needed.
pos = nx.spring_layout(subgraph, k=0.8, iterations=20, seed=42)
plt.figure(figsize=(14, 12))
nx.draw(subgraph, pos, with_labels=True, node_color='skyblue', node_size=100, edge_color='gray', linewidths=0.1, font_size=8)
plt.title('Community Visualisation')
plt.show()

In [None]:
#Play around with different layouts and see how the visualisation is affected.
#For example, there is circular_layout. Check out https://networkx.org/documentation/stable/tutorial.html#drawing-graphs for more information.

In [None]:
# Visualise the subgraph with every node placed on a circle
# (other layout algorithms can be swapped in here).
draw_options = dict(
    with_labels=True,
    node_color='skyblue',
    node_size=100,
    edge_color='gray',
    linewidths=0.1,
    font_size=8,
)
pos = nx.circular_layout(subgraph)
plt.figure(figsize=(14, 12))
nx.draw(subgraph, pos, **draw_options)
plt.title('Community Visualisation')
plt.show()
#As can be seen here, the circular layout is not really suitable due to the high number of genes within the sub-network

In [None]:
#Since the subnetwork itself is too large to visualise, we shall visualise a random sample of the subnetwork.
# Select up to 10 random nodes from the original subgraph.
# - random.sample needs a sequence; the node view is set-like and is rejected by Python 3.11+.
# - Sorting gives a stable starting order, so the fixed seed reproduces the same sample.
# - min() avoids an error when the community has fewer than 10 nodes.
candidate_nodes = sorted(subgraph.nodes(), key=str)
sampler = random.Random(42)
selected_nodes = sampler.sample(candidate_nodes, min(10, len(candidate_nodes)))

# Create a new subgraph containing only the selected nodes and the edges among them
# (neighbours that were not sampled are not included).
reduced_subgraph = subgraph.subgraph(selected_nodes)

In [None]:
# Visualise the sampled sub-network with its nodes placed on a circle.
pos = nx.circular_layout(reduced_subgraph)
plt.figure(figsize=(14, 12))
nx.draw(
    reduced_subgraph,
    pos,
    with_labels=True,
    node_color='skyblue',
    node_size=100,
    edge_color='gray',
    linewidths=0.1,
    font_size=8,
)
plt.title('Community Visualisation')
plt.show()

In [None]:
#Try out a different visualisation technique with the randomly sampled subnetwork

In [None]:
# Visualise the sampled sub-network using the spectral layout (note the larger label font).
spectral_style = {
    'with_labels': True,
    'node_color': 'skyblue',
    'node_size': 100,
    'edge_color': 'gray',
    'linewidths': 0.1,
    'font_size': 10,
}
pos = nx.spectral_layout(reduced_subgraph)
plt.figure(figsize=(14, 12))
nx.draw(reduced_subgraph, pos, **spectral_style)
plt.title('Community Visualisation')
plt.show()

In [None]:
#Exercise Questions
#1. What other correlation measures could be used?
#2. What other distance measures could be used?
#3. Are there other community-based algorithms that could be used instead?
#4. Show a worked through example testing different correlation measures, distance measures and community algorithms
#and see how this affects the networks produced.

In [None]:
#Answers:
#1. Spearman Rank Correlation: Measures the strength and direction of the monotonic relationship between two ranked variables. 
#Useful for ordinal data or when the relationship isn't linear.
#Kendall Tau Correlation: Measures the association between two ranked variables.
#Suitable for small sample sizes or data with many tied ranks.

#2. Euclidean Distance: The straight-line distance between two points in Euclidean space.
#Manhattan Distance: The sum of absolute differences between the coordinates of two points.
#Minkowski Distance: A generalisation that includes both Euclidean and Manhattan distances.
#By providing a comprehensive way to quantify relationships between objects, distance matrices play a fundamental role in data analysis, pattern recognition, and various scientific research fields.

#3. The Louvain method: an efficient algorithm for community detection that optimises modularity.
#The Girvan-Newman algorithm: Detects communities by progressively removing edges with the highest betweenness centrality.


In [None]:
#4

In [None]:
#Rank-based alternative to Step 1: pairwise Spearman correlation between the columns of datExpr.
correlation_matrix2 = datExpr.corr('spearman')

In [None]:
# Inspect the Spearman correlation matrix.
correlation_matrix2

In [None]:
#Calculate distance matrix using Minkowski Distance

# First turn the Spearman correlations into dissimilarities (1 - correlation).
distance_matrix2 = correlation_matrix2.rsub(1)

# Order of the Minkowski distance (p = 2 gives Euclidean, p = 1 gives Manhattan, etc.)
p = 3  # You can change this to any value of p

# pdist compares the ROWS of its input with each other, so each gene is represented here by
# its whole row of dissimilarities. The result is a condensed (1-D) vector of pairwise distances.
minkowski_distances = pdist(distance_matrix2, 'minkowski', p=p)

# Expand the condensed vector into a symmetric square matrix
distance_matrix_minkowski = squareform(minkowski_distances)


In [None]:
# Inspect the square Minkowski distance matrix (a plain NumPy array, so it has no gene labels yet).
distance_matrix_minkowski

In [None]:
# Convert back to DataFrame to keep row and column names.
# pdist compared the rows of distance_matrix2 with each other, so BOTH axes of the square
# result follow that row index. Taking the labels from distance_matrix2 (rather than from the
# Pearson matrix of the first analysis) keeps this cell independent of the earlier steps.

row_labels = distance_matrix2.index
column_labels = distance_matrix2.index

#Convert the distance matrix into a dataframe
distance_df = pd.DataFrame(distance_matrix_minkowski, index=row_labels, columns=column_labels)

In [None]:
# Inspect the labelled Minkowski distance table.
distance_df

In [None]:
#The linkage function is used to perform hierarchical clustering on the distance matrix.
# sch.linkage() treats only 1-D input as precomputed distances; a 2-D table would be read as
# observation vectors and measured again with Euclidean distance. Condense the square table
# first so the clustering uses the Minkowski distances themselves.
condensed_minkowski = squareform(distance_df.to_numpy(), checks=False)
linkage_matrix2 = sch.linkage(condensed_minkowski, method='average')

# Plot the dendrogram
plt.figure(figsize=(10, 6))
dendrogram = sch.dendrogram(linkage_matrix2)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Gene Index')  # the leaves are the rows of the gene-gene distance table, not samples
plt.ylabel('Distance')
plt.show()

In [None]:
# Cut-off for significant edges (adjust to your requirements); same value as in the Pearson analysis
threshold = 0.7

# Boolean mask over the Spearman matrix: True where the correlation is strictly above the cut-off
significant_edges2 = correlation_matrix2.gt(threshold)

In [None]:
# Inspect the boolean edge mask derived from the Spearman correlations.
significant_edges2

In [None]:
#networkx is a python library designed for network analysis.
# Construct the network from significant edges.
G2 = nx.Graph()

row_genes2 = significant_edges2.index
col_genes2 = significant_edges2.columns
edge_mask2 = significant_edges2.to_numpy()
corr_values2 = correlation_matrix2.to_numpy()

# Only the strict upper triangle (k=1) is used: this drops the diagonal, so no gene is linked
# to itself through its self-correlation of 1, and visits each symmetric pair once.
# Genes without any significant partner therefore do not appear in the graph.
rows2, cols2 = np.nonzero(np.triu(edge_mask2, k=1))

# The mask shares its layout with correlation_matrix2, so positions index the values directly.
G2.add_weighted_edges_from(
    (row_genes2[i], col_genes2[j], corr_values2[i, j]) for i, j in zip(rows2, cols2)
)

# Display the number of nodes and edges in the graph
print("Number of nodes:", G2.number_of_nodes())
print("Number of edges:", G2.number_of_edges())

In [None]:
# Analyse the network
# For example, you can identify clusters/modules using community detection algorithms
#Connected_components is a quick and simple way to partition the graph: every group of nodes that can reach each other forms one community.
#While this method may not be as sophisticated as algorithms like Louvain or Girvan-Newman, it can provide a basic partitioning of the graph into communities.
# Components are produced in no particular order, so sort them largest first; index 0 is then
# the biggest community, matching the ordering used for `communities` in the first analysis.
communities2 = sorted(nx.connected_components(G2), key=len, reverse=True)

In [None]:
# Inspect the communities (connected components) found in G2.
communities2

In [None]:
# Initialise a new, empty undirected graph for visualising the communities of G2.
# Re-run this cell before re-running the cell that fills the graph, otherwise old content is kept.
community_graph2 = nx.Graph()

In [None]:
# Copy into community_graph2 only the edges whose two endpoints lie in the same community,
# carrying over the edge weight stored in G2.
for component in communities2:
    for gene in component:
        community_graph2.add_node(gene)
        same_component_partners = (
            partner for partner in G2.neighbors(gene) if partner in component
        )
        for partner in same_component_partners:
            community_graph2.add_edge(gene, partner, weight=G2[gene][partner]['weight'])


In [None]:
# Turn each community of the second analysis into a plain list (list of lists).
# This must read `communities2`; reading `communities` would just repeat the
# Pearson / greedy-modularity result from the first analysis.
separated_communities = [list(community) for community in communities2]

In [None]:
# Print the list of lists built in the previous cell.
print(separated_communities)

In [None]:
# Number of communities (connected components) found in G2.
len(communities2)

In [None]:
# Index of the community from the second analysis to visualise (change as needed)
community_index2 = 0

# Node names of that community, as a list
selected_community2 = [gene for gene in communities2[community_index2]]

# Restrict G2 to those nodes; only edges with both endpoints in the community remain
subgraph2 = G2.subgraph(selected_community2)

In [None]:
# Report the size of the community selected from the second analysis.
# This must count `subgraph2`; `subgraph` belongs to the first (Pearson) analysis.
num_nodes = subgraph2.number_of_nodes()
print(f"The number of nodes in the subgraph is: {num_nodes}")

In [None]:
# Visualise the subgraph from the second analysis
# spring_layout starts from random positions; a fixed seed makes the figure identical on
# every run. You can use different layout algorithms if needed.
pos = nx.spring_layout(subgraph2, k=0.8, iterations=20, seed=42)
plt.figure(figsize=(14, 12))
nx.draw(subgraph2, pos, with_labels=True, node_color='skyblue', node_size=100, edge_color='gray', linewidths=0.1, font_size=8)
plt.title('Community Visualisation')
plt.show()