In [None]:
---
title: "Developing notebook"
format: html
---

In [2]:
import os, sys, re, csv
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import pandas as pd
from scipy import optimize as opt
from scipy.integrate import odeint
from scipy.interpolate import interp1d
from scipy.stats import pearsonr
import scipy.odr as odr
import networkx as nx
import seaborn as sns


Here we read the whole matrix of Ws and try to understand the distributions.

The dimension of the matrix is:


In [9]:
# Folder holding the outputs of the run under inspection.
result_dir = "./cellbox/RNA/20230310_cyano_rna_results/seed_230309/"
# index_col=False tells pandas not to use the first CSV column as the row index.
eval_csv = os.path.join(result_dir, 'record_eval.csv')
evals = pd.read_csv(eval_csv, index_col=False)

In [11]:
# Keep only the rows logged with epoch == -1.
# NOTE(review): presumably these are the final per-replicate evaluation records - confirm.
is_final_record = evals["epoch"].eq(-1)
reps_info = evals.loc[is_final_record]

Unnamed: 0,epoch,iter,train_loss,valid_loss,train_mse,valid_mse,test_mse,time_elapsed
6606,-1,,,34.5013427734375,,31.733522415161133,,0.639109
6607,-1,,,,,,75.5876235961914,0.634421
6944,-1,,,32.10624694824219,,31.722930908203125,,0.042356
6945,-1,,,,,,75.55860137939453,0.076163
7282,-1,,,27.228490829467773,,27.138607025146484,,0.049767
7283,-1,,,,,,73.33203887939453,0.076021
7620,-1,,,26.83790779113769,,26.812124252319336,,0.045986
7621,-1,,,,,,72.87110137939453,0.066816
8768,-1,,,47.781288146972656,,47.69268798828125,,1.166981
8769,-1,,,,,,81.4090805053711,1.231221


In [None]:
# Load the weight matrix W from a header-less CSV and print its (rows, columns) shape.
# NOTE(review): `w_file` is not assigned in any cell visible above, so this cell
# raises NameError on a fresh kernel. Define the path (for example relative to
# `result_dir`) before this cell - TODO confirm which file is intended.
matrix_w = pd.read_csv(w_file, header=None)
print(matrix_w.shape)
# print(matrix_w.abs().max().max())
# print(matrix_w.abs().min().min())

In [None]:
# Blank out exact zeros so they do not show up as the smallest magnitude.
df = matrix_w.mask(matrix_w.abs() == 0, np.nan)
df_abs = df.abs()
print(df_abs.max().max())  # largest |w|
print(df_abs.min().min())  # smallest non-zero |w|
print(df.max().max())      # maximum signed w
print(df.min().min())      # minimum signed w (zeros excluded)


Plotting the histogram of all values:


In [None]:
#| label: fig_w_dist
#| fig-cap: "Distribution of Ws"
# Flatten the weight matrix into one long Series and histogram every entry.
all_weights = matrix_w.stack()
all_weights.plot.hist(bins=100);


Now, if we set an arbitrary threshold, say abs(w) >= 0.1:


In [None]:
#| label: fig_w_dist_0.1
#| fig-cap: "Distribution of Ws with abs(w) >= 0.1"
# Hide every entry with |w| < 0.1 and histogram what is left; the NaN entries
# do not contribute to the histogram. The caption states the threshold so this
# figure can be told apart from the unfiltered distribution.
df = matrix_w.mask(matrix_w.abs() < 0.1)
df.stack().plot.hist(bins=100);


Now if we increase it to 0.5:


In [None]:
#| label: fig_w_dist_0.5
#| fig-cap: "Distribution of Ws with abs(w) >= 0.5"
# Hide every entry with |w| < 0.5 and histogram what is left; the NaN entries
# do not contribute to the histogram. The caption states the threshold so this
# figure can be told apart from the unfiltered distribution.
df = matrix_w.mask(matrix_w.abs() < 0.5)
df.stack().plot.hist(bins=100);


It seems the histogram cannot show how the values are distributed, so here is what we can do:
1) transform all the values to absolute 
2) replace zeros with nan
3) log10 transform all the values
4) replot the histogram


In [None]:
#| label: fig_w_log10_dist
#| fig-cap: "Distribution of log10(abs(Ws))"
# Take magnitudes, turn exact zeros into NaN (log10(0) is -inf), then go to log10.
nonzero_abs_w = matrix_w.abs().replace(0, np.nan)
matrix_wlog10 = np.log10(nonzero_abs_w)
matrix_wlog10.stack().plot.hist(bins=100);


Now, based on the distribution of log10(abs(Ws)), we can fairly say that $10^{-3}$ is a good candidate for the cut-off. All values whose absolute value is smaller than $10^{-3}$ are set to $0$ and considered as no interaction.

Now if we plot them with their actual values after filtering:


In [None]:
# Signed, shifted log10 view of the weights that survive the 1e-3 cut-off.
# Entries with log10|w| <= -3 become NaN; the rest are shifted by +3.2 so they
# are strictly positive (> 0.2), then entries whose original weight is negative
# are mirrored onto the negative side. The 0.2 offset leaves a gap between the
# two signs on the histogram axis.
# No `.replace(0, np.nan)` on the log10 values: there 0 means |w| == 1, so that
# call would silently discard weights of exactly +/-1 (exact zeros of W are
# already NaN in matrix_wlog10).
kept_log = matrix_wlog10.where(matrix_wlog10 > -3)
matrix_w_temp = kept_log + 3.2
matrix_w_temp = matrix_w_temp.mask(matrix_w < 0, -matrix_w_temp)
ax = matrix_w_temp.stack().plot.hist(bins=100)
# Tick positions are in shifted-log units; the labels show the matching raw weights.
ax.set_xticks(
    [-3.2, -2.2, -1.2, -0.2, 0.2, 1.2, 2.2, 3.2],
    ["-1", "-0.1", "-0.01", "-0.001", "0.001", "0.01", "0.1", "1"],
    rotation=60,
);


When transformed back to the linear scale, it looks like:


In [None]:
# Build two filtered views of W, both restricted to log10|w| > -3 (|w| > 1e-3):
#   matrix_w_new_log : sign(w) * (log10|w| + 3)  -> signed log scale, 0 <=> |w| = 1e-3
#   matrix_w_new     : sign(w) * 10**log10|w|    -> back on the linear scale
# No `.replace(0, np.nan)` on the log10 values: there 0 means |w| == 1, so that
# call would silently discard weights of exactly +/-1 (exact zeros of W are
# already NaN in matrix_wlog10).
kept_log = matrix_wlog10.where(matrix_wlog10 > -3)
is_negative = matrix_w < 0

shifted_log = kept_log + 3.0
matrix_w_new_log = shifted_log.mask(is_negative, -shifted_log)

magnitude = 10 ** kept_log
matrix_w_new = magnitude.mask(is_negative, -magnitude)
matrix_w_new.stack().plot.hist(bins=100);


Now we construct the network to see the structure of the model:

First, we plot the heatmaps of the interactions: 


In [None]:
#| label: fig_w_log10_heatmap
#| fig-cap: "Heatmap of Ws"
# matrix_w_new_log holds sign(w) * (log10|w| + 3), so colorbar tick k maps to
# |w| = 10**(k - 3). Tick 0 is therefore |w| = 0.001 (the cut-off), not w = 0,
# and is labelled accordingly.
ax = sns.heatmap(
    matrix_w_new_log,
    cmap="coolwarm",
    center=0,
    cbar_kws={'ticks': [-3, -2, -1, 0, 1, 2, 3]},
)
ax.collections[0].colorbar.set_ticklabels(["-1", "-0.1", "-0.01", "±0.001", "0.01", "0.1", "1"]);


If we color them using linear scale values:


In [None]:
#| label: fig_w_heatmap
#| fig-cap: "Heatmap of Ws in linear scale"
# Diverging colormap centred on 0 so positive and negative weights get opposite hues.
sns.heatmap(data=matrix_w_new, center=0, cmap="coolwarm");


And if we set the threshold as 0.01


In [None]:
#| label: fig_w_log10_heatmap_0.01
#| fig-cap: "Heatmap of Ws in log10 scale (abs(w) >= 0.01)"
# matrix_w_new_log holds sign(w) * (log10|w| + 3), so |value| < 1 <=> |w| < 0.01.
df = matrix_w_new_log.mask(matrix_w_new_log.abs() < 1)
ax = sns.heatmap(
    df,
    cmap="coolwarm",
    center=0,
    cbar_kws={'ticks': [-3, -2, -1, 0, 1, 2, 3]},
)
# Tick 0 on the shifted-log axis corresponds to |w| = 0.001, not to w = 0.
ax.collections[0].colorbar.set_ticklabels(["-1", "-0.1", "-0.01", "±0.001", "0.01", "0.1", "1"]);


In [None]:
#| label: fig_w_heatmap_0.01
#| fig-cap: "Heatmap of Ws in linear scale (abs(w) >= 0.01)"
# Hide entries with |w| < 0.01; the caption states the threshold so the figure
# can be told apart from the other linear-scale heatmaps.
df = matrix_w_new.mask(matrix_w_new.abs() < 0.01)
sns.heatmap(df, cmap="coolwarm", center=0);


What about 0.1?


In [None]:
#| label: fig_w_log10_heatmap_0.1
#| fig-cap: "Heatmap of Ws in log10 scale (abs(w) >= 0.1)"
# matrix_w_new_log holds sign(w) * (log10|w| + 3), so |value| < 2 <=> |w| < 0.1.
df = matrix_w_new_log.mask(matrix_w_new_log.abs() < 2)
ax = sns.heatmap(
    df,
    cmap="coolwarm",
    center=0,
    cbar_kws={'ticks': [-3, -2, -1, 0, 1, 2, 3]},
)
# Tick 0 on the shifted-log axis corresponds to |w| = 0.001, not to w = 0.
ax.collections[0].colorbar.set_ticklabels(["-1", "-0.1", "-0.01", "±0.001", "0.01", "0.1", "1"]);


In [None]:
#| label: fig_w_heatmap_0.1
#| fig-cap: "Heatmap of Ws in linear scale (abs(w) >= 0.1)"
# Hide entries with |w| < 0.1; the caption states the threshold so the figure
# can be told apart from the other linear-scale heatmaps.
df = matrix_w_new.mask(matrix_w_new.abs() < 0.1)
sns.heatmap(df, cmap="coolwarm", center=0);


In [None]:
# Largest and smallest magnitudes that survive the 1e-3 cut-off.
abs_w_new = matrix_w_new.abs()
print(abs_w_new.max().max())
print(abs_w_new.min().min())


The heatmap shows that the perturbations and phenotypes are connected with molecular signatures, which is why there are missing values between the perturbations and the phenotypes.

Next, we construct the network to see what subnetworks are generated.


In [None]:
# Zero out weak interactions at three cut-offs (1e-3, 1e-2, 1e-1). Entries at or
# above the cut-off keep their signed value.
df = matrix_w.mask(matrix_w.abs() < 0.001, 0)
df001 = matrix_w.mask(matrix_w.abs() < 0.01, 0)
df01 = matrix_w.mask(matrix_w.abs() < 0.1, 0)

# Interpret each filtered matrix as the adjacency matrix of a directed graph.
G = nx.from_numpy_array(df.to_numpy(), create_using=nx.DiGraph)
G001 = nx.from_numpy_array(df001.to_numpy(), create_using=nx.DiGraph)
G01 = nx.from_numpy_array(df01.to_numpy(), create_using=nx.DiGraph)


Calculate the strongly connected components of the differently filtered networks.


In [None]:
def _scc_sizes(graph):
    """Return the sizes of the strongly connected components of `graph`, largest first."""
    return sorted((len(component) for component in nx.strongly_connected_components(graph)), reverse=True)


G_comp = _scc_sizes(G)
G01_comp = _scc_sizes(G01)
G001_comp = _scc_sizes(G001)

# Report from the loosest to the strictest threshold.
reports = [
    ("Network with 1e-3 threshold has", G_comp),
    ("Network with 1e-2 threshold has", G001_comp),
    ("Network with 1e-1 threshold has", G01_comp),
]
for header, sizes in reports:
    print(header, len(sizes), " strongly connected components. And their sizes are:")
    print(sizes)


##### Visualization of these graphs

TODO: Drop all zeros


In [None]:
# Size of the leading square block of the weight matrix (rows and columns
# [0, nodes_num)) that the drawing cells below turn into a graph.
# NOTE(review): 157 is a hard-coded count - confirm what the first 157 nodes represent.
nodes_num = 157


Networks of 0.001


In [None]:
# Zero out |w| < 0.001, keep the leading nodes_num x nodes_num block, and draw
# the resulting directed graph with its nodes placed on a circle.
# Note: this rebinds the notebook-level names `df`, `G` and `pos`.
df = matrix_w.mask(matrix_w.abs() < 0.001, 0)
sub_adjacency = df.iloc[:nodes_num, :nodes_num].to_numpy()
G = nx.from_numpy_array(sub_adjacency, create_using=nx.DiGraph)
pos = nx.circular_layout(G)
nx.draw(G, pos=pos)


Networks of 0.01


In [None]:
# Zero out |w| < 0.01, keep the leading nodes_num x nodes_num block, and draw
# the resulting directed graph with its nodes placed on a circle.
# Note: this rebinds the notebook-level names `df`, `G` and `pos`.
df = matrix_w.mask(matrix_w.abs() < 0.01, 0)
sub_adjacency = df.iloc[:nodes_num, :nodes_num].to_numpy()
G = nx.from_numpy_array(sub_adjacency, create_using=nx.DiGraph)
pos = nx.circular_layout(G)
nx.draw(G, pos=pos)


Networks of 0.1


In [None]:
# Zero out |w| < 0.1, keep the leading nodes_num x nodes_num block, and draw
# the resulting directed graph with its nodes placed on a circle.
# Note: this rebinds the notebook-level names `df`, `G` and `pos`.
df = matrix_w.mask(matrix_w.abs() < 0.1, 0)
sub_adjacency = df.iloc[:nodes_num, :nodes_num].to_numpy()
G = nx.from_numpy_array(sub_adjacency, create_using=nx.DiGraph)
pos = nx.circular_layout(G)
nx.draw(G, pos=pos)


What matters more now is whether CellBox has produced a good enough model; we need to check the loss function.


In [None]:
# Display whatever `df` currently holds - after the preceding cell, the copy of
# matrix_w with entries of |w| < 0.1 set to 0.
# NOTE(review): this does not inspect the loss function mentioned above; looks like a leftover.
df
