In [1]:
import numpy as np
from clock_project.simulation.magnitude_quantification import calculate_non_stationarity, calculate_ENS, random_nucleotide_distribution, calculate_information, entropy_calculation
from clock_project.maths.evolutionary_rate import calculate_stationary_distribution

from cogent3.maths.measure import jsd
import json
from clock_project.genome_analysis.yapeng_check_BV import get_bounds_violation, load_param_values
import os
import glob
from cogent3 import get_app
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from cogent3.util.deserialise import deserialise_object
import scipy

# cogent3 "load_json" app, built once and reused to read every model-fitting result file.
load_json_app = get_app("load_json")
# Checker object; its .main(result) is called per fitted model and the returned
# object's .vio attribute is inspected to decide whether the triad is kept.
bounary_violation_function = get_bounds_violation()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Map each alignment file stem to the first value returned by the alignment's get_lengths().
# NOTE(review): the directory is an absolute, machine-specific path.
alignment_length_dict = {}
alignment_dir = '/Users/gulugulu/repos/PuningAnalysis/data/ensembl_ortholog_sequences/homologies_alignment_common_name_350_threshold'
gene_paths = glob.glob(os.path.join(alignment_dir, '*.json'))
for gene_path in gene_paths:
    file_name = os.path.basename(gene_path).rsplit('.', 1)[0]
    # Context manager closes the handle straight away instead of leaving one open per gene.
    with open(gene_path, 'r') as alignment_file:
        alignment = deserialise_object(json.load(alignment_file))
    alignment_length = alignment.get_lengths()[0]
    alignment_length_dict[file_name] = alignment_length



In [3]:
# For every gene folder, load each model-fitting result, record how close the fitted
# parameters sit to their bounds, and keep the identifiers of the triads for which the
# bounds-violation checker reports nothing.
# NOTE(review): base_dir is an absolute, machine-specific path.
base_dir = '/Users/gulugulu/Desktop/honours/data_local/whole_genome_mammal87/triads_model_fitting_350_threshold'
gene_paths = glob.glob(os.path.join(base_dir, '*/'))
# Parameter names skipped when recording proximity to the bounds (loop-invariant).
exclude_params = ("length", "mprobs")
valid_triads_identifier_dict = {}
for gene_dir in gene_paths:
    file_name = os.path.basename(gene_dir.rstrip('/'))
    model_fitting_result_dir = os.path.join(gene_dir, 'model_fitting_result')
    model_fitting_results_paths = glob.glob(os.path.join(model_fitting_result_dir, '*.json'))
    # Reset per gene, so only the last gene's proximities remain after the loop.
    parameter_proximities = {'proximity_lower': [], 'proximity_upper': [], 'ens': []}
    valid_triads_identifier = []
    # Distinct loop names: the original reused `path` and `param`, shadowing the
    # outer loop variable and the loaded parameter object.
    for result_path in model_fitting_results_paths:
        identifier = os.path.basename(result_path).rsplit('.', 1)[0]
        model_fitting_result = load_json_app(result_path)
        param_values = load_param_values(model_fitting_result)
        list_of_params = param_values.params
        ens_list = model_fitting_result.get_lengths_as_ens()
        for param_entry in list_of_params:
            if param_entry["par_name"] in exclude_params:
                continue
            proximity_lower = abs(param_entry["init"] - param_entry["lower"])
            proximity_upper = abs(param_entry["init"] - param_entry["upper"])
            ens = ens_list[param_entry['edge']]
            parameter_proximities['ens'].append(ens)
            parameter_proximities['proximity_lower'].append(proximity_lower)
            parameter_proximities['proximity_upper'].append(proximity_upper)
        bounary_violation_check = bounary_violation_function.main(model_fitting_result)
        # An empty .vio list means the checker found no violation for this triad.
        if bounary_violation_check.vio == []:
            valid_triads_identifier.append(identifier)
    valid_triads_identifier_dict[file_name] = valid_triads_identifier


Independent variables - evolution time

In [4]:
# Ten evenly spaced evolution times from 0 to 2 (both ends included); this is the
# time grid swept for every matrix pair in compute_aginst_Q.
t_range = np.linspace(0,2,10)

Independent variable - information of the initial nucleotide distribution

In [5]:
import os
import json

def extract_internal_root_distribution(base_path):
    """Collect the internal-root nucleotide distribution of every binned triad, per gene.

    Each sub-directory of ``base_path`` is treated as one gene. Within it, up to three
    bin files are read if present; for every non-empty bin the triad's
    ``internal_root_distribution`` is stored under ``str(identifier)``.

    Args:
        base_path: Directory containing one folder per gene ID.

    Returns:
        dict mapping gene ID -> {identifier string -> internal root distribution}.
        A gene folder with no usable bins maps to an empty dict.
    """
    bin_file_names = ('ens_diff_bins.json', 'jsd_bins.json', 'shorest_ens_bins.json')
    all_data = {}

    for gene_id in os.listdir(base_path):
        gene_path = os.path.join(base_path, gene_id)
        if not os.path.isdir(gene_path):
            # Plain files sitting next to the gene folders are ignored.
            continue

        internal_root_distribution = {}
        for bin_file_name in bin_file_names:
            file_path = os.path.join(gene_path, bin_file_name)
            if not os.path.exists(file_path):
                continue
            with open(file_path, 'r') as handle:
                bins = json.load(handle).get('bins', {})
            for content in bins.values():
                if not content:
                    # Empty bins carry no triad.
                    continue
                identifier = content['identifier']
                nuc_freqs_dict = content['triads_info_big_tree']['nuc_freqs_dict']
                # The same identifier appearing in a later file overwrites the earlier entry.
                internal_root_distribution[str(identifier)] = nuc_freqs_dict["internal_root_distribution"]

        all_data[gene_id] = internal_root_distribution

    return all_data

# Base directory containing one folder per gene ID.
# NOTE(review): absolute, machine-specific path.
base_path = '/Users/gulugulu/Desktop/honours/data_local/whole_genome_mammal87/triads_350_threshold'
result = extract_internal_root_distribution(base_path)

# Save the collected distributions as JSON inside base_path. The extraction only walks
# sub-directories of base_path, so this file is not picked up as a gene on later runs.
output_file = os.path.join(base_path, 'internal_root_distributions.json')
with open(output_file, 'w') as f:
    json.dump(result, f, indent=4)

In [6]:
# Value returned by calculate_information for every internal-root distribution,
# keyed first by gene ID and then by triad identifier.
information_dict = {
    gene_id: {
        identifier: calculate_information(distribution)
        for identifier, distribution in distributions_info.items()
    }
    for gene_id, distributions_info in result.items()
}




In [7]:
# Flatten the per-gene mapping into a single list of information values.
information_list = [
    value
    for per_gene_information in information_dict.values()
    for value in per_gene_information.values()
]


In [8]:
# Width of one information bin; defined first so the bin count is derived from it.
bin_size = 0.0065
# One empty slot (None) per bin index, covering information values from 0 up to 0.65.
information_bin = dict.fromkeys(range(int(0.65 / bin_size)))

In [9]:
# Pass 1: put the first triad encountered into each information bin.
# NOTE(review): an information value outside the pre-built bin range (negative, or at or
# above the upper bound used to build information_bin) raises KeyError here.
for gene_id, all_information in information_dict.items():
    for identifier, information in all_information.items():
        bin_index = int(information // bin_size)
        # Identity check is the idiomatic way to test for the None placeholder.
        if information_bin[bin_index] is None:
            information_bin[bin_index] = {'index': (gene_id, identifier, information)}

# Pass 2: attach the matching internal-root distribution to every filled bin.
for bin_index, info in information_bin.items():
    if info is not None:
        gene_id, identifier, _ = info['index']
        initial_distribution = result[gene_id][identifier]
        information_bin[bin_index]['distribution'] = initial_distribution

    

Confound (controlled) variable - substitution rate matrix

In [10]:
# For every valid triad, collect the two ingroup rate matrices (each multiplied by that
# ingroup's ENS value) together with the ENS values themselves.
#   matrices_dict : gene name -> list of {species: scaled matrix} pairs
#   matrices_list : the same pairs, flattened across genes
#   all_ens_list  : {species: ENS} pairs, index-aligned with matrices_list
matrices_dict = {}
matrices_list = []
all_ens_list = []
for gene_name, valid_triads_identifier in valid_triads_identifier_dict.items():
    triads_info_dir = os.path.join(base_dir, gene_name, 'triads_info_dict.json')
    # Context manager so the handle is closed after each gene.
    with open(triads_info_dir, 'r') as triads_info_file:
        triads_info_dict = json.load(triads_info_file)
    matrix_pairs = []
    ens_pairs = []
    for identifier in valid_triads_identifier:
        triad_info = triads_info_dict[identifier]
        triads_species_name = triad_info['triads_species_names']
        ens_dict = triad_info['triads_info_small_tree']['ens']
        matrices = triad_info['triads_info_small_tree']['matrices']
        # Insertion order (ingroup1, then ingroup2) is what downstream code relies on
        # when it takes the first and second dict values.
        ingroup_names = (triads_species_name['ingroup1'], triads_species_name['ingroup2'])
        matrix_pairs.append({name: np.array(matrices[name]) * ens_dict[name] for name in ingroup_names})
        ens_pairs.append({name: ens_dict[name] for name in ingroup_names})
    all_ens_list.extend(ens_pairs)
    matrices_list.extend(matrix_pairs)
    matrices_dict[gene_name] = matrix_pairs

len(matrices_list)

2671

In [11]:
# Remove (and display) the matrix pair at position 1738; the output below shows it is the
# Megabat/Goat pair. NOTE(review): presumably dropped as an outlier — confirm.
# list.pop mutates in place, so re-running this cell removes a further entry; the entry at
# the same position is popped from all_ens_list in the following cell to keep both aligned.
matrices_list.pop(1738)

{'Megabat': array([[-59.81650541,  50.66196725,   3.07873059,   6.07580757],
        [ 36.65484988, -47.20613229,   8.5359179 ,   2.01536451],
        [  8.75116483,   5.03487332, -74.10248108,  60.31644294],
        [  2.85756663,   7.44404106,  30.41730721, -40.71891491]]),
 'Goat': array([[-4.78519511e-03,  3.39058132e-03,  1.32680216e-03,
          6.78116305e-05],
        [ 3.17629842e-03, -5.61815618e-03,  6.39378945e-05,
          2.37791986e-03],
        [ 7.42983000e-11,  9.87449499e-10, -2.49657169e-03,
          2.49657063e-03],
        [ 8.22296332e-11,  1.12726153e-03,  1.81232000e-03,
         -2.93958161e-03]])}

In [12]:
# Remove (and display) the ENS pair at position 1738 so all_ens_list stays index-aligned
# with matrices_list after the pop in the previous cell. Not idempotent: running this cell
# a different number of times than the previous one misaligns the two lists.
all_ens_list.pop(1738)

{'Megabat': 51.2979207770548, 'Goat': 0.004176598217943427}

Dependent variable - ENS difference

In [13]:
def get_ens_diff_log_ratio(pi, Q1, Q2, t):
    """Return the natural log of ENS(Q1) / ENS(Q2).

    Both ENS values come from ``calculate_ENS`` called with the same ``pi`` and ``t``.
    """
    ens_first, ens_second = (calculate_ENS(pi, Q, t) for Q in (Q1, Q2))
    return np.log(ens_first / ens_second)

def get_ens_abs_diff(pi, Q1, Q2, t):
    """Return |ENS(Q1) - ENS(Q2)|, each ENS computed by ``calculate_ENS`` with the same ``pi`` and ``t``."""
    return abs(calculate_ENS(pi, Q1, t) - calculate_ENS(pi, Q2, t))

Dependent variable - Nabla difference

In [14]:
def get_nabla_diff_log_ratio(pi, Q1, Q2, t, ens_o_1, ens_o_2):
    """Return the natural log of the ratio of the two scaled non-stationarity values.

    Each value is ``calculate_non_stationarity(pi, Q, t)`` divided by the ENS value
    supplied for that matrix (``ens_o_1`` for ``Q1``, ``ens_o_2`` for ``Q2``).
    """
    scaled_nablas = [
        calculate_non_stationarity(pi, Q, t) / ens_o
        for Q, ens_o in ((Q1, ens_o_1), (Q2, ens_o_2))
    ]
    return np.log(scaled_nablas[0] / scaled_nablas[1])


Dependent variable - in-group JSD and JSD difference

In [15]:
import scipy.linalg
import scipy.stats

def get_ingroup_jsd(pi, Q1, Q2, t):
    """Return the JSD between the two distributions obtained by evolving ``pi`` for time ``t``.

    Each evolved distribution is ``pi`` multiplied by the matrix exponential of ``Q * t``.
    """
    evolved_1, evolved_2 = (np.dot(pi, scipy.linalg.expm(Q * t)) for Q in (Q1, Q2))
    return jsd(evolved_1, evolved_2)


def get_jsd_difference(pi, Q1, Q2, t):
    """Return |JSD(pi evolved under Q1, pi) - JSD(pi evolved under Q2, pi)|.

    Each evolved distribution is ``pi`` multiplied by the matrix exponential of ``Q * t``.
    """
    divergences = [jsd(np.dot(pi, scipy.linalg.expm(Q * t)), pi) for Q in (Q1, Q2)]
    return abs(divergences[0] - divergences[1])

Confounding variable - initial nucleotide distribution

In [442]:
# Fixed initial nucleotide distributions. The number above each one is the author's label.
# NOTE(review): presumably the information value of that distribution — confirm.

#0
pi0 = [0.25, 0.25, 0.25, 0.25]

#0.02
pi1 = [0.22195243534512787,
    0.3100037065350664,
    0.19400175596722694,
    0.27404210215257885
]

#0.14
pi2 = [0.18737214745678638,
    0.3475654097096265,
    0.10013048487375038,
    0.36493195795983835
]

#0.47
pi3 = [0.07780196872922861,
    0.48601378500013337,
    0.045839682353676546,
    0.39034456391696143
]
#0.62
# FIXME(review): these four entries sum to about 1.35, so pi4 is not a valid probability
# distribution as written. Values are left untouched; check the source before using pi4.
pi4 = [0.414557741723811,
    0.5836667594641209,
    0.010753487466864692,
    0.3414341756517766
]

#0.27
pi5 = [0.05, 0.55, 0.05, 0.35]


3D Density Plot

In [443]:
def bar_data(position3d, size=(1,1,1)):
    """Return the 8 corner vertices of an axis-aligned box.

    Args:
        position3d: (x, y, z) of the box corner with the smallest coordinates.
        size: Extent of the box along x, y and z.

    Returns:
        Float array of shape (8, 3): the four bottom corners, then the four top corners.
    """
    unit_cube = np.array([[0, 0, 0], [1, 0, 0], [1, 1, 0], [0, 1, 0],
                          [0, 0, 1], [1, 0, 1], [1, 1, 1], [0, 1, 1]], dtype=float)
    # Scale the unit cube to the requested size, then translate it to the position.
    return unit_cube * np.array(size) + np.array(position3d)

def triangulate_bar_faces(positions, sizes):
    """Build a shared vertex array and triangle index lists for a set of 3D bars.

    Args:
        positions: Iterable of (x, y, z) bar positions, passed to ``bar_data``.
        sizes: Iterable of (dx, dy, dz) bar sizes, paired with ``positions``.

    Returns:
        (vertices, I, J, K): ``vertices`` holds the unique corner coordinates of all bars;
        ``I``, ``J``, ``K`` are equal-length lists of vertex indices, 12 triangles per bar,
        in the form expected by the i/j/k arguments of a Mesh3d trace.
    """
    all_bars = [bar_data(pos, size) for pos, size in zip(positions, sizes)]
    # Corners shared by neighbouring bars are merged; ixr maps every original corner row
    # to its row in `vertices`.
    vertices, ixr = np.unique(np.vstack(all_bars), return_inverse=True, axis=0)
    # The shape of the inverse returned with `axis=` has differed between NumPy releases;
    # flatten it so the 8-per-bar slicing below always sees a 1-D index array.
    ixr = np.ravel(ixr)

    I, J, K = [], [], []
    for k in range(len(all_bars)):
        indices = ixr[k * 8:(k + 1) * 8]
        # Corner triples (by local corner number) for the 12 triangles of one bar.
        I.extend(indices[[0, 2, 0, 5, 0, 7, 5, 2, 3, 6, 7, 5]])
        J.extend(indices[[1, 3, 4, 1, 3, 4, 1, 6, 7, 2, 4, 6]])
        K.extend(indices[[2, 0, 5, 0, 7, 0, 2, 5, 6, 3, 5, 7]])
    return vertices, I, J, K

def get_plotly_mesh3d(x, y, bins=[10, 10], bargap=0.1):
    """Turn paired samples into the vertex/index arrays of a 3D histogram (bar) mesh.

    Args:
        x, y: Equal-length 1-D samples, binned with ``np.histogram2d``.
        bins: Bin specification forwarded to ``np.histogram2d``. The default list is
            never mutated here.
        bargap: Gap left between neighbouring bars, in the same units as the data.
            NOTE(review): if ``bargap`` exceeds the bin width the bar footprint becomes
            negative; callers should keep it below the bin width.

    Returns:
        (X, Y, Z, I, J, K): vertex coordinate arrays and triangle index lists suitable
        for ``go.Mesh3d(x=X, y=Y, z=Z, i=I, j=J, k=K)``. Bar height is the bin count.
    """
    hist, xedges, yedges = np.histogram2d(x, y, bins=bins)
    # bar_data treats a position as the bar's lower corner, so anchor each bar half a gap
    # inside its bin's lower edge. The bar (bin width minus bargap) is then centred in its
    # bin; anchoring at the bin centre would shift every bar by half a bin.
    xpos, ypos = np.meshgrid(xedges[:-1] + bargap / 2,
                             yedges[:-1] + bargap / 2, indexing="ij")

    positions = np.column_stack([xpos.ravel(), ypos.ravel(), np.zeros(xpos.size)])
    # Footprint uses the first bin's width for every bar (bins are assumed uniform).
    sizes = np.column_stack([np.full(xpos.size, xedges[1] - xedges[0] - bargap),
                             np.full(ypos.size, yedges[1] - yedges[0] - bargap),
                             hist.ravel()])

    vertices, I, J, K = triangulate_bar_faces(positions, sizes)
    return vertices[:, 0], vertices[:, 1], vertices[:, 2], I, J, K



Create dataframe

In [444]:
def compute_aginst_Q(matrices_list, all_ens_list, pi, t_range):
    """Evaluate every divergence metric for each matrix pair at each time in ``t_range``.

    Args:
        matrices_list: List of dicts; the first two values of each dict are used as Q1, Q2.
        all_ens_list: List of dicts index-aligned with ``matrices_list``; the first two
            values of each dict are the ENS values used to scale the nabla metric.
        pi: Initial nucleotide distribution shared by all evaluations.
        t_range: Iterable of evolution times.

    Returns:
        DataFrame with one row per (matrix pair, time), ordered pair-major. Note that
        'ENS_abs_difference', 'Ingroup_JSD' and 'JSD_difference' hold the square roots
        of the respective metrics.
    """
    column_names = ['Matrix_ID', 'Time', 'ENS_difference', 'ENS_abs_difference', 'Nabla_difference', 'Ingroup_JSD', 'JSD_difference']
    rows = []
    for matrix_id, matrix_pair in enumerate(matrices_list):
        q_pair = list(matrix_pair.values())
        ens_pair = list(all_ens_list[matrix_id].values())
        Q1, Q2 = q_pair[0], q_pair[1]
        ens_o_1, ens_o_2 = ens_pair[0], ens_pair[1]

        for t in t_range:
            ingroup_jsd = get_ingroup_jsd(pi, Q1, Q2, t)
            jsd_gap = get_jsd_difference(pi, Q1, Q2, t)
            ens_gap = get_ens_abs_diff(pi, Q1, Q2, t)
            ens_log_ratio = get_ens_diff_log_ratio(pi, Q1, Q2, t)
            nabla_log_ratio = get_nabla_diff_log_ratio(pi, Q1, Q2, t, ens_o_1, ens_o_2)
            rows.append((
                matrix_id, t, ens_log_ratio, np.sqrt(ens_gap),
                nabla_log_ratio, np.sqrt(ingroup_jsd), np.sqrt(jsd_gap),
            ))

    return pd.DataFrame(rows, columns=column_names)

# Sweep all matrix pairs over t_range twice: once starting from pi0 ("low") and once
# from pi5 ("high").
df_Q_low = compute_aginst_Q(matrices_list, all_ens_list, pi0, t_range)
df_Q_high = compute_aginst_Q(matrices_list, all_ens_list, pi5, t_range)


In [445]:
# Four evaluation times; the 2x2 time-grouped scatter figures draw one panel per time.
time_point_list = [0.5, 1, 1.5, 2]
def compute_against_t(time_interval, matrices_list, ens_list, pi0):
    """Evaluate every divergence metric at each time for all matrix pairs.

    Args:
        time_interval: Iterable of evolution times.
        matrices_list: List of dicts; the first two values of each dict are used as Q1, Q2.
        ens_list: List of dicts index-aligned with ``matrices_list``; the first two values
            of each dict are the ENS values used to scale the nabla metric.
        pi0: Initial nucleotide distribution shared by all evaluations.

    Returns:
        DataFrame with one row per (time, matrix pair), ordered time-major. Note that
        'ENS_abs_difference', 'Ingroup_JSD' and 'JSD_difference' hold the square roots
        of the respective metrics.
    """
    column_names = ['Matrix_ID', 'Time', 'ENS_difference', 'ENS_abs_difference', 'Nabla_difference', 'Ingroup_JSD', 'JSD_difference']
    rows = []
    for t in time_interval:
        pi = pi0
        for matrix_id, matrix_pair in enumerate(matrices_list):
            q_pair = list(matrix_pair.values())
            Q1, Q2 = q_pair[0], q_pair[1]
            ens_pair = list(ens_list[matrix_id].values())
            ens_o_1, ens_o_2 = ens_pair[0], ens_pair[1]

            ingroup_jsd = get_ingroup_jsd(pi, Q1, Q2, t)
            jsd_gap = get_jsd_difference(pi, Q1, Q2, t)
            ens_gap = get_ens_abs_diff(pi, Q1, Q2, t)
            ens_log_ratio = get_ens_diff_log_ratio(pi, Q1, Q2, t)
            nabla_log_ratio = get_nabla_diff_log_ratio(pi, Q1, Q2, t, ens_o_1, ens_o_2)
            rows.append((
                matrix_id, t, ens_log_ratio, np.sqrt(ens_gap),
                nabla_log_ratio, np.sqrt(ingroup_jsd), np.sqrt(jsd_gap),
            ))

    return pd.DataFrame(rows, columns=column_names)


# Evaluate all matrix pairs at the four fixed time points, once from pi0 ("low") and once
# from pi5 ("high").
df_t_low = compute_against_t(time_point_list, matrices_list, all_ens_list, pi0)
df_t_high = compute_against_t(time_point_list, matrices_list, all_ens_list, pi5)

Plotting function

In [446]:
def process_data_for_plot(df, x_col, y_col):
    """Prepare the inputs for the 3D histogram mesh and the 2D density scatter.

    Args:
        df: Metrics DataFrame; it is not modified.
        x_col, y_col: Names of the two columns to plot against each other.

    Returns:
        (X, Y, Z, I, J, K, density_data): mesh vertex arrays and triangle indices from
        ``get_plotly_mesh3d`` (20x20 bins on the unrounded values), plus a DataFrame with
        columns ``x_col``, ``y_col`` and 'Density' counting the rows that fall on each
        pair of values after rounding to one decimal.
    """
    # Rows with a NaN in any column are discarded, not only those in x_col / y_col.
    complete_rows = df.dropna()

    # Unrounded values drive the 3D histogram.
    x = complete_rows[x_col].to_numpy()
    y = complete_rows[y_col].to_numpy()

    # Build the rounded frame with assign() instead of writing into the dropna() result;
    # that in-place write is what raised SettingWithCopyWarning.
    rounded = complete_rows.assign(**{
        x_col: complete_rows[x_col].round(1),
        y_col: complete_rows[y_col].round(1),
    })

    # Count how many rows share each rounded (x, y) pair.
    density_data = rounded.groupby([x_col, y_col]).size().reset_index(name='Density')

    X, Y, Z, I, J, K = get_plotly_mesh3d(x, y, bins=[20, 20], bargap=0.05)

    return X, Y, Z, I, J, K, density_data

In [447]:
def create_3d_density_plots(X1, Y1, Z1, I1, J1, K1, X2, Y2, Z2, I2, J2, K2):
    """Return a 1x2 figure with two Mesh3d histograms sharing one colour axis.

    The first six arguments describe the left mesh ('Low Information'), the last six the
    right mesh ('High Information'); each group is vertex x/y/z plus triangle i/j/k.
    """
    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{'type': 'mesh3d'}, {'type': 'mesh3d'}]],
        horizontal_spacing=0,
        subplot_titles=('Low Information', 'High Information'),
    )

    # Settings common to both meshes: fully opaque, attached to the shared colour axis.
    shared_style = dict(colorscale='Viridis', opacity=1, coloraxis="coloraxis")

    # Vertices are coloured by their z value, i.e. by bar height.
    left_mesh = go.Mesh3d(
        x=X1, y=Y1, z=Z1, i=I1, j=J1, k=K1,
        intensity=Z1, showscale=False, **shared_style,
    )
    right_mesh = go.Mesh3d(
        x=X2, y=Y2, z=Z2, i=I2, j=J2, k=K2,
        intensity=Z2, **shared_style,
    )

    fig.add_trace(left_mesh, row=1, col=1)
    fig.add_trace(right_mesh, row=1, col=2)
    return fig

In [448]:
def create_2d_density_plots(density_data1, density_data2, x_col, y_col):
    """Return a 1x2 figure of scatter plots whose markers are coloured by 'Density'.

    Args:
        density_data1: DataFrame for the left panel ('Low Information').
        density_data2: DataFrame for the right panel ('High Information').
        x_col, y_col: Columns plotted on the x and y axes. Both frames must also have a
            'Density' column.
    """
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Low Information', 'High Information'),
        horizontal_spacing=0.05,
    )

    for subplot_col, density_data in enumerate((density_data1, density_data2), start=1):
        # Ascending sort draws the densest points last, i.e. on top.
        ordered = density_data.sort_values(by='Density', ascending=True)
        marker_style = dict(
            size=8,
            color=ordered['Density'],
            colorscale='Viridis',
            coloraxis="coloraxis",
            opacity=1,
            showscale=True,
        )
        fig.add_trace(
            go.Scatter(x=ordered[x_col], y=ordered[y_col], mode='markers', marker=marker_style),
            row=1, col=subplot_col,
        )

    return fig

In [449]:
import statsmodels.api as sm

def plot_time_grouped_scatter_2x2(df, x_col, y_col):
    """Return a 2x2 figure with one scatter panel per distinct 'Time' value.

    Each panel shows ``y_col`` against ``x_col`` for one time, an OLS trend line (with
    intercept) in red, and a boxed R² annotation in the top-right corner. Only the first
    four sorted time values get a panel.
    """
    times = sorted(df['Time'].unique())

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[f'Time = {time}' for time in times],
        horizontal_spacing=0.05, vertical_spacing=0.2,
    )

    grid_cells = [(1, 1), (1, 2), (2, 1), (2, 2)]
    for (panel_row, panel_col), time in zip(grid_cells, times):
        at_time = df[df['Time'] == time]
        x_data, y_data = at_time[x_col], at_time[y_col]

        points = go.Scatter(
            x=x_data, y=y_data,
            mode='markers',
            name=f'Time {time}',
            marker=dict(size=2),
        )
        fig.add_trace(points, row=panel_row, col=panel_col)

        # Ordinary least squares of y on x; add_constant supplies the intercept column.
        design = sm.add_constant(x_data)
        model = sm.OLS(y_data, design).fit()

        fit_line = go.Scatter(
            x=x_data, y=model.predict(design),
            mode='lines',
            line=dict(color='red'),
        )
        fig.add_trace(fit_line, row=panel_row, col=panel_col)

        # R² box placed in axis-domain coordinates so it sits in the same corner of every panel.
        fig.add_annotation(
            xref="x domain", yref="y domain",
            x=0.95, y=0.95,
            text=f"R² = {model.rsquared:.2f}",
            showarrow=False,
            font=dict(size=10, color="red"),
            align="right",
            ax=0, ay=0,
            bordercolor="black",
            borderwidth=1,
            borderpad=4,
            bgcolor="white",
            opacity=0.8,
            row=panel_row, col=panel_col,
        )

    fig.update_layout(
        template='plotly_white',
        showlegend=False,
        margin=dict(l=20, r=20, t=50, b=20),
        autosize=True,
    )

    return fig



Nabla log ratio vs ENS log ratio

In [450]:
# Mesh and density inputs for Nabla vs ENS log ratios: set 1 from the pi0 ("low") sweep,
# set 2 from the pi5 ("high") sweep.
X1, Y1, Z1, I1, J1, K1, density_data_1= process_data_for_plot(df_Q_low, 'Nabla_difference', 'ENS_difference')
X2, Y2, Z2, I2, J2, K2, density_data_2 = process_data_for_plot(df_Q_high, 'Nabla_difference', 'ENS_difference')




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [451]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig_density_3d_Q = create_3d_density_plots(
    X1, Y1, Z1, I1, J1, K1,
    X2, Y2, Z2, I2, J2, K2,
)

# Figure-wide layout: title, canvas size, margins, and the colour axis shared by both meshes.
fig_density_3d_Q.update_layout(
    title="3D Density Histogram of Nabla Difference and ENS Differences across Matrices",
    width=1400,
    height=800,
    margin={'l': 0, 'r': 0, 'b': 0, 't': 100},
    coloraxis={'colorscale': 'Viridis', 'colorbar': {'title': 'Density', 'x': 1, 'len': 0.75}},
)

# Camera position and axis styling, identical for the two 3D scenes.
scene_settings = dict(
    camera={
        'eye': {'x': -1, 'y': -1, 'z': 3},
        'center': {'x': 0, 'y': 0, 'z': 0},
        'up': {'x': 0, 'y': 0, 'z': 1},
    },
    xaxis_title='Nabla Difference (log ratio)',
    yaxis_title='ENS Difference (log ratio)',
    zaxis_title='Density',
    xaxis={'showgrid': True, 'zeroline': False},
    yaxis={'showgrid': True, 'zeroline': False},
    zaxis={'showgrid': True, 'zeroline': False},
)
for scene_col in (1, 2):
    fig_density_3d_Q.update_scenes(scene_settings, row=1, col=scene_col)

# Lower the subplot titles slightly so they clear the figure title.
for ann in fig_density_3d_Q.layout.annotations:
    ann.update(y=0.95)

fig_density_3d_Q.show()
# fig_density_3d_Q.write_image('3D Density Histogram of Nabla Difference and ENS Differences across Matrices 2.0.pdf')

In [452]:
fig_density_2d_nabla = create_2d_density_plots(
    density_data_1, density_data_2, 'Nabla_difference', 'ENS_difference'
)

# White template, no legend, and a single 'Density' colour bar on the right-hand side.
fig_density_2d_nabla.update_layout(
    title='Density Plot of Nabla Difference and ENS Differences Across Matrix Pairs',
    template='plotly_white',
    showlegend=False,
    width=1100,
    height=500,
    coloraxis={
        'colorscale': 'Viridis',
        'colorbar': {'title': 'Density', 'x': 1, 'len': 0.75},
    },
)

# Both panels carry the x-axis title; only the left panel carries the y-axis title.
for subplot_col in (1, 2):
    fig_density_2d_nabla.update_xaxes(title_text='Nabla Difference (log ratio)', row=1, col=subplot_col)
fig_density_2d_nabla.update_yaxes(title_text='ENS Difference (log ratio)', row=1, col=1)

fig_density_2d_nabla.show()
# fig_density_2d_nabla.write_image('2D Density Plot of Nabla Difference and ENS Differences across Matrix Pairs 2.0.pdf')



In [453]:
# Build both time-grouped figures here; only the "low" one is styled and shown in this cell.
time_nabla_fig_low = plot_time_grouped_scatter_2x2(df_t_low, 'Nabla_difference', 'ENS_difference')
time_nabla_fig_high = plot_time_grouped_scatter_2x2(df_t_high, 'Nabla_difference', 'ENS_difference')

time_nabla_fig_low.update_layout(
    template='plotly_white',
    showlegend=False,
    width=800,
    height=500,
    coloraxis={
        'colorscale': 'Viridis',
        'colorbar': {'title': 'Time', 'x': 1, 'len': 0.75},
    },
)

# Titles only on the bottom row (x) and left column (y); every panel shares the same ranges.
time_nabla_fig_low.update_xaxes(title_text='Nabla Difference (log ratio)', row=2)
time_nabla_fig_low.update_xaxes(range=[-4, 4])
time_nabla_fig_low.update_yaxes(title_text='ENS Difference (log ratio)', col=1)
time_nabla_fig_low.update_yaxes(range=[-2, 2])

time_nabla_fig_low.show()
# time_nabla_fig_low.write_image('nabla_ens_log_ratio_at_time_low.pdf')


In [454]:

# Same styling as the "low" figure, applied to the "high" initial-distribution figure.
time_nabla_fig_high.update_layout(
    template='plotly_white',
    showlegend=False,
    width=800,
    height=500,
    coloraxis={
        'colorscale': 'Viridis',
        'colorbar': {'title': 'Time', 'x': 1, 'len': 0.75},
    },
)

# Titles only on the bottom row (x) and left column (y); every panel shares the same ranges.
time_nabla_fig_high.update_xaxes(title_text='Nabla Difference (log ratio)', row=2)
time_nabla_fig_high.update_xaxes(range=[-4, 4])
time_nabla_fig_high.update_yaxes(title_text='ENS Difference (log ratio)', col=1)
time_nabla_fig_high.update_yaxes(range=[-2, 2])

time_nabla_fig_high.show()
# time_nabla_fig_high.write_image('nabla_ens_log_ratio_at_time_high.pdf')


Ingroup_JSD vs ENS difference

In [455]:
# Mesh and density inputs for in-group JSD vs absolute ENS difference (both columns hold
# square-rooted values): set 3 from the pi0 ("low") sweep, set 4 from the pi5 ("high") sweep.
X3, Y3, Z3, I3, J3, K3, density_data_3 = process_data_for_plot(df_Q_low, 'Ingroup_JSD', 'ENS_abs_difference')
X4, Y4, Z4, I4, J4, K4, density_data_4= process_data_for_plot(df_Q_high, 'Ingroup_JSD', 'ENS_abs_difference')




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [456]:
# fig_density_3d_Q_ingroup_jsd = create_3d_density_plots(X3, Y3, Z3, I3, J3, K3, X4, Y4, Z4, I4, J4, K4)

# # Update layout settings and color axis for the entire figure
# fig_density_3d_Q_ingroup_jsd.update_layout(
#     title="3D Density Histogram of Ingroup JSD and ENS Differences across Matrices",
#     width=1400, height=800, 
#     margin=dict(l=0, r=0, b=0, t=100),
#     coloraxis=dict(colorscale='Viridis', colorbar=dict(title='Density', x=1, len=0.75))
# )

# # Define common scene settings to be used for both subplots
# scene_settings = dict(
#     camera=dict(eye=dict(x=-1, y=-1, z=3), center=dict(x=0, y=0, z=0), up=dict(x=0, y=0, z=1)),
#     xaxis_title='Ingroup JSD', yaxis_title='ENS Difference', zaxis_title='Density',
#     xaxis=dict(showgrid=True, zeroline=False), yaxis=dict(showgrid=True, zeroline=False), zaxis=dict(showgrid=True, zeroline=False)
# )

# # Apply common scene settings to both subplots
# fig_density_3d_Q_ingroup_jsd.update_scenes(scene_settings, row=1, col=1)
# fig_density_3d_Q_ingroup_jsd.update_scenes(scene_settings, row=1, col=2)

# # Update subplot titles positioning
# for ann in fig_density_3d_Q_ingroup_jsd.layout.annotations:
#     ann.update(y=0.95)

# fig_density_3d_Q_ingroup_jsd.show()

In [457]:



# Keep only rounded (x, y) cells that occur more than once before plotting.
fig_density_2d_ingroup_jsd = create_2d_density_plots(
    density_data_3[density_data_3['Density'] > 1],
    density_data_4[density_data_4['Density'] > 1],
    'Ingroup_JSD',
    'ENS_abs_difference',
)

# White template, no legend, and a single 'Density' colour bar on the right-hand side.
fig_density_2d_ingroup_jsd.update_layout(
    title='Density Plot of Ingroup JSD and ENS Differences Across Matrix Pairs',
    template='plotly_white',
    showlegend=False,
    width=1100,
    height=500,
    coloraxis={
        'colorscale': 'Viridis',
        'colorbar': {'title': 'Density', 'x': 1, 'len': 0.75},
    },
)

# Both panels carry the x-axis title; only the left panel carries the y-axis title.
for subplot_col in (1, 2):
    fig_density_2d_ingroup_jsd.update_xaxes(title_text='Ingroup JSD (sqrt transformed)', row=1, col=subplot_col)
fig_density_2d_ingroup_jsd.update_yaxes(title_text='ENS Difference (sqrt transformed)', row=1, col=1)
# Fixed axis ranges shared by both panels.
fig_density_2d_ingroup_jsd.update_xaxes(range=[0, 0.2])
fig_density_2d_ingroup_jsd.update_yaxes(range=[0, 0.7])

fig_density_2d_ingroup_jsd.show()
# fig_density_2d_ingroup_jsd.write_image('2D Density Plot of Ingroup and ENS Differences across Matrix Pairs > 1.pdf')



In [458]:
# 2x2 time-grouped scatter figures of in-group JSD vs absolute ENS difference, one for the
# pi0 ("low") data and one for the pi5 ("high") data; styled and shown in the next cells.
time_ingroupjsd_fig_low = plot_time_grouped_scatter_2x2(df_t_low, 'Ingroup_JSD', 'ENS_abs_difference')
time_ingroupjsd_fig_high = plot_time_grouped_scatter_2x2(df_t_high, 'Ingroup_JSD', 'ENS_abs_difference')


In [459]:
time_ingroupjsd_fig_low.update_layout(
    template='plotly_white',
    showlegend=False,
    width=800,
    height=500,
    coloraxis={
        'colorscale': 'Viridis',
        'colorbar': {'title': 'Time', 'x': 1, 'len': 0.75},
    },
)

# Titles only on the bottom row (x) and left column (y); every panel shares the same ranges.
time_ingroupjsd_fig_low.update_xaxes(title_text='Ingroup JSD(sqrt transformed)', row=2)
time_ingroupjsd_fig_low.update_xaxes(range=[0, 0.2])
time_ingroupjsd_fig_low.update_yaxes(title_text='sqrt(ENS Difference)', col=1)
time_ingroupjsd_fig_low.update_yaxes(range=[0, 0.7])

# time_ingroupjsd_fig_low.write_image('ingroup_jsd_ens_at_time_low.pdf')
time_ingroupjsd_fig_low.show()

In [460]:
time_ingroupjsd_fig_high.update_layout(
    template='plotly_white',
    showlegend=False,
    width=800,
    height=500,
    coloraxis={
        'colorscale': 'Viridis',
        'colorbar': {'title': 'Time', 'x': 1, 'len': 0.75},
    },
)

# Titles only on the bottom row (x) and left column (y); every panel shares the same ranges.
time_ingroupjsd_fig_high.update_xaxes(title_text='Ingroup JSD(sqrt transformed)', row=2)
time_ingroupjsd_fig_high.update_xaxes(range=[0, 0.2])
time_ingroupjsd_fig_high.update_yaxes(title_text='sqrt(ENS Difference)', col=1)
time_ingroupjsd_fig_high.update_yaxes(range=[0, 0.7])

# time_ingroupjsd_fig_high.write_image('ingroup_jsd_ens_at_time_high.pdf')
time_ingroupjsd_fig_high.show()

JSD difference vs ENS difference

In [461]:
# Mesh and density inputs for JSD difference vs absolute ENS difference (both columns hold
# square-rooted values): set 5 from the pi0 ("low") sweep, set 6 from the pi5 ("high") sweep.
X5, Y5, Z5, I5, J5, K5, density_data_5 = process_data_for_plot(df_Q_low, 'JSD_difference', 'ENS_abs_difference')
X6, Y6, Z6, I6, J6, K6, density_data_6 = process_data_for_plot(df_Q_high, 'JSD_difference', 'ENS_abs_difference')




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [462]:
# fig_density_3d_Q_jsd_difference = create_3d_density_plots(X5, Y5, Z5, I5, J5, K5, X6, Y6, Z6, I6, J6, K6)

# # Update layout settings and color axis for the entire figure
# fig_density_3d_Q_jsd_difference.update_layout(
#     title="3D Density Histogram of JSD Difference and ENS Differences across Matrices",
#     width=1400, height=800, 
#     margin=dict(l=0, r=0, b=0, t=100),
#     coloraxis=dict(colorscale='Viridis', colorbar=dict(title='Density', x=1, len=0.75))
# )

# # Define common scene settings to be used for both subplots
# scene_settings = dict(
#     camera=dict(eye=dict(x=-1, y=-1, z=3), center=dict(x=0, y=0, z=0), up=dict(x=0, y=0, z=1)),
#     xaxis_title='JSD Difference', yaxis_title='ENS Difference', zaxis_title='Density',
#     xaxis=dict(showgrid=True, zeroline=False), yaxis=dict(showgrid=True, zeroline=False), zaxis=dict(showgrid=True, zeroline=False)
# )

# # Apply common scene settings to both subplots
# fig_density_3d_Q_jsd_difference.update_scenes(scene_settings, row=1, col=1)
# fig_density_3d_Q_jsd_difference.update_scenes(scene_settings, row=1, col=2)

# # Update subplot titles positioning
# for ann in fig_density_3d_Q_jsd_difference.layout.annotations:
#     ann.update(y=0.95)

# fig_density_3d_Q_jsd_difference.show()

In [463]:
# Only rows whose 'Density' exceeds 1 are passed on to the plotting helper.
fig_density_2d_jsd_diff = create_2d_density_plots(
    density_data_5[density_data_5['Density'] > 1],
    density_data_6[density_data_6['Density'] > 1],
    'JSD_difference',
    'ENS_abs_difference',
)

# Figure-wide styling: title, white template, no legend, 1100x500 canvas and
# a shared Viridis colour axis with a 'Density' colour bar on the right.
fig_density_2d_jsd_diff.update_layout(
    title='Density Plot of JSD Difference and ENS Differences Across Matrix Pairs',
    template='plotly_white',
    showlegend=False,
    width=1100,
    height=500,
    coloraxis_colorscale='Viridis',
    coloraxis_colorbar=dict(title='Density', x=1, len=0.75),
)

# x titles on both columns of row 1, y title on the first column only; the
# axis ranges are then applied to every subplot.
(
    fig_density_2d_jsd_diff
    .update_xaxes(title_text='JSD Difference (sqrt transformed)', row=1, col=1)
    .update_xaxes(title_text='JSD Difference (sqrt transformed)', row=1, col=2)
    .update_yaxes(title_text='ENS Difference (sqrt transformed)', row=1, col=1)
    .update_xaxes(range=[0, 0.25])
    .update_yaxes(range=[0, 0.7])
)

fig_density_2d_jsd_diff.show()
# fig_density_2d_jsd_diff.write_image('2D Density Plot of JSD Difference and ENS Differences across Matrix Pairs > 1.pdf')



In [464]:
# Build the time-grouped scatter grids for both data sets. Only the "low"
# figure is styled and shown in this cell; the "high" one is styled in the
# next cell.
time_jsd_diff_fig_low = plot_time_grouped_scatter_2x2(df_t_low, 'JSD_difference', 'ENS_abs_difference')
time_jsd_diff_fig_high = plot_time_grouped_scatter_2x2(df_t_high, 'JSD_difference', 'ENS_abs_difference')

time_jsd_diff_fig_low.update_layout(
    template='plotly_white',  # Set background to white
    showlegend=False,  # Hide the legend
    width=800,  # Adjust figure width
    height=500,  # Adjust figure height
    coloraxis=dict(
        colorscale='Viridis',  # Use Viridis color scale
        colorbar=dict(
            title='Time',  # Title for color bar
            x=1,  # Position the color bar
            len=0.75  # Length of the color bar
        )
    )
)

# Axis titles on row 2 (x) and column 1 (y) only.
# The row-2 title call used to also pass range=[0, 0.4], but that value was
# immediately overwritten by the figure-wide x range below, so it never had
# any effect. It has been dropped; the rendered figure is unchanged.
# NOTE(review): if [0, 0.4] was the intended range, change the value below.
time_jsd_diff_fig_low.update_xaxes(title_text='JSD Difference (sqrt transformed)', row=2)
time_jsd_diff_fig_low.update_yaxes(title_text='sqrt(ENS Difference)', col=1)

# Axis ranges applied to every subplot.
time_jsd_diff_fig_low.update_xaxes(range=[0, 0.2])
time_jsd_diff_fig_low.update_yaxes(range=[0, 0.7])

# time_jsd_diff_fig_low.write_image('jsd_ens_diff_at_time_low.pdf')
time_jsd_diff_fig_low.show()

In [465]:
# Figure-wide styling: white template, no legend, fixed 800x500 canvas and a
# shared Viridis colour axis whose colour bar is titled 'Time'.
time_jsd_diff_fig_high.update_layout(
    template='plotly_white',
    showlegend=False,
    width=800,
    height=500,
    coloraxis_colorscale='Viridis',
    coloraxis_colorbar=dict(title='Time', x=1, len=0.75),
)

# Axis titles are attached only to row 2 (x) and column 1 (y); the axis
# ranges are then applied to every subplot.
(
    time_jsd_diff_fig_high
    .update_xaxes(title_text='JSD Difference (sqrt transformed)', row=2)
    .update_yaxes(title_text='sqrt(ENS Difference)', col=1)
    .update_xaxes(range=[0, 0.3])
    .update_yaxes(range=[0, 0.7])
)

# time_jsd_diff_fig_high.write_image('jsd_ens_diff_at_time_high.pdf')
time_jsd_diff_fig_high.show()

In [466]:
# import pandas as pd
# import numpy as np
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

# # Assuming df_jsd_low and df_jsd_high are the dataframes for low and high information respectively

# # Defining bins and bin labels for low information
# bins_low = np.linspace(0, 0.3, 20)
# bin_labels_low = np.round((bins_low[:-1] + bins_low[1:]) / 2, 2)  # Calculate midpoints

# # Binning Ingroup_JSD for low information
# df_jsd_low['Ingroup_JSD'] = pd.cut(df_jsd_low['Ingroup_JSD'], bins=bins_low, labels=bin_labels_low)
# pivot_table_low = df_jsd_low.pivot_table(index='Time', columns='Ingroup_JSD', values='ENS_difference', aggfunc='mean')

# # Defining bins and bin labels for high information
# bins_high = np.linspace(0, 0.4, 20)
# bin_labels_high = np.round((bins_high[:-1] + bins_high[1:]) / 2, 2)  # Calculate midpoints

# # Binning Ingroup_JSD for high information
# df_jsd_high['Ingroup_JSD'] = pd.cut(df_jsd_high['Ingroup_JSD'], bins=bins_high, labels=bin_labels_high)
# pivot_table_high = df_jsd_high.pivot_table(index='Time', columns='Ingroup_JSD', values='ENS_difference', aggfunc='mean')

# # Extracting the X, Y, Z coordinates for Plotly for low information
# X_low, Y_low = np.meshgrid(pivot_table_low.columns.categories, pivot_table_low.index)
# Z_low = pivot_table_low.values

# # Extracting the X, Y, Z coordinates for Plotly for high information
# X_high, Y_high = np.meshgrid(pivot_table_high.columns.categories, pivot_table_high.index)
# Z_high = pivot_table_high.values

# # Create subplots: 1 row, 2 columns
# fig = make_subplots(
#     rows=1, cols=2,
#     specs=[[{'type': 'surface'}, {'type': 'surface'}]],
#     horizontal_spacing=0,  # Adjust the spacing as needed
#     subplot_titles=(
#         'Low Information',
#         'High Information'
#     )
# )

# # Add the first surface plot (Low Information) to the first subplot
# fig.add_trace(go.Surface(
#     z=Z_low,
#     x=X_low[0],  # X coordinates
#     y=Y_low[:, 0],  # Y coordinates
#     colorscale='Viridis',
#     showscale=False), row=1, col=1)

# # Add the second surface plot (High Information) to the second subplot
# fig.add_trace(go.Surface(
#     z=Z_high,
#     x=X_high[0],  # X coordinates
#     y=Y_high[:, 0],  # Y coordinates
#     colorscale='Viridis',
#     showscale=True,
#     surfacecolor=Y_high,  # Use Y axis values (Time) for color scale
#     colorbar=dict(title='Time', x=1.05)  # Adjust the colorbar position
# ), row=1, col=2)

# # Update layout with axis titles
# fig.update_layout(
#     title='3D Surface Plots of ENS difference and Ingroup JSD over Time',
#     autosize=False,
#     width=1600,  # Adjust the width to fit the plots better
#     height=800,  # Adjust the height to fit the plots better
#     margin=dict(l=0, r=0, b=0, t=100),  # Adjust margins to make full use of space
#     scene=dict(camera=dict(
#             eye=dict(x=1, y=-2, z=2),  # Adjust x, y, z to change the camera angle
#             center=dict(x=0, y=0, z=0),  # Keeps the center of the plot at the origin
#             up=dict(x=0, y=0, z=1)  # Ensures that z is up
#         ),
#         xaxis_title='Ingroup JSD (Sqrt transformed)',
#         yaxis_title='Time',
#         zaxis_title='Absolute ENS Difference (Sqrt transformed)',
#         aspectmode='cube'
#     ),
#     scene2=dict(camera=dict(
#             eye=dict(x=1, y=-2, z=2),  # Adjust x, y, z to change the camera angle
#             center=dict(x=0, y=0, z=0),  # Keeps the center of the plot at the origin
#             up=dict(x=0, y=0, z=1)  # Ensures that z is up
#         ),
#         xaxis_title='Ingroup JSD (Sqrt transformed)',
#         yaxis_title='Time',
#         zaxis_title='Absolute ENS Difference (Sqrt transformed)',
#         aspectmode='cube'
#     )
# )
# # 
# # Show the figure
# fig.show()

# # Save the figure as PDF
# fig.write_image('3D Surface Plots of ENS difference and Ingroup JSD with respect to Time.pdf')


In [467]:
# import pandas as pd
# import numpy as np
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

# # Assuming df_jsd_low and df_jsd_high are the dataframes for low and high information respectively

# # Defining bins and bin labels for low information
# bins_low = np.linspace(0, 0.3, 20)
# bin_labels_low = np.round((bins_low[:-1] + bins_low[1:]) / 2, 2)  # Calculate midpoints

# # Binning JSD_difference for low information
# df_jsd_low['JSD_difference'] = pd.cut(df_jsd_low['JSD_difference'], bins=bins_low, labels=bin_labels_low)
# pivot_table_low = df_jsd_low.pivot_table(index='Time', columns='JSD_difference', values='ENS_difference', aggfunc='mean')

# # Defining bins and bin labels for high information
# bins_high = np.linspace(0, 0.4, 20)
# bin_labels_high = np.round((bins_high[:-1] + bins_high[1:]) / 2, 2)  # Calculate midpoints

# # Binning JSD_difference for high information
# df_jsd_high['JSD_difference'] = pd.cut(df_jsd_high['JSD_difference'], bins=bins_high, labels=bin_labels_high)
# pivot_table_high = df_jsd_high.pivot_table(index='Time', columns='JSD_difference', values='ENS_difference', aggfunc='mean')

# # Extracting the X, Y, Z coordinates for Plotly for low information
# X_low, Y_low = np.meshgrid(pivot_table_low.columns.categories, pivot_table_low.index)
# Z_low = pivot_table_low.values

# # Extracting the X, Y, Z coordinates for Plotly for high information
# X_high, Y_high = np.meshgrid(pivot_table_high.columns.categories, pivot_table_high.index)
# Z_high = pivot_table_high.values

# # Create subplots: 1 row, 2 columns
# fig = make_subplots(
#     rows=1, cols=2,
#     specs=[[{'type': 'surface'}, {'type': 'surface'}]],
#     horizontal_spacing=0,  # Adjust the spacing as needed
#     subplot_titles=(
#         'Low Information',
#         'High Information'
#     )
# )

# # Add the first surface plot (Low Information) to the first subplot
# fig.add_trace(go.Surface(
#     z=Z_low,
#     x=X_low[0],  # X coordinates
#     y=Y_low[:, 0],  # Y coordinates
#     colorscale='Viridis',
#     showscale=False), row=1, col=1)

# # Add the second surface plot (High Information) to the second subplot
# fig.add_trace(go.Surface(
#     z=Z_high,
#     x=X_high[0],  # X coordinates
#     y=Y_high[:, 0],  # Y coordinates
#     colorscale='Viridis',
#     showscale=True,
#     surfacecolor=Y_high,  # Use Y axis values (Time) for color scale
#     colorbar=dict(title='Time', x=1.05)  # Adjust the colorbar position
# ), row=1, col=2)

# # Update layout with axis titles
# fig.update_layout(
#     title='3D Surface Plots of ENS difference and JSD Difference over Time',
#     autosize=False,
#     width=1600,  # Adjust the width to fit the plots better
#     height=800,  # Adjust the height to fit the plots better
#     margin=dict(l=0, r=0, b=0, t=100),  # Adjust margins to make full use of space
#     scene=dict(camera=dict(
#             eye=dict(x=1, y=-2, z=2),  # Adjust x, y, z to change the camera angle
#             center=dict(x=0, y=0, z=0),  # Keeps the center of the plot at the origin
#             up=dict(x=0, y=0, z=1)  # Ensures that z is up
#         ),
#         xaxis_title='JSD Difference (Sqrt transformed)',
#         yaxis_title='Time',
#         zaxis_title='Absolute ENS Difference (Sqrt transformed)',
#         aspectmode='cube'
#     ),
#     scene2=dict(camera=dict(
#             eye=dict(x=1, y=-2, z=2),  # Adjust x, y, z to change the camera angle
#             center=dict(x=0, y=0, z=0),  # Keeps the center of the plot at the origin
#             up=dict(x=0, y=0, z=1)  # Ensures that z is up
#         ),
#         xaxis_title='JSD Difference (Sqrt transformed)',
#         yaxis_title='Time',
#         zaxis_title='Absolute ENS Difference (Sqrt transformed)',
#         aspectmode='cube'
#     )
# )

# # Show the figure
# fig.show()

# # Save the figure as PDF
# fig.write_image('3D Surface Plots of ENS difference and JSD Difference with respect to Time.pdf')


In [468]:
# For every pair of rate matrices, evolve the same starting distribution for a
# fixed time t and record three square-root-transformed quantities:
#   jsd_list            - sqrt of the JSD between the two evolved distributions
#   jsd_difference_list - sqrt(|JSD(pi_1, pi) - JSD(pi_2, pi)|)
#   ens_difference_list - sqrt(|ENS1 - ENS2|), ENS as returned by calculate_ENS
t = 0.5
pi = pi0  # starting distribution shared by both members of each pair
ens_difference_list = []
jsd_list = []
jsd_difference_list = []
for matrix_pair in matrices_list:
    # Each entry is a mapping whose first two values are the matrices to compare.
    Q1, Q2 = list(matrix_pair.values())[:2]
    ens1 = calculate_ENS(pi, Q1, t)
    ens2 = calculate_ENS(pi, Q2, t)
    # exp(Q*t) applied to pi gives the distribution after time t under each matrix.
    p1 = scipy.linalg.expm(Q1*t)
    p2 = scipy.linalg.expm(Q2*t)
    pi_1 = np.dot(pi, p1)
    pi_2 = np.dot(pi, p2)
    jsd_value = np.sqrt(jsd(pi_1, pi_2))
    # JSD of each evolved distribution from the starting distribution.
    jsd_1 = jsd(pi_1, pi)
    jsd_2 = jsd(pi_2, pi)
    jsd_diff = np.sqrt(abs(jsd_1-jsd_2))
    ens_diff = np.sqrt(abs(ens1-ens2))
    jsd_list.append(jsd_value)
    jsd_difference_list.append(jsd_diff)
    ens_difference_list.append(ens_diff)
    

In [469]:
# Scatter of sqrt(|ENS1 - ENS2|) against sqrt(JSD(pi_1, pi_2)) with an OLS trendline.
# The x data is jsd_list (JSD between the two evolved distributions), not
# jsd_difference_list, so the x label names that quantity instead of 'jsd_diff'.
# NOTE(review): if the JSD *difference* was meant to be plotted, pass
# jsd_difference_list as x and restore the old label.
nabla_ens_log_ratio_fig = px.scatter(
    x=jsd_list,
    y=ens_difference_list,
    labels={'x': 'sqrt(JSD(pi_1, pi_2))', 'y': 'ens_diff'},
    trendline="ols",
    title=None,
)
# Layout: white template, tight margins, larger axis-title fonts, automatic width.
nabla_ens_log_ratio_fig.update_layout(
    template='plotly_white',
    margin=dict(l=20, r=20, t=50, b=20),
    autosize=True,
    yaxis_title_font={'size': 20},
    xaxis_title_font={'size': 20},
    width=None
)
nabla_ens_log_ratio_fig.show()

In [470]:
import statsmodels.api as sm

# Ordinary least squares of ens_difference_list on jsd_list (with an
# intercept); the cell displays the R^2 of the fit.
# 'jsd_diff' holds jsd_list, i.e. sqrt(JSD(pi_1, pi_2)) per matrix pair;
# 'ens_diff' holds sqrt(|ENS1 - ENS2|) per matrix pair.
data = dict(jsd_diff=jsd_list, ens_diff=ens_difference_list)
df = pd.DataFrame(data)

y = df['ens_diff']
X = sm.add_constant(df['jsd_diff'])  # adds the intercept column

model = sm.OLS(endog=y, exog=X).fit()
model.rsquared

0.14286421440087416

In [471]:
# One dict per matrix pair, mapping each species key to the result of
# calculate_stationary_distribution for its matrix.
# (The list name keeps its original spelling because the next cell reads it.)
stationary_distirbution_list = [
    {
        species: calculate_stationary_distribution(matrix)
        for species, matrix in matrix_pair.items()
    }
    for matrix_pair in matrices_list
]

In [472]:
# Flatten every per-pair dict of stationary distributions into a single list
# of calculate_information values (pair order, then dict value order).
information_list2 = [
    calculate_information(distribution)
    for stationary_distribution_dict in stationary_distirbution_list
    for distribution in stationary_distribution_dict.values()
]

In [473]:
# Histogram of the calculate_information values gathered from the stationary distributions.
px.histogram(information_list2)

In [474]:
# Median of the calculate_information values collected in information_list2.
np.median(information_list2)

0.09076580305586168

In [475]:
# calculate_information value for pi5 (pi5 is not defined in this part of the notebook).
# NOTE(review): presumably shown for comparison with the median above — confirm.
calculate_information(pi5)

0.5633335180833126

In [476]:
# Count the entries recorded for each gene in valid_triads_identifier_dict.
valid_triads_number_dict = {
    gene_name: len(valid_list)
    for gene_name, valid_list in valid_triads_identifier_dict.items()
}

# Pair each gene's alignment length with its count, iterating the same keys so
# the two lists stay aligned. A gene present in alignment_length_dict but
# absent from valid_triads_number_dict raises KeyError.
keys = alignment_length_dict.keys()
lengths = [alignment_length_dict[key] for key in keys]
triad_counts = [valid_triads_number_dict[key] for key in keys]

# plotly.express is already imported as px in the notebook's first cell, so
# the duplicate mid-cell import was removed.
aln_len_valid_matrix_fig = px.scatter(
    x=lengths,
    y=triad_counts,
    labels={'x':'Alignment length', 'y':'Number of valid dataset'},
    title=None,
)
# Layout: white template, tight margins, larger axis-title fonts, automatic width.
aln_len_valid_matrix_fig.update_layout(
    template='plotly_white',
    margin=dict(l=20, r=20, t=50, b=20),
    autosize=True,
    yaxis_title_font={'size': 20},
    xaxis_title_font={'size': 20},
    width=None
)
aln_len_valid_matrix_fig.show()