In [1]:
import numpy as np
import scipy
from clock_project.simulation.magnitude_quantification import calculate_information
import scipy
import json
from cogent3.util.deserialise import deserialise_object
import numpy as np
import os
import glob

import plotly.express as px
from cogent3.maths.measure import jsd




In [2]:
base_dir = '/Users/gulugulu/Desktop/honours/data_local_2/triples_model_fitting_550_threshold'
gene_paths = glob.glob(os.path.join(base_dir, '*/'))


In [3]:
alignment_length = json.load(open('/Users/gulugulu/repos/PuningAnalysis/results/output_data/genome_information/alignment_lengths_550_threshold.json', 'r'))
number_of_species = json.load(open('/Users/gulugulu/repos/PuningAnalysis/results/output_data/genome_information/sample_sizes_full.json', 'r'))

In [4]:
alignment_length_list = [int(alignment_length[gene]) for gene in alignment_length]
number_of_species_list = [int(number_of_species[gene]) for gene in alignment_length]

In [5]:
# Create the histogram with density normalization
fig = px.histogram(
    alignment_length_list,
    labels={'x': 'Alignment Length (3rd codon position)', 'y': 'Density'},
    title=None,
    color_discrete_sequence=['#F4A300'],  # Set the color to a shade of orange
    histnorm='percent'  # Normalize the histogram to density
)   

# Update layout for presentation
fig.update_layout(
    template='plotly_white',
    margin=dict(l=50, r=50, t=50, b=50),  # Adjust margins for a balanced look
    autosize=True,
    yaxis_title='<b>Density</b>',  # Explicit y-axis title
    xaxis_title='<b>Alignment Length (3rd codon position)</b>',  # Explicit x-axis title
    yaxis_title_font=dict(size=20),  # Adjust y-axis font size
    xaxis_title_font=dict(size=20),  # Adjust x-axis font size
    font=dict(size=16),  # General font size for labels and titles
    width=800,  # Set figure width (optional for better control)
    height=500,  # Set figure height (optional for better control)
    showlegend=False  # Remove the legend
)

# Set transparency level and add a solid line around each bar
fig.update_traces(
    opacity=0.8,  # Set the transparency (0 = fully transparent, 1 = fully opaque)
    marker_line_color='black',  # Color of the line around each bar
    marker_line_width=1.5  # Width of the line around each bar
)

fig.update_xaxes(range=[470, 2900])

fig.write_image('alignment_length_histogram.pdf')
fig.show()

In [6]:
# Create the histogram with density normalization
fig = px.histogram(
    number_of_species_list,
    labels={'x': 'Number of Species', 'y': 'Density'},
    title=None,
    color_discrete_sequence=['#F4A300'],  # Set the color to a shade of orange
    histnorm='density'  # Normalize the histogram to density
)   

# Update layout for presentation
fig.update_layout(
    template='plotly_white',
    margin=dict(l=50, r=50, t=50, b=50),  # Adjust margins for a balanced look
    autosize=True,
    yaxis_title='<b>Density</b>',  # Explicit y-axis title
    xaxis_title='<b>Number of Species</b>',  # Explicit x-axis title
    yaxis_title_font=dict(size=20),  # Adjust y-axis font size
    xaxis_title_font=dict(size=20),  # Adjust x-axis font size
    font=dict(size=16),  # General font size for labels and titles
    width=800,  # Set figure width (optional for better control)
    height=500,  # Set figure height (optional for better control)
    showlegend=False  # Remove the legend
)

# Set transparency level and add a solid line around each bar
fig.update_traces(
    opacity=0.8,  # Set the transparency (0 = fully transparent, 1 = fully opaque)
    marker_line_color='black',  # Color of the line around each bar
    marker_line_width=1.5  # Width of the line around each bar
)

fig.update_xaxes(range=[30, 66])

fig.write_image('number_of_species_histogram.pdf')
fig.show()

In [7]:
def get_ingroup_names(triads_info):
    ingroup_names_dict = {}
    for identifier, info in triads_info.items():
        triads_names = info['triples_species_names']
        ingroup_names_dict[identifier] = [triads_names['ingroup1'], triads_names['ingroup2']]
    return ingroup_names_dict

In [8]:
def get_jsd_diff(triads_info):
    jsd_diff_dict = {}
    for identifier, info in triads_info.items():
        triads_info_value = info['triples_info_small_tree']
        nuc_freqs_dict = triads_info_value['nuc_freqs_dict']
        nuc_freq1 = nuc_freqs_dict['ingroup1']
        nuc_freq2 = nuc_freqs_dict['ingroup2']
        nuc_freq_internal_node = nuc_freqs_dict["internal_node"]
        jsd1 = jsd(nuc_freq1, nuc_freq_internal_node)
        jsd2 = jsd(nuc_freq2, nuc_freq_internal_node)
        jsd_diff = abs(jsd1 - jsd2)
        jsd_diff_dict[identifier] = jsd_diff
    return jsd_diff_dict

def get_jsd(triads_info):
    jsd_dict = {}
    for identifier, info in triads_info.items():
        triads_info_value = info['triples_info_small_tree']
        nuc_freqs_dict = triads_info_value['nuc_freqs_dict']
        nuc_freq1 = nuc_freqs_dict['ingroup1']
        nuc_freq2 = nuc_freqs_dict['ingroup2']
        nuc_freq_internal_node = nuc_freqs_dict["internal_node"]
        jsd1 = jsd(nuc_freq1, nuc_freq_internal_node)
        jsd2 = jsd(nuc_freq2, nuc_freq_internal_node)
        jsd_dict[identifier] = {'ingroup1': jsd1, 'ingroup2': jsd2}
    return jsd_dict
        

In [9]:
from scipy.stats import wasserstein_distance

def get_ingroup_jsd(triads_info):
    ingroup_jsd_dict = {}
    for identifier, info in triads_info.items():
        triads_info_value = info['triples_info_small_tree']
        ingroup_jsd = triads_info_value['ingroup_jsd']
        ingroup_jsd_dict[identifier] = ingroup_jsd
    return ingroup_jsd_dict

def get_ingroup_wst(triads_info):
    ingroup_wst_dict = {}
    for identifier, info in triads_info.items():
        triads_info_value = info['triples_info_small_tree']
        nuc_freqs_dict = triads_info_value['nuc_freqs_dict']
        ingroup1_freq = nuc_freqs_dict['ingroup1']
        ingroup2_freq = nuc_freqs_dict['ingroup2']
        ingroup_wst = wasserstein_distance(ingroup1_freq, ingroup2_freq)
        ingroup_wst_dict[identifier] = ingroup_wst

    return ingroup_wst_dict

def get_wst_diff(triads_info):
    wst_diff_dict = {}
    for identifier, info in triads_info.items():
        triads_info_value = info['triples_info_small_tree']
        nuc_freqs_dict = triads_info_value['nuc_freqs_dict']
        ingroup1_freq = nuc_freqs_dict['ingroup1']
        ingroup2_freq = nuc_freqs_dict['ingroup2']
        internal_node_freq = nuc_freqs_dict['internal_node']
        wst1 = wasserstein_distance(ingroup1_freq, internal_node_freq)
        wst2 = wasserstein_distance(ingroup2_freq, internal_node_freq)
        wst_diff = abs(wst1-wst2)
        wst_diff_dict[identifier] = wst_diff

    return wst_diff_dict

In [10]:
def get_ingroup_ens_diff(triads_info):
    ens_ingroup_dict = {}
    for identifier, info in triads_info.items():
        triads_info_value = info['triples_info_small_tree']
        triads_names = info['triples_species_names']
        ens_dict = triads_info_value['ens']
        ens_ingroup = abs(np.log(ens_dict[triads_names['ingroup1']]/ ens_dict[triads_names['ingroup2']]))
        ens_ingroup_dict[identifier] = ens_ingroup
    return ens_ingroup_dict

In [11]:
def get_ingroup_ens_absdiff(triads_info):
    ens_ingroup_dict = {}
    for identifier, info in triads_info.items():
        triads_info_value = info['triples_info_small_tree']
        triads_names = info['triples_species_names']
        ens_dict = triads_info_value['ens']
        ens_ingroup = abs(ens_dict[triads_names['ingroup1']]- ens_dict[triads_names['ingroup2']])
        ens_ingroup_dict[identifier] = ens_ingroup
    return ens_ingroup_dict

def get_ens(triads_info):
    ens_value_dict = {}
    for identifier, info in triads_info.items():
        triads_info_value = info['triples_info_small_tree']
        triads_names = info['triples_species_names']
        ens_dict = triads_info_value['ens']
        ens1 = ens_dict[triads_names['ingroup1']]
        ens2 = ens_dict[triads_names['ingroup2']]
        ens_value_dict[identifier] = {'ingroup1': ens1, 'ingroup2': ens2}
    return ens_value_dict


In [12]:
def get_ingroup_ens_Hellinger(triads_info):
    ens_ingroup_dict = {}
    for identifier, info in triads_info.items():
        triads_info_value = info['triples_info_small_tree']
        triads_names = info['triples_species_names']
        ens_dict = triads_info_value['ens']
        ens_ingroup = np.sqrt(2*(np.sqrt(ens_dict[triads_names['ingroup1']])- np.sqrt(ens_dict[triads_names['ingroup2']]))**2)
        ens_ingroup_dict[identifier] = ens_ingroup
    return ens_ingroup_dict

In [13]:
def get_nabla_absdiff(triads_info):
    nabla_diff_dict = {}
    for identifier, info in triads_info.items():
        triads_info_value = info['triples_info_small_tree']
        triads_names = info['triples_species_names']
        nabla_dict = triads_info_value['nabla_values']
        nbala_diff = abs(nabla_dict[triads_names['ingroup1']] - nabla_dict[triads_names['ingroup2']])
        nabla_diff_dict[identifier] = nbala_diff
    return nabla_diff_dict

In [14]:
def get_ingroup_nabla(triads_info):
    nabla_ingroup_dict = {}
    for identifier, info in triads_info.items():
        triads_info_value = info['triples_info_small_tree']
        triads_names = info['triples_species_names']
        nabla_dict = triads_info_value['nabla_values']
        ens_dict = triads_info_value['ens']
        nabla_difference = abs(np.log(nabla_dict[triads_names['ingroup1']]*ens_dict[triads_names['ingroup1']]/nabla_dict[triads_names['ingroup2']]*ens_dict[triads_names['ingroup2']]))
        nabla_ingroup_dict[identifier] = nabla_difference
    return nabla_ingroup_dict

In [15]:
# alignment_length_dict = {}
# alignment_dir_paths = glob.glob(os.path.join(alignment_dir, '*.json'))
# for path in alignment_dir_paths:
#     file_name = os.path.basename(path).rsplit('.', 1)[0]
#     alignment = deserialise_object(json.load(open(path, 'r')))
#     alignment_length_dict[file_name] = alignment.get_lengths()[0]


In [16]:
def remove_outliers_iqr(data1, data2):
    def compute_iqr_bounds(data):
        Q1 = np.percentile(data, 25)
        Q3 = np.percentile(data, 75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        return lower_bound, upper_bound

    # Calculate IQR bounds for both lists
    lower_bound1, upper_bound1 = compute_iqr_bounds(data1)
    lower_bound2, upper_bound2 = compute_iqr_bounds(data2)

    # Filter out pairs where either value is an outlier
    filtered_data1 = []
    filtered_data2 = []

    for val1, val2 in zip(data1, data2):
        if (lower_bound1 <= val1 <= upper_bound1) and (lower_bound2 <= val2 <= upper_bound2):
            filtered_data1.append(val1)
            filtered_data2.append(val2)

    return filtered_data1, filtered_data2

In [17]:
import os
import json
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Function to load JSON data
def load_json_data(path):
    with open(path, 'r') as file:
        return json.load(file)

# Function to compute required values
def compute_values(path):
    nabla_log_ratio_list = []
    ens_log_ratio_list = []

    triads_data_path = os.path.join(path, 'triples_info_dict.json')
    triads_info = load_json_data(triads_data_path)
    nabla_log_ratio_dict = get_ingroup_nabla(triads_info)
    ens_log_ratio_dict = get_ingroup_ens_diff(triads_info)
    ens_abs_diff_dict = get_ingroup_ens_absdiff(triads_info)
    jsd_diff_dict = get_jsd_diff(triads_info)
    ens_dict = get_ens(triads_info)
    jsd_dict = get_jsd(triads_info)
    ingroup_jsd_dict = get_ingroup_jsd(triads_info)
    nabla_absdiff_dict = get_nabla_absdiff(triads_info)
    nabla_log_ratio_list = list(nabla_log_ratio_dict.values())
    ens_abs_diff_list = list(ens_abs_diff_dict.values())
    ens_log_ratio_list = list(ens_log_ratio_dict.values())
    jsd_diff_list = list(jsd_diff_dict.values())
    ingroup_jsd_list = list(ingroup_jsd_dict.values())
    nabla_absdiff_list = list(nabla_absdiff_dict.values())
    ens_list = list(ens_dict.values())
    jsd_list = list(jsd_dict.values())
    

    return nabla_log_ratio_list, ens_log_ratio_list, ens_abs_diff_list, jsd_diff_list, ingroup_jsd_list, nabla_absdiff_list, ens_list, jsd_list


In [18]:
import os

species_names = {}
for path in gene_paths:
    gene_name = os.path.basename(path.rstrip('/'))
    triads_data_path = os.path.join(path, 'triples_info_dict.json')
    triads_info = load_json_data(triads_data_path)
    ingroup_names = get_ingroup_names(triads_info)
    species_names[gene_name] = ingroup_names

repeated_names_dict = {}
for gene, species in species_names.items():
    repeated_names_dict[gene] = []  # Initialize a list to store pairs
    for identifier, ingroup in species.items():
        for identifier2, ingroup2 in species.items():
            if identifier < identifier2:  # Ensure each pair is only processed once
                if ingroup == ingroup2:
                    repeated_names_dict[gene].append((identifier, identifier2))

# Optional: Remove genes with no repeated pairs
repeated_names_dict = {gene: pairs for gene, pairs in repeated_names_dict.items() if pairs}


In [19]:
removed_identifier = {gene: [a[0] for a in repeated_names_dict[gene]] if gene in repeated_names_dict else [] for gene in species_names}

In [20]:
def get_names(triads_info):
    names_dict = {}
    for identifier, info in triads_info.items():
        triads_names = info['triples_species_names']
        names_dict[identifier] = triads_names
    return names_dict

In [21]:
# Function to compute required values
def compute_values(path):
    gene_name = os.path.basename(path.rstrip('/'))

    triads_data_path = os.path.join(path, 'triples_info_dict.json')
    triads_info_original = load_json_data(triads_data_path)
    triads_info = {k: v for k, v in triads_info_original.items() if k not in removed_identifier[gene_name]}
    nabla_log_ratio_dict = get_ingroup_nabla(triads_info)
    ens_log_ratio_dict = get_ingroup_ens_diff(triads_info)
    ens_abs_diff_dict = get_ingroup_ens_absdiff(triads_info)
    jsd_diff_dict = get_jsd_diff(triads_info)
    ens_dict = get_ens(triads_info)
    jsd_dict = get_jsd(triads_info)
    ingroup_jsd_dict = get_ingroup_jsd(triads_info)
    nabla_absdiff_dict = get_nabla_absdiff(triads_info)
    species_names_dict = get_names(triads_info)
    nabla_log_ratio_list = list(nabla_log_ratio_dict.values())
    ens_abs_diff_list = list(ens_abs_diff_dict.values())
    ens_log_ratio_list = list(ens_log_ratio_dict.values())
    jsd_diff_list = list(jsd_diff_dict.values())
    ingroup_jsd_list = list(ingroup_jsd_dict.values())
    nabla_absdiff_list = list(nabla_absdiff_dict.values())
    ens_list = list(ens_dict.values())
    jsd_list = list(jsd_dict.values())
    species_names_list = list(species_names_dict.values())
    

    return nabla_log_ratio_list, ens_log_ratio_list, ens_abs_diff_list, jsd_diff_list, ingroup_jsd_list, nabla_absdiff_list, ens_list, jsd_list, species_names_list

In [22]:
# Initialize the dictionary to store the data
gene_data_dict = {}
# Populate the dictionary with data for each gene
for path in gene_paths:
    gene_name = os.path.basename(path.rstrip('/'))
    nabla_log_ratio_list, ens_log_ratio_list, ens_abs_diff_list, jsd_diff_list, ingroup_jsd_list, nabla_absdiff_list, ens_list, jsd_list, species_names_list = compute_values(path)
    gene_data_dict[gene_name] = {
        'nabla_log_ratio': nabla_log_ratio_list,
        'ens_log_ratio': ens_log_ratio_list,
        'ens_abs_diff': ens_abs_diff_list,
        'jsd_diff': jsd_diff_list, 
        'ingroup_jsd': ingroup_jsd_list,
        'nabla_absdiff': nabla_absdiff_list,
        'ens': ens_list,
        'jsd': jsd_list,
        'species_names': species_names_list
    }

In [23]:
ens_a_list = []
invalid_triple_list = []
for path in gene_paths:
    gene_name = os.path.basename(path.rstrip('/'))
    triads_data_path = os.path.join(path, 'triples_info_dict.json')
    triads_info = load_json_data(triads_data_path)
    for identifier, info in triads_info.items():
        ens_dict = info['triples_info_small_tree']['ens']
        for value in ens_dict.values():
            ens_a_list.append(value)
            if value > 1:
                invalid_triple_list.append((gene_name, identifier))

invalid_triple_list
    

[]

Does species with higher JAD also evolve faster? 

In [25]:
correlation_list = []
p_value_list= []
for gene, lists in gene_data_dict.items():
    ens_list = []
    jad_list = []
    for ens in lists['ens']:
        ens_list.extend(list(ens.values()))
    for jad in lists['jsd']:
        jad_list.extend(list(jad.values()))
    #Add the correlation factor in the list
    cor, p_value = scipy.stats.spearmanr(ens_list, jad_list)
    correlation_list.append(cor)
    p_value_list.append(p_value)

In [28]:
gene_names = list(gene_data_dict.keys())
fig = go.Figure(go.Bar(
    x=correlation_list,
    y=gene_names,
    orientation='h',  # Horizontal bar chart
    marker=dict(color=p_value_list, coloraxis="coloraxis")  # Color based on the correlation factor
))


# Update layout for better visualization
fig.update_layout(
    xaxis_title="Correlation Coefficient",
    yaxis_title="Gene ID",
    coloraxis=dict(colorscale='Viridis'),  # Red for higher, blue for lower values
    height=1200  # Adjust height based on the number of genes to avoid squeezing
)

fig.show()

In [29]:
ens_sum_list = []
for a in gene_data_dict['ENSG00000151461']['ens']:
    ens_sum_list.extend(list(a.values()))

jad_sum_list = []
for a in gene_data_dict['ENSG00000151461']['jsd']:
    jad_sum_list.extend(list(a.values()))

cor, p_value = scipy.stats.spearmanr(ens_sum_list, jad_sum_list)

import plotly.express as px
no_outliers_jad, no_outliers_ens = remove_outliers_iqr(jad_sum_list, ens_sum_list)
fig = px.scatter(x = no_outliers_jad, y = no_outliers_ens, labels={'x':'jad', 'y':'ens'}, title= None)
# Update layout with labels and title
fig.update_layout(
    template='plotly_white',
    margin=dict(l=20, r=20, t=50, b=20),
    autosize=True,
    yaxis_title_font={'size': 20},  
    xaxis_title_font={'size': 20}, 
    width=None 
)
fig.show()

In [31]:
cor

0.3785689374176399

Pearson Correlation Test Nabla Abs Diff Vs. ENS Abs Diff

In [34]:

correlation_list0 = []
p_value_list0 = []
for gene, lists in gene_data_dict.items():
    nabla_absdiff_list, ens_abs_diff_list = remove_outliers_iqr(lists['nabla_absdiff'], lists['ens_abs_diff'])
    #Add the correlation factor in the list
    cor, p_value = scipy.stats.spearmanr(np.abs(list(nabla_absdiff_list)), np.abs(list(ens_abs_diff_list)))
    correlation_list0.append(cor)
    p_value_list0.append(p_value)

fig0 = px.box(correlation_list0, title= 'Pearson Correlation Test Nabla Abs Diff Vs. ENS Abs Diff')
# Update layout with labels and title
fig0.update_layout(
    template='plotly_white',
    margin=dict(l=20, r=20, t=50, b=20),
    autosize=True,
    yaxis_title_font={'size': 20},  
    xaxis_title_font={'size': 20}, 
    xaxis_title='Correlation Coefficient',
    yaxis_title='Value',
    width=None
)
fig0.show()
# fig1.write_image('nabla_ens_cc.pdf')

Pearson Correlation Test Nabla Log Ratio Vs. ENS Log Ratio

In [38]:

correlation_list1 = []
p_value_list1 = []
for gene, lists in gene_data_dict.items():
    nabla_log_ratio_list, ens_log_ratio_list = remove_outliers_iqr(lists['nabla_log_ratio'], lists['ens_log_ratio'])
    #Add the correlation factor in the list
    cor, p_value = scipy.stats.spearmanr(np.abs(list(ens_log_ratio_list)), np.abs(list(nabla_log_ratio_list)))
    correlation_list1.append(cor)
    p_value_list1.append(p_value)

fig1 = px.box(correlation_list1, title= 'Pearson Correlation Test Nabla Log Ratio Vs. ENS Log Ratio')
# Update layout with labels and title
fig1.update_layout(
    template='plotly_white',
    margin=dict(l=20, r=20, t=50, b=20),
    autosize=True,
    yaxis_title_font={'size': 20},  
    xaxis_title_font={'size': 20}, 
    xaxis_title='Correlation Coefficient',
    yaxis_title='Value',
    width=None
)
fig1.show()
# fig1.write_image('nabla_ens_cc.pdf')

In [40]:
count = len([x for x in correlation_list1 if abs(x) > 0.3])
count

18

In [41]:
count = len([x for x in p_value_list1 if x < 0.05])
count

71

# Pearson Correlation Test JAD Difference Vs. ENS difference

In [190]:
import pandas as pd
species_number_dict = {}
for gene, value in gene_data_dict.items():
    data_f = pd.DataFrame({
        'ens_abs_diff': np.sqrt(value['ens_abs_diff']),
        'jsd_diff': np.sqrt(value['jsd_diff']),
        'Species1': [x['ingroup1'] for x in value['species_names']],
        'Species2': [x['ingroup2'] for x in value['species_names']],
        'Species3': [x['outgroup'] for x in value['species_names']]
    })

    data_long = pd.melt(
        data_f,
        id_vars=['ens_abs_diff', 'jsd_diff'],
        value_vars=['Species1', 'Species2', 'Species3'],
        var_name='Species_Position',
        value_name='Species'
    )
    data_long['ens_abs_diff'] = data_long['ens_abs_diff']
    data_long['jsd_diff'] = data_long['jsd_diff']
    data_long['Species'] = data_long['Species'].astype(str)
    data_long['Species'] = data_long['Species'].astype('category')

    species_number_dict[gene] = len(set(data_long['Species']))
    
correlation_list3 = {}
p_value_list3 = {}
for gene, lists in gene_data_dict.items():
    jsd_diff_list, ens_abs_diff_list = remove_outliers_iqr(lists['jsd_diff'], lists['ens_abs_diff'])

    #Add the correlation factor in the list
    cor, p_value = scipy.stats.spearmanr(jsd_diff_list, ens_abs_diff_list)
    correlation_list3[gene] = cor
    p_value_list3[gene] = p_value 

corrected_p_value_jad = {gene: p_value_list3[gene]*species_number_dict[gene] for gene in gene_data_dict.keys()}

significant_genes_corrected = [gene for gene in gene_data_dict.keys() if corrected_p_value_jad[gene] < 0.05]
significant_genes_correlation_dict = {gene: correlation_list3[gene] for gene in significant_genes_corrected}

# Step 2: Apply Benjamini-Hochberg procedure
# Create a DataFrame with genes and p-values
results_df1 = pd.DataFrame({
    'Gene': list(correlation_list3.keys()),
    'Observed_Correlation': list(correlation_list3.values()),
    'P_Value': list(corrected_p_value_jad.values())
})

# Remove genes with NaN p-values
results_df1 = results_df1.dropna(subset=['P_Value'])

# Sort by p-value
results_df1 = results_df1.sort_values('P_Value')

# Number of tests
m1 = len(results_df1)

# Desired FDR level
alpha = 0.05

# Rank the p-values
results_df1['Rank'] = np.arange(1, m1+1)

# Calculate the BH critical values
results_df1['BH_Critical'] = results_df1['Rank'] / m1 * alpha

# Determine significance
results_df1['BH_Significant'] = results_df1['P_Value'] <= results_df1['BH_Critical']

# Find the largest p-value that is significant
significant_results1 = results_df1[results_df1['BH_Significant']]

if not significant_results1.empty:
    max_rank = significant_results1['Rank'].max()
    # All p-values up to max_rank are significant
    results_df1['BH_Final_Significant'] = results_df1['Rank'] <= max_rank
else:
    results_df1['BH_Final_Significant'] = False

# Display significant results
significant_genes1 = results_df1[results_df1['BH_Final_Significant']]

significant_correlated_genes1 = significant_genes1[significant_genes1['Observed_Correlation'] > 0.2]

In [192]:
significant_correlated_genes1

Unnamed: 0,Gene,Observed_Correlation,P_Value,Rank,BH_Critical,BH_Significant,BH_Final_Significant
117,ENSG00000117114,0.835605,1.230200e-63,1,0.000385,True,True
27,ENSG00000162688,0.843212,8.305287e-61,2,0.000769,True,True
121,ENSG00000170242,0.863528,1.349069e-59,3,0.001154,True,True
54,ENSG00000065613,0.841349,6.000199e-52,4,0.001538,True,True
123,ENSG00000143493,0.751549,7.664171e-46,5,0.001923,True,True
...,...,...,...,...,...,...,...
99,ENSG00000133056,0.259195,1.192097e-02,85,0.032692,True,True
17,ENSG00000058668,0.232437,1.198305e-02,86,0.033077,True,True
98,ENSG00000075151,0.234532,1.936494e-02,87,0.033462,True,True
21,ENSG00000174576,0.271797,2.301740e-02,88,0.033846,True,True


In [194]:
fig2 = px.box(correlation_list3.values(), title= 'Pearson Correlation Test JAD Difference Vs. ENS difference')
# Update layout with labels and title
fig2.update_layout(
    template='plotly_white',
    margin=dict(l=20, r=20, t=50, b=20),
    autosize=True,
    yaxis_title_font={'size': 20},  
    xaxis_title_font={'size': 20}, 
    xaxis_title='Correlation Coefficient',
    yaxis_title='Value',
    width=None
)
fig2.show()
# fig2.write_image('jsd_ens_diff_cc.pdf')

In [214]:
gene_names = list(results_df1['Gene'])
# Create the bar chart
fig = go.Figure(go.Bar(
    x=list(results_df1['Observed_Correlation']),
    y=gene_names,
    orientation='h',  # Horizontal bar chart
    marker=dict(color=list(significant_correlated_genes1['P_Value']), coloraxis="coloraxis")  # Color based on the correlation factor
))
fig.add_shape(
    type="line",
    x0=0.2, y0=0, x1=0.2, y1=len(gene_names),
    line=dict(color="yellow", width=3, dash="dashdot"),
)


# Update layout for better visualization
fig.update_layout(
    xaxis_title="Correlation Coefficient",
    yaxis_title="Gene ID",
    coloraxis=dict(colorscale='Viridis'),  # Red for higher, blue for lower values
    height=1500  # Adjust height based on the number of genes to avoid squeezing
)

fig.show()

In [212]:
import plotly.express as px

# Data for histogram
values3 = list(results_df1['Observed_Correlation'])

# Create the histogram with density normalization
fig2 = px.histogram(
    values3,
    labels={'x': 'Correlation Coefficient', 'y': 'Density'},
    title=None,
    color_discrete_sequence=['#F4A300'],  # Set the color to a shade of orange
    histnorm='probability'  # Normalize the histogram to density
)   

# Update layout for presentation
fig2.update_layout(
    template='plotly_white',
    margin=dict(l=50, r=50, t=50, b=50),  # Adjust margins for a balanced look
    autosize=True,
    yaxis_title='<b>Probability</b>',  # Explicit y-axis title
    xaxis_title='<b>Correlation Coefficient (Spearman)</b>',  # Explicit x-axis title
    yaxis_title_font=dict(size=22),  # Adjust y-axis font size
    xaxis_title_font=dict(size=22),  # Adjust x-axis font size
    font=dict(size=18),  # General font size for labels and titles
    width=800,  # Set figure width (optional for better control)
    height=500,  # Set figure height (optional for better control)
    showlegend=False  # Remove the legend
)

fig2.add_shape(
    type="line",
    x0=0.2, y0=0, x1=0.2, y1=0.2,
    line=dict(color="red", width=4, dash="dashdot"),
)

# Set transparency level and add a solid line around each bar
fig2.update_traces(
    opacity=0.8,  # Set the transparency (0 = fully transparent, 1 = fully opaque)
    marker_line_color='black',  # Color of the line around each bar
    marker_line_width=1.5  # Width of the line around each bar
)


fig2.show()


fig2.write_image('/Users/gulugulu/repos/PuningAnalysis/results/figures/correlation_coefficient.pdf')


In [215]:
correlation_list3

{'ENSG00000137962': 0.5086889323577,
 'ENSG00000122482': 0.707100654789097,
 'ENSG00000116688': 0.19255296324868645,
 'ENSG00000122218': 0.03187622392335205,
 'ENSG00000054523': 0.03491225946607122,
 'ENSG00000030066': 0.31915613147914035,
 'ENSG00000177888': 0.3553500230545987,
 'ENSG00000117707': 0.07924470839043704,
 'ENSG00000122483': 0.15047452102022202,
 'ENSG00000138119': 0.4712037952032477,
 'ENSG00000107611': 0.2731737966610338,
 'ENSG00000149256': 0.2637244577227469,
 'ENSG00000085999': 0.5584775793921222,
 'ENSG00000187122': 0.43298106722994617,
 'ENSG00000135372': 0.3017929353998365,
 'ENSG00000197147': 0.6203589338624874,
 'ENSG00000148948': -0.0003109533310875593,
 'ENSG00000058668': 0.232437108708025,
 'ENSG00000116991': 0.48877073141629884,
 'ENSG00000165494': 0.11793556967637063,
 'ENSG00000066135': 0.15702377818724228,
 'ENSG00000174576': 0.2717970652102389,
 'ENSG00000148516': 0.6642292205924647,
 'ENSG00000163399': 0.41696608210499286,
 'ENSG00000129173': 0.51865254

## Mixed linear model with random effect

In [118]:
# Example structure of your data
import pandas as pd
import statsmodels.formula.api as smf

def get_smf_mixedlm_result(gene, gene_data_dict):
    # Step 1: Get the data for the gene
    value = gene_data_dict[gene]
    data_f = pd.DataFrame({
        'ens_abs_diff': np.sqrt(value['ens_abs_diff']),
        'jsd_diff': np.sqrt(value['jsd_diff']),
        'Species1': [x['ingroup1'] for x in value['species_names']],
        'Species2': [x['ingroup2'] for x in value['species_names']],
        'Species3': [x['outgroup'] for x in value['species_names']]
    })

    data_long = pd.melt(
        data_f,
        id_vars=['ens_abs_diff', 'jsd_diff'],
        value_vars=['Species1', 'Species2', 'Species3'],
        var_name='Species_Position',
        value_name='Species'
    )

    # Step 3: Adjust variables
    data_long['ens_abs_diff'] = data_long['ens_abs_diff'] / 3
    data_long['jsd_diff'] = data_long['jsd_diff'] / 3

    # Step 4: Convert species identifiers to strings and factors
    data_long['Species'] = data_long['Species'].astype(str)
    data_long['Species'] = data_long['Species'].astype('category')

    model = smf.mixedlm('ens_abs_diff ~ jsd_diff', data=data_long, groups=data_long['Species'])
    result = model.fit()

    var_random = result.cov_re.iloc[0, 0]

    # Fixed effects variance
    # Calculate variance of the linear predictor (fixed effects)
    fixed_effects = result.fe_params
    X = result.model.exog
    var_fixed = np.var(np.dot(X, fixed_effects))

    # Residual variance
    var_residual = result.scale

    # Marginal R-squared (fixed effects only)
    R_m2 = var_fixed / (var_fixed + var_random + var_residual)

    # Conditional R-squared (fixed + random effects)
    R_c2 = (var_fixed + var_random) / (var_fixed + var_random + var_residual)

    return result, R_m2, R_c2

In [56]:
multiple_linear_regression_data = {'gene': {}, 'p_value':{}, 'marginal_r^2': {}, 'conditional_r^2': {}}
for gene, value in gene_data_dict.items():
    data_f = pd.DataFrame({
        'ens_abs_diff': np.sqrt(value['ens_abs_diff']),
        'jsd_diff': np.sqrt(value['jsd_diff']),
        'Species1': [x['ingroup1'] for x in value['species_names']],
        'Species2': [x['ingroup2'] for x in value['species_names']],
        'Species3': [x['outgroup'] for x in value['species_names']]
    })

    data_long = pd.melt(
        data_f,
        id_vars=['ens_abs_diff', 'jsd_diff'],
        value_vars=['Species1', 'Species2', 'Species3'],
        var_name='Species_Position',
        value_name='Species'
    )

    # Step 3: Adjust variables
    data_long['ens_abs_diff'] = data_long['ens_abs_diff'] / 3
    data_long['jsd_diff'] = data_long['jsd_diff'] / 3

    # Step 4: Convert species identifiers to strings and factors
    data_long['Species'] = data_long['Species'].astype(str)
    data_long['Species'] = data_long['Species'].astype('category')

    model = smf.mixedlm('ens_abs_diff ~ jsd_diff', data=data_long, groups=data_long['Species'])
    result = model.fit()

        # Random effects variance
    var_random = result.cov_re.iloc[0, 0]

    # Fixed effects variance
    # Calculate variance of the linear predictor (fixed effects)
    fixed_effects = result.fe_params
    X = result.model.exog
    var_fixed = np.var(np.dot(X, fixed_effects))

    # Residual variance
    var_residual = result.scale

    # Marginal R-squared (fixed effects only)
    R_m2 = var_fixed / (var_fixed + var_random + var_residual)

    # Conditional R-squared (fixed + random effects)
    R_c2 = (var_fixed + var_random) / (var_fixed + var_random + var_residual)

    multiple_linear_regression_data['gene'][gene] = gene
    multiple_linear_regression_data['p_value'][gene] = result.pvalues['jsd_diff']
    multiple_linear_regression_data['marginal_r^2'][gene] = R_m2
    multiple_linear_regression_data['conditional_r^2'][gene] = R_c2



The MLE may be on the boundary of the parameter space.


The MLE may be on the boundary of the parameter space.


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with cg


Maximum Likelihood optimization failed to converge. Check mle_retvals


MixedLM optimization failed, trying a different optimizer may help.


Gradient optimization failed, |grad| = 3.625205


The MLE may be on the boundary of the parameter space.


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with cg


Maximum Likelihood optimization failed to converge. Check mle_retvals


MixedLM optimization failed, trying a different optimizer may help.


Gradient optimization failed, |grad| = 1

In [150]:
model_result_test, marginal_r_2, conditional_r_2 = get_smf_mixedlm_result('ENSG00000137962', gene_data_dict)


The MLE may be on the boundary of the parameter space.



In [151]:
random_effect = model_result_test.random_effects
random_effect_list = [random_effect[species]['Group'] for species in random_effect.keys()]
predicted_ens = model_result_test.fittedvalues

In [156]:
fig_predicted_observed_ens = go.Figure()
fig_predicted_observed_ens.add_trace(go.Scatter(x=predicted_ens, y= np.sqrt(gene_data_dict['ENSG00000137962']['ens_abs_diff'])/3, mode='markers'))
fig_predicted_observed_ens.update_layout(title='Predicted vs Observed ENS', xaxis_title='Predicted ENS', yaxis_title='Observed ENS')

In [126]:
px.histogram(random_effect_list, title='Random Effects Distribution')

In [77]:
multiple_linear_regression_data['r^2_difference'] = {gene: multiple_linear_regression_data['conditional_r^2'][gene] - multiple_linear_regression_data['marginal_r^2'][gene] for gene in multiple_linear_regression_data['gene']}

In [78]:
r_squared_data = pd.DataFrame(multiple_linear_regression_data)

In [79]:
r_squared_data

Unnamed: 0,gene,p_value,marginal_r^2,conditional_r^2,r^2_difference
ENSG00000137962,ENSG00000137962,6.616652e-48,0.241683,0.355674,0.113992
ENSG00000122482,ENSG00000122482,9.734192e-268,0.667824,0.716112,0.048288
ENSG00000116688,ENSG00000116688,6.137562e-09,0.043260,0.099194,0.055934
ENSG00000122218,ENSG00000122218,8.950968e-06,0.031721,0.124582,0.092861
ENSG00000054523,ENSG00000054523,4.118705e-11,0.054168,0.184845,0.130677
...,...,...,...,...,...
ENSG00000162775,ENSG00000162775,4.027469e-03,0.015258,0.185814,0.170556
ENSG00000023839,ENSG00000023839,2.620751e-10,0.074492,0.264085,0.189594
ENSG00000135829,ENSG00000135829,1.883872e-28,0.168058,0.279645,0.111587
ENSG00000177853,ENSG00000177853,6.335258e-13,0.107783,0.264515,0.156732


In [81]:
import plotly.graph_objects as go
import pandas as pd

# Create a stacked plot using Plotly




In [209]:
import plotly.graph_objects as go
import pandas as pd

# Create a stacked plot using Plotly

fig = go.Figure(go.Bar(
    y=r_squared_data['gene'],
    x=r_squared_data['r^2_difference'],
    orientation='h',  # Horizontal bar chart
    marker_color='lightsalmon',
))

# fig.add_shape(
#     type="line",
#     x0=0.1, y0=0, x1=0.1, y1=len(gene_names),
#     line=dict(color="yellow", width=3, dash="dashdot"),
# )

# # Update layout for stacked bar chart
# fig.update_layout(
#     barmode='stack',

#     showlegend=True
# )

# Update layout for better visualization
fig.update_layout(
    title='Difference between Marginal R² and Conditional R² for Each Gene',
    yaxis_title='Gene',
    xaxis_title='R² Value',
    legend_title='R² Type',
    template='plotly_white',
    margin=dict(l=60, r=30, t=80, b=60),
    height=1500  # Adjust height based on the number of genes to avoid squeezing
)

fig.show()



In [202]:
# Create Bar traces for Marginal R² and Conditional R²
trace_marginal = go.Bar(
    x=r_squared_data.index,  # Use the DataFrame index for x-axis positions
    y=r_squared_data['marginal_r^2'],
    name='Marginal R²',
    marker_color='indianred',
    hovertext=r_squared_data['gene'],  # Show gene names on hover
    hoverinfo='text+y'  # Display gene name and R² value on hover
)

trace_conditional = go.Bar(
    x=r_squared_data.index,  # Use the DataFrame index for x-axis positions
    y=r_squared_data['conditional_r^2'],
    name='Conditional R²',
    marker_color='lightsalmon',
    hovertext=r_squared_data['gene'],  # Show gene names on hover
    hoverinfo='text+y'  # Display gene name and R² value on hover
)

# Combine the traces
data_traces = [trace_marginal, trace_conditional]

# Define the layout
layout = go.Layout(
    title='Marginal vs Conditional R² by Gene',
    xaxis=dict(
        title='Gene',
        # tickvals=r_squared_data.index,  # Position ticks based on index
        # ticktext=[''] * len(r_squared_data),  # Remove gene names by setting empty labels
        showticklabels=False,  # Completely hide tick labels
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        title='R² Value',
        tickfont=dict(size=14),
        range=[0, 1]  # Assuming R² ranges between 0 and 1
    ),
    barmode='group',  # Side-by-side bars
    bargap=0.15,      # Gap between groups of bars
    bargroupgap=0.1,  # Gap between bars within a group
    legend=dict(
        x=0.85,
        y=1.15,
        bgcolor='rgba(255,255,255,0)',
        bordercolor='rgba(255,255,255,0)'
    ),
    template='plotly_white',
    margin=dict(l=60, r=30, t=80, b=60)
)

# Create the figure
fig = go.Figure(data=data_traces, layout=layout)

# Display the plot
fig.show()

In [85]:
len([x for x in multiple_linear_regression_data['marginal_r^2'].values() if x > 0.1])

91

In [86]:
len([x for x in multiple_linear_regression_data['conditional_r^2'].values() if x > 0.1])

126

In [63]:
# from statsmodels.stats.stattools import durbin_watson

# y = gene_data_dict['ENSG00000030066']['ens_abs_diff']
# y_pred = gene_data_dict['ENSG00000030066']['jsd_diff']
# # Durbin-Watson test for residuals
# dw_stat = durbin_watson(y - y_pred)

Pearson Correlation Test In-group JSD Vs. ENS difference

In [64]:
correlation_list4 = {}
p_value_list4 = {}
for gene, lists in gene_data_dict.items():
    ingroup_jsd_list, ens_abs_diff_list = remove_outliers_iqr(lists['ingroup_jsd'], lists['ens_abs_diff'])

    #Add the correlation factor in the list
    cor, p_value = scipy.stats.spearmanr(ingroup_jsd_list, ens_abs_diff_list)
    correlation_list4[gene] = cor
    p_value_list4[gene] = p_value 

fig3 = px.box(correlation_list4.values(), title= 'Pearson Correlation Test In-group JSD Vs. ENS difference')
# Update layout with labels and title
fig3.update_layout(
    template='plotly_white',
    margin=dict(l=20, r=20, t=50, b=20),
    autosize=True,
    yaxis_title_font={'size': 20},  
    xaxis_title_font={'size': 20}, 
    xaxis_title='Correlation Coefficient',
    yaxis_title='Value',
    width=None
)
fig3.show()
# fig2.write_image('jsd_ens_diff_cc.pdf')



In [66]:
count = len([x for x in p_value_list4.values() if x < 0.05])
count

113

In [69]:
count = len([x for x in correlation_list4.values() if x > 0.2])
count

96

## Scatter plot for each gene

In [None]:


def plot_gene_data(gene_data_dict, xcol, ycol):
    keys = list(gene_data_dict.keys())
    rows = int(len(keys) ** 0.5) + 1  # Calculate the number of rows for subplots
    cols = (len(keys) + rows - 1) // rows  # Calculate the number of columns

    fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f'{key}' for key in keys])
    
    # Populate subplots
    for index, key in enumerate(keys, start=1):
        gene_data = gene_data_dict[key]
        x_value, y_value = remove_outliers_iqr(gene_data[xcol], gene_data[ycol])

        row = (index - 1) // cols + 1
        col = (index - 1) % cols + 1
        
        fig.add_trace(
            go.Scatter(
                x=x_value,
                y=y_value,
                mode='markers',
                name=f'{key}'
            ),
            row=row,
            col=col
        )
        
        # Adding a trend line
        fig.add_trace(
            go.Scatter(
                x=x_value,
                y=np.poly1d(np.polyfit(x_value, y_value, 1))(x_value),
                mode='lines',
                name=f'Trend {key}',
                line=dict(color='red')
            ),
            row=row,
            col=col
        )
        
        # Update axis properties
        fig.update_xaxes(title_text=xcol if row == rows else "", row=row, col=col)
        fig.update_yaxes(title_text=ycol if col == 1 else "", row=row, col=col)
    
    fig.update_layout(
        height=300 * rows,  # Set a reasonable height based on the number of rows
        width=300 * cols,   # Set a reasonable width based on the number of columns
        showlegend=False
    )
    
    return fig

In [None]:
# Usage example
fig = plot_gene_data(gene_data_dict, 'jsd_diff', 'ens_abs_diff')

fig.update_layout(
title_text="Scatter Plots of JAD Difference vs. ENS Difference",)
fig.show()

In [None]:
path = '/Users/gulugulu/Desktop/honours/data_local_2/triples_model_fitting_550_threshold//ENSG00000048707'


In [None]:
triads_data_path = os.path.join(path, 'triples_info_dict.json')
triads_info = load_json_data(triads_data_path)
ens_ingroup_list = []
ingroup_jsd_list = []
for identifier, info in triads_info.items():
    triads_info_value = info['triples_info_small_tree']
    triads_names = info['triples_species_names']
    ingroup_jsd = triads_info_value['ingroup_jsd']
    ens_dict = triads_info_value['ens']
    ens_ingroup = abs(ens_dict[triads_names['ingroup1']] - ens_dict[triads_names['ingroup2']])
    ens_ingroup_list.append(ens_ingroup)
    ingroup_jsd_list.append(ingroup_jsd)


ens_ingroup_list2, ingroup_jsd_list2 = remove_outliers_iqr(ens_ingroup_list, ingroup_jsd_list)


In [None]:
indices_ens_diff2 = list(range(1, len(ens_ingroup_list2) + 1))
indices_ingroup_jsd2 = list(range(1, len(ingroup_jsd_list2) + 1))

list_pair = []
for i in range(len(indices_ens_diff2)):
    jsd_value, ens = ingroup_jsd_list2[i], ens_ingroup_list2[i]
    list_pair.append((jsd_value, ens))

# Create the scatter plot
fig1 = go.Figure()

# Add ENS Differences scatter plot
fig1.add_trace(go.Scatter(
    x=indices_ingroup_jsd2, y=ens_ingroup_list2, mode='markers',
    marker=dict(size=4), name='ENS Differences'))

fig1.add_trace(go.Scatter(
    x=indices_ingroup_jsd2, y=ingroup_jsd_list2, mode='markers',
    marker=dict(size=4), name='Ingroup JSD'))


# Update layout for clear visualization
fig1.update_layout(
    title='Uniform distirbution of each property',
    xaxis_title='Index',
    yaxis_title='Values',
    showlegend=True,
    
)

fig1.show()

In [None]:
sorted_data = sorted(list_pair, key=lambda x: x[1])

In [None]:
x_values = [x[0] for x in sorted_data]
y_values = [y[0] for y in sorted_data]

In [None]:
# Create the scatter plot
fig1 = go.Figure()

# Add ENS Differences scatter plot
fig1.add_trace(go.Scatter(
    x=indices_ingroup_jsd2, y=sorted(ens_ingroup_list2), mode='markers',
    marker=dict(size=4), name='ENS Differences'))

fig1.add_trace(go.Scatter(
    x=indices_ingroup_jsd2, y=sorted(ingroup_jsd_list2), mode='markers',
    marker=dict(size=4), name='Ingroup JSD'))


# Update layout for clear visualization
fig1.update_layout(
    title='Uniform distirbution of each property',
    xaxis_title='Index',
    yaxis_title='Values',
    showlegend=True,
    
)

fig1.show()

In [None]:
import numpy as np
import plotly.graph_objects as go
from scipy.stats import uniform

def qq_plot_uniform(data):
    """
    Creates a QQ plot of the provided data against a uniform distribution using Plotly.

    Args:
    data (array-like): The dataset to plot.
    a (float): The lower bound of the uniform distribution (default 0).
    b (float): The upper bound of the uniform distribution (default 1).
    """
    
    # Scale data for the specified uniform range
    data = np.array(data)
    data.sort()  # Sort the data for plotting

    a = min(data)
    b = max(data)
    scaled_data = (data - a) / (b - a)

    # Calculate quantiles
    n = len(data)
    theoretical_quantiles = uniform.ppf(np.arange(1, n + 1) / (n + 1))

    # Create a QQ plot
    fig = go.Figure()

    # Adding scatter plot for QQ plot
    fig.add_trace(go.Scatter(x=theoretical_quantiles, y=scaled_data, mode='markers',
                             name=None,
                             marker=dict(color='blue')))

    # Add line of perfect fit
    fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines',
                             name=None,
                             line=dict(color='red', dash='dash')))

    # Update layout
    fig.update_layout(
                      xaxis_title='Uniform Quantiles',
                      yaxis_title='Sample Quantiles',
                      width=600, height=600)

    # Show plot
    fig.show()

In [None]:
path_jsd = '/Users/gulugulu/Desktop/honours/data_local_2/triples_550_threshold/ENSG00000009307/jsd_bins.json'
with open(path_jsd, 'r') as file:
    jsd_bins = json.load(file)

In [None]:
path_ens = '/Users/gulugulu/Desktop/honours/data_local_2/triples_550_threshold/ENSG00000009307/ens_diff_bins.json'
with open(path_ens, 'r') as file:
    ens_bins = json.load(file)

In [None]:
jsd_binning_list = [value['triples_info_big_tree']['jsd_dict']['Ingroup_JSD'] for value in jsd_bins['bins'].values() if value is not None]
ens_binning_list = [value['triples_info_big_tree']['ens_difference'] for value in ens_bins['bins'].values() if value is not None]

In [None]:
qq_plot_uniform(ingroup_jsd_list)

In [None]:
# Create the scatter plot
fig1 = go.Figure()

# Add ENS Differences scatter plot
fig1.add_trace(go.Scatter(
    x=np.sqrt(x_values), y=sorted(y_values), mode='markers',
    marker=dict(size=4), name='ENS Differences'))



# Update layout for clear visualization
fig1.update_layout(
    title='Uniform distirbution of each property',
    xaxis_title='Index',
    yaxis_title='Values',
    showlegend=True,
    
)

fig1.show()

In [None]:
import numpy as np
import scipy.stats as stats

def permute_test_correlation(x, y, n_permutations=10000):
    # Calculate the actual correlation
    actual_corr, _ = stats.pearsonr(x, y)
    
    # To hold the permuted correlations
    permuted_corrs = []

    original_y = np.copy(y)
    
    # Permutation test
    for _ in range(n_permutations):
        # Shuffle one of the lists
        np.random.shuffle(y)
        # Compute the correlation of the permuted data
        perm_corr, _ = stats.pearsonr(x, y)
        permuted_corrs.append(perm_corr)

        y[:] = original_y
    
    # Compute p-value: proportion of permuted correlations as extreme as the actual one
    p_value = np.sum(np.abs(permuted_corrs) >= np.abs(actual_corr)) / n_permutations
    
    return actual_corr, p_value, permuted_corrs





# Factor impacts the correlation analysis

Sequence composition analysis

In [None]:
import os
import json

def extract_internal_root_distribution(base_path):
    gene_dirs = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]
    all_data = {}

    for gene_id in gene_dirs:
        gene_path = os.path.join(base_path, gene_id)
        internal_root_distribution = {}

        # Files to process
        path = os.path.join(gene_path, 'triples_info_dict.json')
        if os.path.exists(path):
            with open(path, 'r') as f:
                data = json.load(f)
                for identifier, content in data.items():
                    if content: 
                        triads_info = content['triples_info_small_tree']
                        nuc_freqs_dict = triads_info['nuc_freqs_dict']
                        internal_root_info = nuc_freqs_dict["internal_node"]
                        internal_root_distribution[str(identifier)] = internal_root_info

        all_data[gene_id] = internal_root_distribution

    return all_data

# Base directory containing all gene ID folders
base_path = '/Users/gulugulu/Desktop/honours/data_local_2/triples_model_fitting_550_threshold'
result = extract_internal_root_distribution(base_path)

information_dict = {}
for gene_id, distirbutions_info in result.items():
    information_dict[gene_id] = {}
    for identifier, distirbution in distirbutions_info.items():
        information_dict[gene_id][identifier] = calculate_information(distirbution)

# # Optionally, save this data to a file
# output_file = os.path.join(base_path, 'internal_root_distributions.json')
# with open(output_file, 'w') as f:
#     json.dump(result, f, indent=4)

In [None]:
average_information_dict = {}
for gene, informations in information_dict.items():
    average_information_dict[gene] = np.average(list(informations.values()))

In [None]:
average_distribution_dir = {}
for gene, distributions in result.items():
    average_distribution_dir[gene] = np.mean(np.array(list(distributions.values())), axis=0)

In [None]:
average_info_dict = {gene: calculate_information(distribution) for gene, distribution in average_distribution_dir.items()}

In [None]:
import plotly.express as px
corr_list_sorted = {gene: correlation_list3[gene] for gene in average_info_dict.keys()}
fig = px.scatter(x = average_info_dict.values(), y = corr_list_sorted.values(), labels={'x':'Information', 'y':'Correlation Coefficient'}, title= None)
# Update layout with labels and title
fig.update_layout(
    template='plotly_white',
    margin=dict(l=20, r=20, t=50, b=20),
    autosize=True,
    yaxis_title_font={'size': 20},  
    xaxis_title_font={'size': 20}, 
    width=None 
)
fig.show()

Genome length analysis

In [319]:
from cogent3 import open_data_store, get_app
load_json_app = get_app("load_json")

big_tree_res_dir = '/Users/gulugulu/Desktop/honours/data_local_2/whole_gene_model_fitting_550_threshold'

def extract_big_tree_info(base_path):
    ens_info_dict = {}
    result_input_dstore = open_data_store(base_path, suffix='json', mode='r')
    for path in result_input_dstore:
        trace_back_nodes = {}
        gene_name = path.unique_id.split('.')[0]
        print(gene_name)
        res_lf = load_json_app(path).lf
        tip_names = res_lf.tree.get_tip_names()
        for speices in tip_names:
            trace_back_path = []
            for tree in res_lf.tree.get_connecting_edges('root', speices):
                internal_root = tree.get_node_names()[0]
                trace_back_path.append(internal_root)
            trace_back_nodes[speices] = trace_back_path
        ens_dict = {species: np.sum([res_lf.get_ens_tree().to_rich_dict()['edge_attributes'][n]['length'] for n in trace_back_nodes[species][1:]]) for species in res_lf.tree.get_tip_names()}
        ens_info_dict[gene_name] = ens_dict
        print('finished')

    return ens_info_dict

In [320]:
ens_big_tree_info_dir = extract_big_tree_info(big_tree_res_dir)

ENSG00000158636
finished
ENSG00000122482
finished
ENSG00000148948
finished
ENSG00000198216
finished
ENSG00000110422
finished
ENSG00000129173
finished
ENSG00000116688
finished
ENSG00000163399
finished
ENSG00000116128
finished
ENSG00000162402
finished
ENSG00000116539
finished
ENSG00000170242
finished
ENSG00000137474
finished
ENSG00000162711
finished
ENSG00000107611
finished
ENSG00000175216
finished
ENSG00000176986
finished
ENSG00000109927
finished
ENSG00000151461
finished
ENSG00000110497
finished
ENSG00000009307
finished
ENSG00000129083
finished
ENSG00000265203
finished
ENSG00000214655
finished
ENSG00000196878
finished
ENSG00000111642
finished
ENSG00000052841
finished
ENSG00000117000
finished
ENSG00000167986
finished
ENSG00000173662
finished
ENSG00000030066
finished
ENSG00000135829
finished
ENSG00000122483
finished
ENSG00000133816
finished
ENSG00000143702
finished
ENSG00000143493
finished
ENSG00000197147
finished
ENSG00000142661
finished
ENSG00000197106
finished
ENSG00000064309
finished


In [330]:
ens_big_tree_info_dir

{'ENSG00000158636': {'Elephant': 0.0620200346348242,
  'Armadillo': 0.10266207112037654,
  'Polar_bear': 0.2688072947855994,
  'Giant_panda': 0.1045324602249872,
  'Ferret': 0.11479784860105083,
  'Dingo': 0.1330613451237711,
  'Red_fox': 0.0991887826063681,
  'Cat': 0.09925595768157508,
  'Leopard': 0.09486309931804414,
  'Horse': 0.11819660019788508,
  'Narwhal': 0.05736135756623854,
  'Beluga_whale': 0.05736135756657993,
  'Vaquita': 0.058759617803643235,
  'Sperm_whale': 0.06597057073512767,
  'Blue_whale': 0.04837353057847224,
  'Goat': 0.10115710469520423,
  'Wild_yak': 0.10829890065671616,
  'American_bison': 0.1082885453076817,
  'Siberian_musk_deer': 0.10121064767131144,
  'Yarkand_deer': 0.10401476671664872,
  'Pig': 0.08121384365478963,
  'Chacoan_peccary': 0.090219905889285,
  'Arabian_camel': 0.05857229744536786,
  'Greater_horseshoe_bat': 0.08492333754608321,
  'Megabat': 0.05439423190289376,
  'Microbat': 0.05583809205446748,
  'Kangaroo_rat': 0.14449543560036782,
  'Ryu

In [328]:
ens_big_tree_info_average_ens_dir = {gene: np.mean(list(ens_info.values())) for gene, ens_info in ens_big_tree_info_dir.items()}
ens_big_tree_info_average_ens_dir_filtered = {gene: ens_big_tree_info_average_ens_dir[gene] for gene in ens_big_tree_info_average_ens_dir.keys() if ens_big_tree_info_average_ens_dir[gene] < 2 }
corr_list_length_sorted = {gene: correlation_list3[gene] for gene in ens_big_tree_info_average_ens_dir_filtered.keys()}

In [329]:
import plotly.express as px

fig = px.scatter(x = ens_big_tree_info_average_ens_dir_filtered.values(), y = corr_list_length_sorted.values(), labels={'x':'Average Length', 'y':'Correlation Coefficient'}, trendline="ols",title= None)
# Update layout with labels and title
fig.update_layout(
    template='plotly_white',
    margin=dict(l=20, r=20, t=50, b=20),
    autosize=True,
    yaxis_title_font={'size': 20},  
    xaxis_title_font={'size': 20}, 
    width=None 
)
fig.show()

In [323]:
del ens_big_tree_info_average_ens_dir['ENSG00000134250']
del ens_big_tree_info_average_ens_dir['ENSG00000110237']
del ens_big_tree_info_average_ens_dir['ENSG00000126705']
del ens_big_tree_info_average_ens_dir['ENSG00000214655']


In [324]:
import plotly.express as px
corr_list_length_sorted = {gene: correlation_list3[gene] for gene in ens_big_tree_info_average_ens_dir.keys()}
fig = px.scatter(x = ens_big_tree_info_average_ens_dir.values(), y = corr_list_length_sorted.values(), labels={'x':'Average Length', 'y':'Correlation Coefficient'}, trendline="ols", title= None)
# Update layout with labels and title
fig.update_layout(
    template='plotly_white',
    margin=dict(l=20, r=20, t=50, b=20),
    autosize=True,
    yaxis_title_font={'size': 20},  
    xaxis_title_font={'size': 20}, 
    width=None 
)
fig.show()

In [279]:
def extract_ens(base_path):
    gene_dirs = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]
    all_data = {}

    for gene_id in gene_dirs:
        gene_path = os.path.join(base_path, gene_id)
        ens_dict_values = {}

        # Files to process
        path = os.path.join(gene_path, 'triples_info_dict_new.json')
        if os.path.exists(path):
            with open(path, 'r') as f:
                data = json.load(f)
                for identifier, content in data.items():
                    if content: 
                        triads_info = content['triples_info_small_tree']
                        ens_dict = triads_info['ens']

                        ens_values = list(ens_dict.values())
                        internal_edge_length = ens_values[3]
                        ens_dict_values[str(identifier)] = [a + internal_edge_length for a in ens_values[0:2]]
        all_data[gene_id] = ens_dict_values

    return all_data

In [286]:
ens_info_dir = extract_ens(base_dir)

ens_info_average_small_tree_dir = {}
for gene, ens_info in ens_info_dir.items():
    ens_info_average_small_tree_dir[gene] = np.mean(np.array(list(ens_info.values())), axis=0)
overall_ens = {gene: np.mean(ens_info_average_small_tree_dir[gene]) for gene in ens_info_average_small_tree_dir.keys()}

In [287]:
import plotly.express as px
corr_list_length_sorted = {gene: correlation_list3[gene] for gene in overall_ens.keys()}
fig = px.scatter(x = overall_ens.values(), y = corr_list_length_sorted.values(), labels={'x':'Average in-group branch Length', 'y':'Correlation Coefficient'}, trendline="ols", title= None)
# Update layout with labels and title
fig.update_layout(
    template='plotly_white',
    margin=dict(l=20, r=20, t=50, b=20),
    autosize=True,
    yaxis_title_font={'size': 20},  
    xaxis_title_font={'size': 20}, 
    width=None 
)
fig.show()

In [288]:
cor, p = scipy.stats.pearsonr(list(overall_ens.values()), list(corr_list_length_sorted.values()))
cor, p

(-0.09394563483996106, 0.28771255327487877)