In [3]:
#parameter set up
import json
# Read the matrix/motif JSON file into `matrix_motif`.
# NOTE(review): `matrix_motif` is not referenced again in the cells visible here.
with open('../../../data/dataset1_16s-10_initial_alignments/matrix_motif.json', 'r') as file:
    matrix_motif = json.load(file)

In [37]:
def _load_results(seq_length):
    """Read the saved simulation results for one sequence length.

    Parameters
    ----------
    seq_length : int
        Sequence length that appears in the file name
        ``../../../results/length_<seq_length>.json``.

    Returns
    -------
    The decoded JSON content of that file.

    Raises
    ------
    FileNotFoundError
        If the results file does not exist.
    """
    with open(f'../../../results/length_{seq_length}.json', 'r') as handle:
        return json.load(handle)


# One results object per simulated sequence length, in ascending order.
results_1000 = _load_results(1000)
results_2000 = _load_results(2000)
results_3000 = _load_results(3000)
results_4000 = _load_results(4000)

In [None]:
# Time points paired with the length-1000/2000/3000 results.
time_range = [0.5, 1, 2, 3, 4]
# Shorter list (no t = 4), used for the length-4000 statistics.
time_range_ = [0.5, 1, 2, 3]

In [None]:
# Re-key every results dict by time point. The pairing is positional (the i-th
# stored value goes with the i-th time), so it relies on the JSON entries
# being stored in the same order as the time list -- TODO confirm.
results_1000_new = dict(zip(time_range, results_1000.values()))
results_2000_new = dict(zip(time_range, results_2000.values()))
results_3000_new = dict(zip(time_range, results_3000.values()))
# The length-4000 results are paired with the shorter time list, using the
# named constant rather than a second copy of the literal.
results_4000_new = dict(zip(time_range_, results_4000.values()))

In [None]:
from scipy import stats
import statistics

def get_all_stats(ns_dict, theoretical_ns_dict, time_range):
    """Summarise simulated values per time point and test them against a reference.

    Parameters
    ----------
    ns_dict : dict
        Maps each time point to a dict holding an ``'ns_per_site_list'``
        sequence of simulated values.
    theoretical_ns_dict : dict
        Maps each time point to the reference value used as the null mean
        of the one-sample t-test.
    time_range : iterable
        Time points to summarise; each must be a key of both dicts.

    Returns
    -------
    dict
        ``{time: summary}`` where ``summary`` has the keys ``'average'``,
        ``'theoretical_ns'``, ``'standard_deviation'``,
        ``'coefficient_of_variation'`` (in percent; ``nan`` when the sample
        mean is exactly zero), ``'t_stat'`` and ``'p_value'``.

    Raises
    ------
    KeyError
        If a time point is missing from either dict.
    statistics.StatisticsError
        If a sample has fewer than two values.
    """
    combined_results = {}

    for t in time_range:
        ns_per_site_list = ns_dict[t]['ns_per_site_list']
        theoretical_mean = theoretical_ns_dict[t]

        # Descriptive statistics of the simulated sample
        average = statistics.mean(ns_per_site_list)
        std_dev = statistics.stdev(ns_per_site_list)
        # The coefficient of variation is undefined for a zero mean; report
        # nan instead of raising ZeroDivisionError.
        cv = (std_dev / average) * 100 if average != 0 else float('nan')

        # One-sample t-test of the sample against the reference value.
        # `stats` is the module-level `scipy.stats` import.
        t_stat, p_value = stats.ttest_1samp(ns_per_site_list, theoretical_mean)

        combined_results[t] = {
            'average': average,
            'theoretical_ns': theoretical_mean,
            'standard_deviation': std_dev,
            'coefficient_of_variation': cv,
            't_stat': t_stat,
            'p_value': p_value
        }

    return combined_results



In [None]:
from cogent3.maths.matrix_exponential_integration import expected_number_subs

# Reference value for every time point, computed with the same argument form
# as the other `expected_number_subs(pi, Q1, <time>)` calls in this notebook.
# NOTE(review): `pi` and `Q1` are assigned in a later cell, so that cell has
# to be run before this one.
# A comprehension is used so the notebook-level variable `t` is not rebound.
theoretical_ns_dict = {
    time_point: expected_number_subs(pi, Q1, time_point)
    for time_point in time_range
}



In [None]:
# Per-time summary statistics for every sequence length.
stats_1000 = get_all_stats(
    ns_dict=results_1000_new,
    theoretical_ns_dict=theoretical_ns_dict,
    time_range=time_range,
)
stats_2000 = get_all_stats(
    ns_dict=results_2000_new,
    theoretical_ns_dict=theoretical_ns_dict,
    time_range=time_range,
)
stats_3000 = get_all_stats(
    ns_dict=results_3000_new,
    theoretical_ns_dict=theoretical_ns_dict,
    time_range=time_range,
)
# Length 4000 is summarised over the shorter time list.
stats_4000 = get_all_stats(
    ns_dict=results_4000_new,
    theoretical_ns_dict=theoretical_ns_dict,
    time_range=time_range_,
)

In [None]:
# Time-keyed simulation results, grouped by sequence length (string keys).
ns_dict_all = {
    '1000': results_1000_new,
    '2000': results_2000_new,
    '3000': results_3000_new,
    '4000': results_4000_new,
}

In [None]:
import pandas as pd
# Summary statistics from `get_all_stats`, grouped by sequence length (string keys).
stat_dict_all = {
    '1000': stats_1000,
    '2000': stats_2000,
    '3000': stats_3000,
    '4000': stats_4000,
}


In [None]:
# Flatten the nested {length: {time: summary}} mapping into one row per
# (length, time) pair and build the DataFrame in a single step.
# A comprehension is used so that no loop variable leaks into the notebook
# namespace: names such as `stats`, `time` or `length` at this level would
# replace the `scipy.stats` module that `get_all_stats` looks up, and the
# `length` parameter used by the simulation cells.
df_list = [
    {
        'Length': int(seq_length),
        'Time': time_point,
        'Theoretical': theoretical_ns_dict[time_point],
        'Average': summary['average'],
        'stdev': summary['standard_deviation'],
        'cv': summary['coefficient_of_variation'],
        't-value': summary['t_stat'],
        'p-value': summary['p_value'],
    }
    for seq_length, per_time_stats in stat_dict_all.items()
    for time_point, summary in per_time_stats.items()
]

df = pd.DataFrame(df_list)

In [None]:
import plotly.express as px
# P-value against time, one line per sequence length
fig_time_pvalue = px.line(
    df,
    x='Time',
    y='p-value',
    color='Length',
    markers=True,
    title='P-value vs. Time for Different Sequence Lengths',
    labels={'p-value': 'P-value', 'Time': 'Time'},
)
# Dashed red horizontal reference line at 0.05
fig_time_pvalue.add_hline(
    y=0.05,
    line_dash="dash",
    line_color="red",
    line_width=3,
)
fig_time_pvalue.write_image("P_value_length_time.pdf")

fig_time_pvalue.show()

  sf: grouped.get_group(s if len(s) > 1 else s[0])


In [None]:
import plotly.express as px
# Time vs. coefficient of variation for each sequence length
# (the figure variable keeps its existing name, `fig_time_pvalue`)
fig_time_pvalue = px.line(df, x='Time', y='cv', color='Length', markers=True,
                          labels={'cv': 'Coefficient of variation', 'Time': 'Time'},
                          title='Coefficient of variation vs. Time for Different Sequence Lengths')
fig_time_pvalue.write_image("Stability_length_time.pdf")

fig_time_pvalue.show()





In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def get_histograms(ns_dict, theoretical_ns_list):
    """Build a grid of histograms: one row per sequence length, one column per time.

    Parameters
    ----------
    ns_dict : dict
        ``{length: {time: entry}}`` where each ``entry`` provides
        ``'ns_per_site_list'`` (the values to histogram) and
        ``'avg_ns_per_site'`` (drawn as a red dashed vertical line).
    theoretical_ns_list
        Indexed by time point (``theoretical_ns_list[time]``); the value is
        drawn as a green dashed vertical line.

    Returns
    -------
    The plotly figure created by ``make_subplots``.
    """
    lengths = list(ns_dict.keys())
    # Union of the time keys of all lengths, sorted for a consistent column order
    all_times = {time for length in ns_dict for time in ns_dict[length].keys()}
    times = sorted(all_times)

    rows = len(lengths)
    cols = len(times)

    # Create subplots
    fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f'Time = {t}, Length = {l}' for l in lengths for t in times])

    # Populate subplots
    for row, length in enumerate(lengths, start=1):
        for col, time in enumerate(times, start=1):
            if time in ns_dict[length]:
                data = ns_dict[length][time]['ns_per_site_list']
                theoretical_ns = theoretical_ns_list[time]
                average_ns = ns_dict[length][time]['avg_ns_per_site']

                # Add histogram to subplot
                fig.add_trace(
                    go.Histogram(
                        x=data,
                        nbinsx=10,
                        name=f'Length {length}, Time {time}'
                    ),
                    row=row,
                    col=col
                )

                # Add vertical lines for average and theoretical values
                fig.add_vline(x=average_ns, line_width=3, line_dash="dash", line_color="red", row=row, col=col)
                fig.add_vline(x=theoretical_ns, line_width=3, line_dash="dash", line_color="green", row=row, col=col)
            else:
                # This length has no entry for this time: add an empty trace
                fig.add_trace(
                    go.Histogram(
                        x=[],
                        name=f'Length {length}, Time {time} (no data)'
                    ),
                    row=row,
                    col=col
                )

    # NOTE(review): these are layout-level axis titles; confirm whether every
    # subplot (not only the first axis pair) is meant to be labelled.
    fig.update_layout(
        yaxis_title_text='Count',
        height=300 * rows,
        width=300 * cols,
        showlegend=False,
        bargap=0.1,
        xaxis_title_text='Substitution Number'
    )

    return fig


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def get_histograms1(ns_dict, theoretical_ns_list):
    """Histogram grid (rows = lengths, columns = times) with a shared x range per column.

    Parameters
    ----------
    ns_dict : dict
        ``{length: {time: entry}}``; each ``entry`` provides
        ``'ns_per_site_list'`` and ``'avg_ns_per_site'``.
    theoretical_ns_list
        Indexed by time point; supplies the value drawn as a green line.

    Returns
    -------
    The plotly figure. (length, time) pairs without data are left empty.
    """
    lengths = list(ns_dict)
    times = sorted({t for length in ns_dict for t in ns_dict[length].keys()})
    n_rows, n_cols = len(lengths), len(times)

    titles = [f'Time = {t}, Length = {l}' for l in lengths for t in times]
    fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=titles)

    # (min, max) over every length that has data for a given time; used both
    # for the bin edges and for the axis range of that time's column.
    x_ranges = {}
    for t in times:
        samples = [
            ns_dict[length][t]['ns_per_site_list']
            for length in lengths
            if t in ns_dict[length]
        ]
        lows = [min(sample) for sample in samples]
        highs = [max(sample) for sample in samples]
        x_ranges[t] = (min(lows), max(highs))

    for row, length in enumerate(lengths, start=1):
        per_time = ns_dict[length]
        for col, t in enumerate(times, start=1):
            if t not in per_time:
                continue

            values = per_time[t]['ns_per_site_list']
            theoretical_ns = theoretical_ns_list[t]
            average_ns = per_time[t]['avg_ns_per_site']
            lo, hi = x_ranges[t]

            # 20 equal-width bins spanning the column's shared range
            histogram = go.Histogram(
                x=values,
                xbins=dict(start=lo, end=hi, size=(hi - lo) / 20),
                marker=dict(line=dict(width=1)),
                name=f'Length {length}, Time {t}',
            )
            fig.add_trace(histogram, row=row, col=col)

            # Red: sample average; green: theoretical value
            for x_value, colour in ((average_ns, "red"), (theoretical_ns, "green")):
                fig.add_vline(x=x_value, line_width=2, line_dash="dash", line_color=colour, row=row, col=col)

    fig.update_layout(
        height=300 * n_rows,
        width=300 * n_cols,
        showlegend=False,
        bargap=0.05,
    )

    # Same x-axis range for every row of a column
    for col, t in enumerate(times, start=1):
        lo, hi = x_ranges[t]
        for row in range(1, n_rows + 1):
            fig.update_xaxes(range=[lo, hi], row=row, col=col)

    return fig


In [None]:
# Histogram grid of the simulated values, with the theoretical value marked
fig = get_histograms1(ns_dict_all, theoretical_ns_dict)
fig.update_layout(title_text='Distribution of Substitutions Number by Time and Length',
        xaxis_title_text='Substitution Number')

fig.write_image('Distribution of Substitutions Number by Time and Length.pdf')
fig.show()

In [None]:
# Same nesting as `ns_dict_all`, with the theoretical value for each time
# subtracted from the average and from every individual value.
error_ns_dict_all = {}
for key, times in ns_dict_all.items():
    per_time_errors = {}
    for time, ns_value in times.items():
        expected = theoretical_ns_dict[time]
        per_time_errors[time] = {
            'avg_ns_per_site': ns_value['avg_ns_per_site'] - expected,
            'ns_per_site_list': [a - expected for a in ns_value['ns_per_site_list']],
        }
    error_ns_dict_all[key] = per_time_errors

In [None]:
# Reference value for the error histograms: zero at every time point.
theoretical_ns_error_dict = dict.fromkeys(time_range, 0)

In [None]:


def get_histograms2(ns_dict, theoretical_ns_list, x_range=(-0.09, 0.06)):
    """Histogram grid (rows = lengths, columns = times) with one x range for all subplots.

    Parameters
    ----------
    ns_dict : dict
        ``{length: {time: entry}}``; each ``entry`` provides
        ``'ns_per_site_list'`` and ``'avg_ns_per_site'``.
    theoretical_ns_list
        Container indexed by time point; supplies the value drawn as a green
        dashed line. Times it does not contain get no green line.
    x_range : tuple of (low, high) or None, optional
        X-axis range applied to every subplot. Defaults to ``(-0.09, 0.06)``.
        Pass ``None`` to use the minimum and maximum of the data.

    Returns
    -------
    The plotly figure. A (length, time) pair without data gets an empty
    histogram and no average line.
    """
    lengths = list(ns_dict.keys())
    all_times = {time for length in ns_dict for time in ns_dict[length].keys()}
    times = sorted(all_times)  # Sorting to maintain a consistent order

    rows = len(lengths)
    cols = len(times)

    # Global minimum and maximum over all values; they define the common bin edges
    x_min = min(min(ns_dict[length][time]['ns_per_site_list']) for length in ns_dict for time in ns_dict[length])
    x_max = max(max(ns_dict[length][time]['ns_per_site_list']) for length in ns_dict for time in ns_dict[length])

    if x_range is None:
        x_range = (x_min, x_max)

    # Create subplots
    fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f'Time = {t}, Length = {l}' for l in lengths for t in times])

    # Populate subplots
    for row, length in enumerate(lengths, start=1):
        for col, time in enumerate(times, start=1):
            has_data = time in ns_dict[length]
            data = ns_dict[length][time]['ns_per_site_list'] if has_data else []
            theoretical_ns = theoretical_ns_list[time] if time in theoretical_ns_list else None
            average_ns = ns_dict[length][time]['avg_ns_per_site'] if has_data else None

            # Add histogram to subplot (20 equal-width bins over the data range)
            fig.add_trace(
                go.Histogram(
                    x=data,
                    xbins=dict(
                        start=x_min,
                        end=x_max,
                        size=(x_max - x_min) / 20
                    ),
                    marker=dict(line=dict(width=1)),
                    name=f'Length {length}, Time {time}'
                ),
                row=row,
                col=col
            )

            # A missing value has no position to draw at, so its line is skipped
            if average_ns is not None:
                fig.add_vline(x=average_ns, line_width=2, line_dash="dash", line_color="red", row=row, col=col)
            if theoretical_ns is not None:
                fig.add_vline(x=theoretical_ns, line_width=2, line_dash="dash", line_color="green", row=row, col=col)

    # Update layout for all subplots
    fig.update_layout(
        height=300 * rows,
        width=300 * cols,
        showlegend=False,
        bargap=0.05  # Adjust space between bars
    )

    # Set the same x-axis range on every subplot
    for r in range(1, rows+1):
        for c in range(1, cols+1):
            fig.update_xaxes(range=[x_range[0], x_range[1]], row=r, col=c)

    return fig


In [None]:
# Histogram grid of (simulated - theoretical) values; the reference line is at zero
fig2 = get_histograms2(error_ns_dict_all, theoretical_ns_error_dict)
fig2.update_layout(title_text='Distribution of Substitutions Number Error by Time and Length, repetition = 100',
        xaxis_title_text='Substitution Number Error')

fig2.write_image('Distribution of Substitutions Number Error by Time and Length.pdf')
fig2.show()

In [4]:
import sys
# NOTE(review): absolute, machine-specific path. The `simulation.*` imports in
# later cells presumably resolve through it -- confirm, and consider a
# repository-relative path.
sys.path.insert(0, '/Users/gulugulu/repos/PuningAnalysis/src')
from cogent3.maths.matrix_exponential_integration import expected_number_subs
import numpy as np
# Four values, passed as the first argument of `expected_number_subs` and to the
# simulation helpers -- presumably the state frequencies; TODO confirm.
pi = [0.35, 0.15, 0.05, 0.45]
# `repeats`, `length` and `t` are passed to `average_substitution` in a later cell.
repeats = 100
# NOTE(review): `markov_order` is not referenced in the cells visible here.
markov_order = 0
length = 1000
time_range = [0.5, 1, 2, 3, 4]
t = 3
# 4x4 matrix passed alongside `pi` -- presumably the substitution rate matrix; TODO confirm.
Q1 = np.array([[-1.75025094,  0.94143256,  0.45306226,  0.35575611],
       [ 0.49505807, -2.27893035,  0.88176788,  0.90210439],
       [ 0.36411798,  0.18070926, -0.91594839,  0.37112115],
       [ 0.77758124,  0.75396143,  0.52505328, -2.05659595]])

# Value of `expected_number_subs` for `pi`, `Q1` and time `t`; the bare name displays it.
theoretical_ns = expected_number_subs(pi, Q1, t)
theoretical_ns

4.974190140095442

In [5]:
from simulation.waiting_time_simulator_iid import average_substitution, generate_ancestor, simulate_seq
# Generate an ancestor from `length` and `pi`, run one simulation with first
# argument 3 (presumably the time, matching `t = 3` -- TODO confirm), and
# display the length of what `simulate_seq` returns.
ances_seq = generate_ancestor(length, pi)
seqs = simulate_seq(3, Q1, ances_seq)
len(seqs)

4962

In [50]:
# Single call (no loop here) with the current `t`, `repeats` and `length`;
# the two returned values are unpacked into the per-run list and its average.
ns_per_site_list, avg_ns_per_site = average_substitution(Q1, t, repeats, length, pi)

In [53]:
# Display the average returned by `average_substitution`.
avg_ns_per_site

4.97912

In [6]:
from simulation.WTS_class import SeqSimulate
# Same kind of run through the class-based simulator.
# NOTE(review): positional arguments are (Q1, 1000, 100, 3, pi); 1000 and 100
# presumably correspond to `length` and `repeats` -- confirm against SeqSimulate.
simulator = SeqSimulate(Q1, 1000, 100, 3, pi)
result = simulator.average_substitution(3)
# Second element of the result; the loops in later cells treat it as the average.
result[1]

KeyboardInterrupt: 

In [7]:
from scipy import stats
# Settings and accumulators for the time sweep
time_range = [0.5, 1, 2, 3, 4]
repeats = [10, 50, 100, 200, 300, 500]
repeat = 200
p_value_repeats = []
p_value_times = []
ns_per_site_list = []
avg_ns_per_site_list = []

# One simulation per time point with a fixed `repeat`; each sample is tested
# against the value of `expected_number_subs` for that time.
for time in time_range:
    simulator = SeqSimulate(Q1, 2000, repeat, 1, pi)
    result = simulator.average_substitution(time)
    ns_per_site1, avg_ns_per_site = result[0], result[1]

    ns_per_site_list.append(ns_per_site1)
    avg_ns_per_site_list.append(avg_ns_per_site)

    theoretical = expected_number_subs(pi, Q1, time)
    t_stat, p_value = stats.ttest_1samp(ns_per_site1, theoretical)
    p_value_times.append(p_value)

    

KeyboardInterrupt: 

In [None]:
# Display the p-values collected by the loop over `time_range`.
p_value_times

[]

In [None]:
# NOTE(review): six literal values that are not assigned to any name. `repeats`
# also has six entries, so these are presumably p-values saved from an earlier
# run of the repeats sweep -- confirm, or remove this cell.
[0.9302196085652216,
 0.07717452701048522,
 0.5006176654234513,
 0.603339376254266,
 0.8039563687641299,
 0.8353028901063684]

In [None]:
from scipy import stats
import statistics
time_range = [0.5, 1, 2, 3, 4]
repeats = [10, 50, 100, 200, 300, 500]
p_value_repeats = []
avg_ns_per_site_list = []

# The reference value depends only on `pi`, `Q1` and the fixed time 1, so it
# is computed once instead of once per repeat count.
theoretical = expected_number_subs(pi, Q1, 1)

# One simulation per repeat count, all at time 1; each sample is tested
# against the same reference value.
for repeat in repeats:
    simulator = SeqSimulate(Q1, 2000, repeat, 1, pi)
    result = simulator.average_substitution(1)
    ns_per_site_list = result[0]
    avg_ns_per_site = result[1]
    avg_ns_per_site_list.append(avg_ns_per_site)
    t_stat, p_value = stats.ttest_1samp(ns_per_site_list, theoretical)
    p_value_repeats.append(p_value)