In [14]:
import sys
sys.path.insert(0, '/Users/gulugulu/repos/PuningAnalysis/src')
from simulation.waiting_time_simulator import generate_ancestor, average_substitution, get_descrip_stat, simulate_seq
from cogent3.maths.matrix_exponential_integration import expected_number_subs
import numpy as np

In [15]:
# Parameter set-up: read the matrices and motif probabilities stored as JSON.
import json

with open('../../../data/dataset1_16s-10_initial_alignments/matrix_motif.json') as motif_file:
    matrix_motif = json.load(motif_file)

In [20]:
# Simulation settings used by the cells below.
time_range = [0.5, 1, 2, 3, 4]
length = 3000
repeats = 50
markov_order = 0

# Rate matrix and motif probabilities are both taken from entry '0' of the loaded JSON.
entry_0 = matrix_motif['0']
pi_0 = entry_0['motif_prob']
Q = {'0': np.array(entry_0['matrix']['200580'])}

# Ancestral sequence built from the configured length and motif probabilities.
ances_seq = generate_ancestor(length, pi_0)


In [21]:
# Run the simulation once per elapsed time and persist the collected results.
#
# The result label and the output file name are both derived from the configured
# `length` (the same value used to build `ances_seq`) instead of re-hard-coding
# 3000 here, so they cannot drift from the sequence that is actually simulated.
results = {}

for t in time_range:
    print(t)  # progress indicator: one line per time point
    ns_per_site_list, avg_ns_per_site = average_substitution(Q, t, ances_seq, repeats, markov_order)
    # Store the results under a human-readable "Length ..., Time ..." key
    results[f"Length {length}, Time {t}"] = {
        'avg_ns_per_site': avg_ns_per_site,
        'ns_per_site_list': ns_per_site_list
    }

with open(f'../../../results/length_{length}.json', 'w') as outfile:
    json.dump(results, outfile, indent=4)

0.5
1
2
3
4


In [23]:
def _read_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    with open(path) as handle:
        return json.load(handle)


# Saved simulation results, one file per sequence length.
results_3000 = _read_json('../../../results/length_3000.json')
results_1000 = _read_json('../../../results/length_1000.json')
results_2000 = _read_json('../../../results/length_2000.json')

In [96]:
# Re-key each loaded result set by elapsed time. The pairing is positional:
# the i-th stored entry is assigned to the i-th value of `time_range`.
results_1000_new = dict(zip(time_range, results_1000.values()))
results_2000_new = dict(zip(time_range, results_2000.values()))
results_3000_new = dict(zip(time_range, results_3000.values()))

In [101]:
from scipy import stats
import statistics

def get_all_stats(ns_dict, theoretical_ns_dict, time_range=(0.5, 1, 2, 3, 4)):
    """Summarise simulated substitutions per site and test them against theory.

    For every time point in ``time_range`` the function computes descriptive
    statistics of ``ns_dict[t]['ns_per_site_list']`` and runs a one-sample
    t-test of that sample against ``theoretical_ns_dict[t]``.

    Parameters
    ----------
    ns_dict : mapping
        Maps each time point to a mapping that has an ``'ns_per_site_list'``
        entry holding the simulated values for that time point.
    theoretical_ns_dict : mapping
        Maps each time point to the theoretical value used as the population
        mean of the t-test.
    time_range : iterable, optional
        Time points to summarise. Defaults to ``(0.5, 1, 2, 3, 4)``, the values
        that were previously hard-coded inside the function.

    Returns
    -------
    dict
        Maps each time point to a dict with the keys ``'average'``,
        ``'theoretical_ns'``, ``'standard_deviation'``,
        ``'coefficient_of_variation'`` (in percent), ``'t_stat'`` and
        ``'p_value'``.

    Raises
    ------
    KeyError
        If a time point is missing from ``ns_dict`` or ``theoretical_ns_dict``.
    statistics.StatisticsError
        If a sample has fewer than two values (``statistics.stdev`` needs two).
    """
    # Imported locally so the function does not depend on the notebook-level
    # names `stats` / `statistics`, which are easy to rebind by accident in
    # another cell (e.g. by using `stats` as a loop variable).
    import statistics
    from scipy.stats import ttest_1samp

    combined_results = {}

    for t in time_range:
        ns_per_site_list = ns_dict[t]['ns_per_site_list']
        theoretical_mean = theoretical_ns_dict[t]

        # Descriptive statistics of the simulated sample
        average = statistics.mean(ns_per_site_list)
        std_dev = statistics.stdev(ns_per_site_list)
        # The coefficient of variation is undefined for a zero mean; report NaN
        # instead of raising ZeroDivisionError.
        cv = (std_dev / average) * 100 if average else float('nan')

        # One-sample t-test of the simulated sample against the theoretical value
        t_stat, p_value = ttest_1samp(ns_per_site_list, theoretical_mean)

        combined_results[t] = {
            'average': average,
            'theoretical_ns': theoretical_mean,
            'standard_deviation': std_dev,
            'coefficient_of_variation': cv,
            't_stat': t_stat,
            'p_value': p_value
        }

    return combined_results



In [88]:
# Value returned by cogent3's `expected_number_subs` for each elapsed time, keyed by time.
theoretical_ns_dict = {
    elapsed: expected_number_subs(pi_0, Q['0'], elapsed) for elapsed in time_range
}

In [103]:
# Summarise every result set against the same theoretical values.
stats_1000, stats_2000, stats_3000 = (
    get_all_stats(simulated, theoretical_ns_dict)
    for simulated in (results_1000_new, results_2000_new, results_3000_new)
)

In [110]:
import pandas as pd

# Collect the per-length summaries under string keys naming each length.
stat_dict_all = {
    '1000': stats_1000,
    '2000': stats_2000,
    '3000': stats_3000,
}


In [108]:
# Flatten the nested {length: {time: summary}} mapping into one row per
# (length, time) pair and build a DataFrame from the rows.
#
# The iteration variables are scoped to the comprehension and deliberately do
# NOT use the names `stats` or `length`: rebinding `stats` at notebook level
# would shadow the scipy module imported for the t-test, and rebinding `length`
# would replace the configured integer with a string key.
df_list = [
    {
        'Length': int(seq_length),
        'Time': time_point,
        'Average': point_stats['average'],
        'stdev': point_stats['standard_deviation'],
        'cv': point_stats['coefficient_of_variation'],
        't-value': point_stats['t_stat'],
        'p-value': point_stats['p_value'],
    }
    for seq_length, per_time in stat_dict_all.items()
    for time_point, point_stats in per_time.items()
]

df = pd.DataFrame(df_list)

In [109]:
# Print the summary DataFrame as plain text.
print(df)

    Length  Time   Average     stdev        cv   t-value       p-value
0     1000   0.5  0.399240  0.014963  3.747786  1.231592  2.210192e-01
1     1000   1.0  0.661270  0.019365  2.928392 -0.886545  3.774722e-01
2     1000   2.0  1.021880  0.024469  2.394550 -0.795442  4.282597e-01
3     1000   3.0  1.288750  0.029474  2.287020 -1.942926  5.486562e-02
4     1000   4.0  1.521780  0.033657  2.211692 -2.488681  1.449031e-02
5     2000   0.5  0.393195  0.011181  2.843740 -3.758200  2.894599e-04
6     2000   1.0  0.657965  0.012679  1.926983 -3.960729  1.409458e-04
7     2000   2.0  1.016190  0.017089  1.681660 -4.468642  2.095863e-05
8     2000   3.0  1.281595  0.019537  1.524431 -6.593416  2.116598e-09
9     2000   4.0  1.515180  0.023071  1.522649 -6.491377  3.407978e-09
10    3000   0.5  0.399453  0.008383  2.098496  1.734442  8.912718e-02
11    3000   1.0  0.666700  0.009996  1.499321  2.626716  1.147433e-02
12    3000   2.0  1.029333  0.012707  1.234494  3.064430  3.540611e-03
13    

In [63]:
import plotly.express as px

# Line chart of p-value against time, with one coloured line per sequence length.
axis_labels = {'p-value': 'P-value', 'Time': 'Time'}
fig_time_pvalue = px.line(
    df,
    x='Time',
    y='p-value',
    color='Length',
    markers=True,
    labels=axis_labels,
    title='P-value vs. Time for Different Sequence Lengths',
)
# Dashed red horizontal reference line at p = 0.05.
fig_time_pvalue.add_hline(y=0.05, line_width=3, line_dash="dash", line_color="red")

fig_time_pvalue.show()





In [65]:
# Disabled (commented-out) alternative view: length vs. p-value for each time
# fig_length_pvalue = px.line(df, x='Length', y='p-value', color='Time', markers=True,
#                             labels={'p-value': 'P-value', 'Length': 'Sequence Length'},
#                             title='P-value vs. Sequence Length for Different Times')

# fig_length_pvalue.add_hline(y=0.05, line_width=3, line_dash="dash", line_color="red")

# fig_length_pvalue.show()