In [1]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the BUSTED .log files to scan
log_dir = 'accumreactox_bustede'

# Output file: one protein ID per line, only those significant after FDR correction
output_file = 'accumreactox_bustede1_FDR.txt'

# (protein_name, raw p-value) pairs, collected first so the correction can be
# applied across all proteins at once
pvals_data = []

# 1. Parse each .log file and extract its p-value.
#    os.listdir() gives no ordering guarantee, so the names are sorted to keep
#    the printed report and the output file reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # The protein ID is the leading part of the filename. IDs starting with
    # NP/XP contain an underscore themselves (e.g. NP_000509.1), so they span
    # the first two '_'-separated pieces.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    # Stream the log and stop at the first likelihood-ratio-test line instead
    # of loading the whole file into memory.
    lrt_line = None
    with open(filepath, 'r') as log_f:
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' in line:
                lrt_line = line
                break

    # Exactly one diagnostic is printed per skipped file, naming the actual cause.
    if lrt_line is None:
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")
        continue

    match = re.search(r'p =\s*([\d\.eE+-]+)', lrt_line)
    if not match:
        print(f"{protein_name}: p-value not found in line: {lrt_line.strip()}")
        continue

    p_value_str = match.group(1)
    try:
        p_value = float(p_value_str)
    except ValueError:
        # The character class can capture non-numeric runs such as "1.2.3" or "-"
        print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
        continue

    pvals_data.append((protein_name, p_value))

# 2. Once all p-values are collected, apply FDR correction
p_values = [p_value for _name, p_value in pvals_data]

if not p_values:
    # Deliberately no exit() here: inside a notebook it ends the kernel session
    # (losing all state) rather than just stopping this cell.
    print("No p-values found. Exiting.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Report every protein; write only those rejected at FDR alpha = 0.05 to file
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

O15439.3: 'Likelihood ratio test' line not found or no p-value in O15439.3_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P18054.4: 'Likelihood ratio test' line not found or no p-value in P18054.4_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P52895.3: 'Likelihood ratio test' line not found or no p-value in P52895.3_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q12882.2: 'Likelihood ratio test' line not found or no p-value in Q12882.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q8WWV3.2: 'Likelihood ratio test' line not found or no p-value in Q8WWV3.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q9UBQ7.1: 'Likelihood ratio test' line not found or no p-value in Q9UBQ7.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
A5PLL7.3: raw p = 0.4425, FDR p = 0.5 (not significant)
NP_000509.1: raw p = 0.4998, FDR p = 0.5 (not significant)
NP_001193670.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_001257377.1: raw p = 0.28

In [2]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the BUSTED .log files to scan
log_dir = 'senaccum_bustede'

# Output file: one protein ID per line, only those significant after FDR correction
output_file = 'senaccum_bustede1_FDR.txt'

# (protein_name, raw p-value) pairs, collected first so the correction can be
# applied across all proteins at once
pvals_data = []

# 1. Parse each .log file and extract its p-value.
#    os.listdir() gives no ordering guarantee, so the names are sorted to keep
#    the printed report and the output file reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # The protein ID is the leading part of the filename. IDs starting with
    # NP/XP contain an underscore themselves (e.g. NP_005421.1), so they span
    # the first two '_'-separated pieces.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    # Stream the log and stop at the first likelihood-ratio-test line instead
    # of loading the whole file into memory.
    lrt_line = None
    with open(filepath, 'r') as log_f:
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' in line:
                lrt_line = line
                break

    # Exactly one diagnostic is printed per skipped file, naming the actual cause.
    if lrt_line is None:
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")
        continue

    match = re.search(r'p =\s*([\d\.eE+-]+)', lrt_line)
    if not match:
        print(f"{protein_name}: p-value not found in line: {lrt_line.strip()}")
        continue

    p_value_str = match.group(1)
    try:
        p_value = float(p_value_str)
    except ValueError:
        # The character class can capture non-numeric runs such as "1.2.3" or "-"
        print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
        continue

    pvals_data.append((protein_name, p_value))

# 2. Once all p-values are collected, apply FDR correction
p_values = [p_value for _name, p_value in pvals_data]

if not p_values:
    # Deliberately no exit() here: inside a notebook it ends the kernel session
    # (losing all state) rather than just stopping this cell.
    print("No p-values found. Exiting.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Report every protein; write only those rejected at FDR alpha = 0.05 to file
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

O15119.4: 'Likelihood ratio test' line not found or no p-value in O15119.4_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P06748.2: 'Likelihood ratio test' line not found or no p-value in P06748.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P45985.1: 'Likelihood ratio test' line not found or no p-value in P45985.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
NP_001341661.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_005421.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_055724.1: raw p = 0.5, FDR p = 0.5 (not significant)
O43324.1: raw p = 0.5, FDR p = 0.5 (not significant)
P01112.1: raw p = 0.4939, FDR p = 0.5 (not significant)
P05121.1: raw p = 0.5, FDR p = 0.5 (not significant)
P08069.1: raw p = 0.5, FDR p = 0.5 (not significant)
P24941.2: raw p = 0.3828, FDR p = 0.5 (not significant)
P37198.3: raw p = 0.5, FDR p = 0.5 (not significant)
P38936.3: raw p = 0.326, FDR p = 0.5 (not significant)
P41182.1: raw p = 0.5, FDR p = 0.5 (not significant

In [3]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the BUSTED .log files to scan
# NOTE(review): log_dir says 'chromatinchange' but output_file below says
# 'ecmchange' -- one of the two looks like a copy-paste leftover. Confirm which
# gene set this cell is meant to process before trusting the output file name.
log_dir = 'chromatinchange_bustede'

# Output file: one protein ID per line, only those significant after FDR correction
output_file = 'ecmchange_bustede1_FDR.txt'

# (protein_name, raw p-value) pairs, collected first so the correction can be
# applied across all proteins at once
pvals_data = []

# 1. Parse each .log file and extract its p-value.
#    os.listdir() gives no ordering guarantee, so the names are sorted to keep
#    the printed report and the output file reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # The protein ID is the leading part of the filename. IDs starting with
    # NP/XP contain an underscore themselves (e.g. NP_000543.3), so they span
    # the first two '_'-separated pieces.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    # Stream the log and stop at the first likelihood-ratio-test line instead
    # of loading the whole file into memory.
    lrt_line = None
    with open(filepath, 'r') as log_f:
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' in line:
                lrt_line = line
                break

    # Exactly one diagnostic is printed per skipped file, naming the actual cause.
    if lrt_line is None:
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")
        continue

    match = re.search(r'p =\s*([\d\.eE+-]+)', lrt_line)
    if not match:
        print(f"{protein_name}: p-value not found in line: {lrt_line.strip()}")
        continue

    p_value_str = match.group(1)
    try:
        p_value = float(p_value_str)
    except ValueError:
        # The character class can capture non-numeric runs such as "1.2.3" or "-"
        print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
        continue

    pvals_data.append((protein_name, p_value))

# 2. Once all p-values are collected, apply FDR correction
p_values = [p_value for _name, p_value in pvals_data]

if not p_values:
    # Deliberately no exit() here: inside a notebook it ends the kernel session
    # (losing all state) rather than just stopping this cell.
    print("No p-values found. Exiting.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Report every protein; write only those rejected at FDR alpha = 0.05 to file
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

NP_000543.3: raw p = 0.5, FDR p = 0.5 (not significant)
NP_001835.3: raw p = 0.5, FDR p = 0.5 (not significant)
O00391.3: raw p = 0.2293, FDR p = 0.5 (not significant)
O14672.1: raw p = 0.5, FDR p = 0.5 (not significant)
O15230.8: raw p = 0.5, FDR p = 0.5 (not significant)
O60882.3: raw p = 0.5, FDR p = 0.5 (not significant)
O95967.3: raw p = 0.5, FDR p = 0.5 (not significant)
P01137.3: raw p = 0.5, FDR p = 0.5 (not significant)
P02461.4: raw p = 0.5, FDR p = 0.5 (not significant)
P05981.1: raw p = 0.2357, FDR p = 0.5 (not significant)
P06396.1: raw p = 0.5, FDR p = 0.5 (not significant)
P08123.7: raw p = 0.4145, FDR p = 0.5 (not significant)
P08253.2: raw p = 0.1724, FDR p = 0.5 (not significant)
P08254.2: raw p = 0.4245, FDR p = 0.5 (not significant)
P08493.2: raw p = 0.5, FDR p = 0.5 (not significant)
P08572.4: raw p = 0.5, FDR p = 0.5 (not significant)
P09237.1: raw p = 0.1823, FDR p = 0.5 (not significant)
P09958.2: raw p = 0.4416, FDR p = 0.5 (not significant)
P11047.3: raw p = 0

In [1]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the BUSTED .log files to scan
log_dir = 'telomere_bustede'

# Output file: one protein ID per line, only those significant after FDR correction
output_file = 'telomere_bustede1_FDR.txt'

# (protein_name, raw p-value) pairs, collected first so the correction can be
# applied across all proteins at once
pvals_data = []

# 1. Parse each .log file and extract its p-value.
#    os.listdir() gives no ordering guarantee, so the names are sorted to keep
#    the printed report and the output file reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # The protein ID is the leading part of the filename. IDs starting with
    # NP/XP are rebuilt from the first two '_'-separated pieces because the
    # ID itself contains an underscore; all other IDs are the first piece.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    # Stream the log and stop at the first likelihood-ratio-test line instead
    # of loading the whole file into memory.
    lrt_line = None
    with open(filepath, 'r') as log_f:
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' in line:
                lrt_line = line
                break

    # Exactly one diagnostic is printed per skipped file, naming the actual cause.
    if lrt_line is None:
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")
        continue

    match = re.search(r'p =\s*([\d\.eE+-]+)', lrt_line)
    if not match:
        print(f"{protein_name}: p-value not found in line: {lrt_line.strip()}")
        continue

    p_value_str = match.group(1)
    try:
        p_value = float(p_value_str)
    except ValueError:
        # The character class can capture non-numeric runs such as "1.2.3" or "-"
        print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
        continue

    pvals_data.append((protein_name, p_value))

# 2. Once all p-values are collected, apply FDR correction
p_values = [p_value for _name, p_value in pvals_data]

if not p_values:
    # Deliberately no exit() here: inside a notebook it ends the kernel session
    # (losing all state) rather than just stopping this cell.
    print("No p-values found. Exiting.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Report every protein; write only those rejected at FDR alpha = 0.05 to file
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

P12004.1: 'Likelihood ratio test' line not found or no p-value in P12004.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P27694.2: 'Likelihood ratio test' line not found or no p-value in P27694.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P49959.3: 'Likelihood ratio test' line not found or no p-value in P49959.3_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P78527.3: 'Likelihood ratio test' line not found or no p-value in P78527.3_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q8IY18.2: 'Likelihood ratio test' line not found or no p-value in Q8IY18.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q9BQ83.1: 'Likelihood ratio test' line not found or no p-value in Q9BQ83.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P07992.1: raw p = 0.1606, FDR p = 0.5 (not significant)
P12956.2: raw p = 0.3752, FDR p = 0.5 (not significant)
P13010.3: raw p = 0.5, FDR p = 0.5 (not significant)
P18887.3: raw p = 0.5, FDR p = 0.5 (n

In [2]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the BUSTED .log files to scan
log_dir = 'TOR_bustede'

# Output file: one protein ID per line, only those significant after FDR correction
output_file = 'TOR_bustede1_FDR.txt'

# (protein_name, raw p-value) pairs, collected first so the correction can be
# applied across all proteins at once
pvals_data = []

# 1. Parse each .log file and extract its p-value.
#    os.listdir() gives no ordering guarantee, so the names are sorted to keep
#    the printed report and the output file reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # The protein ID is the leading part of the filename. IDs starting with
    # NP/XP contain an underscore themselves (e.g. NP_061931.1), so they span
    # the first two '_'-separated pieces.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    # Stream the log and stop at the first likelihood-ratio-test line instead
    # of loading the whole file into memory.
    lrt_line = None
    with open(filepath, 'r') as log_f:
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' in line:
                lrt_line = line
                break

    # Exactly one diagnostic is printed per skipped file, naming the actual cause.
    if lrt_line is None:
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")
        continue

    match = re.search(r'p =\s*([\d\.eE+-]+)', lrt_line)
    if not match:
        print(f"{protein_name}: p-value not found in line: {lrt_line.strip()}")
        continue

    p_value_str = match.group(1)
    try:
        p_value = float(p_value_str)
    except ValueError:
        # The character class can capture non-numeric runs such as "1.2.3" or "-"
        print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
        continue

    pvals_data.append((protein_name, p_value))

# 2. Once all p-values are collected, apply FDR correction
p_values = [p_value for _name, p_value in pvals_data]

if not p_values:
    # Deliberately no exit() here: inside a notebook it ends the kernel session
    # (losing all state) rather than just stopping this cell.
    print("No p-values found. Exiting.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Report every protein; write only those rejected at FDR alpha = 0.05 to file
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

O60478.1: 'Likelihood ratio test' line not found or no p-value in O60478.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
NP_061931.1: raw p = 0, FDR p = 0 (SIGNIFICANT)
O00327.2: raw p = 0.4891, FDR p = 0.5 (not significant)
O15379.2: raw p = 0.5, FDR p = 0.5 (not significant)
O43504.1: raw p = 0.5, FDR p = 0.5 (not significant)
P08F94.1: raw p = 0.5, FDR p = 0.5 (not significant)
P23443.2: raw p = 0.5, FDR p = 0.5 (not significant)
P31749.2: raw p = 0.4907, FDR p = 0.5 (not significant)
P41159.1: raw p = 0.5, FDR p = 0.5 (not significant)
P42345.1: raw p = 0.5, FDR p = 0.5 (not significant)
P78509.3: raw p = 0.5, FDR p = 0.5 (not significant)
Q03113.4: raw p = 0.4968, FDR p = 0.5 (not significant)
Q13541.3: raw p = 0.0895, FDR p = 0.5 (not significant)
Q6R327.1: raw p = 0.5, FDR p = 0.5 (not significant)
Q8N122.1: raw p = 0.462, FDR p = 0.5 (not significant)
Q96A49.1: raw p = 0.4504, FDR p = 0.5 (not significant)
Q96B36.1: raw p = 0.5, FDR p = 0.5 (not significant)
Q9BPZ7.

In [3]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the BUSTED .log files to scan
log_dir = 'nucarc_bustede'

# Output file: one protein ID per line, only those significant after FDR correction
output_file = 'nucarc_bustede1_FDR.txt'

# (protein_name, raw p-value) pairs, collected first so the correction can be
# applied across all proteins at once
pvals_data = []

# 1. Parse each .log file and extract its p-value.
#    os.listdir() gives no ordering guarantee, so the names are sorted to keep
#    the printed report and the output file reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # The protein ID is the leading part of the filename. IDs starting with
    # NP/XP contain an underscore themselves (e.g. NP_001185486.1), so they
    # span the first two '_'-separated pieces.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    # Stream the log and stop at the first likelihood-ratio-test line instead
    # of loading the whole file into memory.
    lrt_line = None
    with open(filepath, 'r') as log_f:
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' in line:
                lrt_line = line
                break

    # Exactly one diagnostic is printed per skipped file, naming the actual cause.
    if lrt_line is None:
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")
        continue

    match = re.search(r'p =\s*([\d\.eE+-]+)', lrt_line)
    if not match:
        print(f"{protein_name}: p-value not found in line: {lrt_line.strip()}")
        continue

    p_value_str = match.group(1)
    try:
        p_value = float(p_value_str)
    except ValueError:
        # The character class can capture non-numeric runs such as "1.2.3" or "-"
        print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
        continue

    pvals_data.append((protein_name, p_value))

# 2. Once all p-values are collected, apply FDR correction
p_values = [p_value for _name, p_value in pvals_data]

if not p_values:
    # Deliberately no exit() here: inside a notebook it ends the kernel session
    # (losing all state) rather than just stopping this cell.
    print("No p-values found. Exiting.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Report every protein; write only those rejected at FDR alpha = 0.05 to file
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

P48739.2: 'Likelihood ratio test' line not found or no p-value in P48739.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P52948.4: 'Likelihood ratio test' line not found or no p-value in P52948.4_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
NP_001185486.1: raw p = 0.4973, FDR p = 0.5 (not significant)
NP_001265138.1: raw p = 0.5, FDR p = 0.5 (not significant)
O00499.1: raw p = 0.5, FDR p = 0.5 (not significant)
O94901.4: raw p = 0.0419, FDR p = 0.5 (not significant)
P02545.1: raw p = 0.5, FDR p = 0.5 (not significant)
P05771.4: raw p = 0.5, FDR p = 0.5 (not significant)
P17252.4: raw p = 0.2806, FDR p = 0.5 (not significant)
P26583.2: raw p = 0.5, FDR p = 0.5 (not significant)
P29590.3: raw p = 0.5, FDR p = 0.5 (not significant)
P43034.2: raw p = 0.5, FDR p = 0.5 (not significant)
P52594.2: raw p = 0.5, FDR p = 0.5 (not significant)
Q12912.3: raw p = 0.5, FDR p = 0.5 (not significant)
Q14693.2: raw p = 0.5, FDR p = 0.5 (not significant)
Q14980.2: raw p = 0.5, F

Without FDR correction (raw p < 0.05):
O94901.4: raw p = 0.0419



In [4]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the BUSTED .log files to scan
log_dir = 'mitdna_bustede'

# Output file: one protein ID per line, only those significant after FDR correction
output_file = 'mitdna_bustede1_FDR.txt'

# (protein_name, raw p-value) pairs, collected first so the correction can be
# applied across all proteins at once
pvals_data = []

# 1. Parse each .log file and extract its p-value.
#    os.listdir() gives no ordering guarantee, so the names are sorted to keep
#    the printed report and the output file reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # The protein ID is the leading part of the filename. IDs starting with
    # NP/XP are rebuilt from the first two '_'-separated pieces because the
    # ID itself contains an underscore; all other IDs are the first piece.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    # Stream the log and stop at the first likelihood-ratio-test line instead
    # of loading the whole file into memory.
    lrt_line = None
    with open(filepath, 'r') as log_f:
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' in line:
                lrt_line = line
                break

    # Exactly one diagnostic is printed per skipped file, naming the actual cause.
    if lrt_line is None:
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")
        continue

    match = re.search(r'p =\s*([\d\.eE+-]+)', lrt_line)
    if not match:
        print(f"{protein_name}: p-value not found in line: {lrt_line.strip()}")
        continue

    p_value_str = match.group(1)
    try:
        p_value = float(p_value_str)
    except ValueError:
        # The character class can capture non-numeric runs such as "1.2.3" or "-"
        print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
        continue

    pvals_data.append((protein_name, p_value))

# 2. Once all p-values are collected, apply FDR correction
p_values = [p_value for _name, p_value in pvals_data]

if not p_values:
    # Deliberately no exit() here: inside a notebook it ends the kernel session
    # (losing all state) rather than just stopping this cell.
    print("No p-values found. Exiting.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Report every protein; write only those rejected at FDR alpha = 0.05 to file
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

P23921.1: raw p = 0.5, FDR p = 0.5 (not significant)
Q12888: raw p = 0.5, FDR p = 0.5 (not significant)
Q9Y243.1: raw p = 0.5, FDR p = 0.5 (not significant)


In [5]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the BUSTED .log files to scan
log_dir = 'transcr_bustede'

# Output file: one protein ID per line, only those significant after FDR correction
output_file = 'transcr_bustede1_FDR.txt'

# (protein_name, raw p-value) pairs, collected first so the correction can be
# applied across all proteins at once
pvals_data = []

# 1. Parse each .log file and extract its p-value.
#    os.listdir() gives no ordering guarantee, so the names are sorted to keep
#    the printed report and the output file reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # The protein ID is the leading part of the filename. IDs starting with
    # NP/XP contain an underscore themselves (e.g. NP_001955.1), so they span
    # the first two '_'-separated pieces.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    # Stream the log and stop at the first likelihood-ratio-test line instead
    # of loading the whole file into memory.
    lrt_line = None
    with open(filepath, 'r') as log_f:
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' in line:
                lrt_line = line
                break

    # Exactly one diagnostic is printed per skipped file, naming the actual cause.
    if lrt_line is None:
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")
        continue

    match = re.search(r'p =\s*([\d\.eE+-]+)', lrt_line)
    if not match:
        print(f"{protein_name}: p-value not found in line: {lrt_line.strip()}")
        continue

    p_value_str = match.group(1)
    try:
        p_value = float(p_value_str)
    except ValueError:
        # The character class can capture non-numeric runs such as "1.2.3" or "-"
        print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
        continue

    pvals_data.append((protein_name, p_value))

# 2. Once all p-values are collected, apply FDR correction
p_values = [p_value for _name, p_value in pvals_data]

if not p_values:
    # Deliberately no exit() here: inside a notebook it ends the kernel session
    # (losing all state) rather than just stopping this cell.
    print("No p-values found. Exiting.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Report every protein; write only those rejected at FDR alpha = 0.05 to file
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

P52739.2: 'Likelihood ratio test' line not found or no p-value in P52739.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q06546.1: 'Likelihood ratio test' line not found or no p-value in Q06546.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q99592.1: 'Likelihood ratio test' line not found or no p-value in Q99592.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q9H422.1: 'Likelihood ratio test' line not found or no p-value in Q9H422.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
A8K855.1: raw p = 0.5, FDR p = 0.5 (not significant)
A8K8V0.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_001138774.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_001289889.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_001358355.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_001373664.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_001955.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_004307.2: raw p = 0.5, FDR p = 0.5 (not significant)
O00257.3: raw 

Without FDR correction (raw p < 0.05):
P09017.2: raw p = 0.0461
P17023.4: raw p = 0.0253
P19883.2: raw p = 0.027
Q01543.1: raw p = 0.0021
Q2M1K9.1: raw p = 0.0327

**chromatin remodel
INS
nucdna
alt in hist**

In [6]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the .log files for this gene set
log_dir = 'nucdna_bustede'

# Output file: receives one protein name per line, FDR-significant hits only
output_file = 'nucdna_bustede1_FDR.txt'

# All (protein_name, p_value) pairs are collected first, because the
# Benjamini-Hochberg correction below needs every p-value at once.
pvals_data = []

# 1. Parse each .log file and extract its p-value.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# printed report reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # Derive the protein name from the filename. Names starting with NP/XP
    # contain an underscore themselves (e.g. NP_001131026.1), so they span
    # the first two '_'-separated fields; any other name is the first field.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    found_pval = False
    with open(filepath, 'r') as log_f:
        # Stream the log line by line instead of loading it whole.
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' not in line:
                continue
            # Only the first matching line is inspected (see the break below).
            match = re.search(r'p =\s*([\d\.eE+-]+)', line)
            if match:
                p_value_str = match.group(1)
                try:
                    p_value = float(p_value_str)
                except ValueError:
                    print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
                else:
                    # Store this (protein, p-value)
                    pvals_data.append((protein_name, p_value))
                    found_pval = True
            else:
                print(f"{protein_name}: p-value not found in line: {line.strip()}")
            break  # Stop after finding the line
    if not found_pval:
        # Files without a usable p-value are reported and left out of the correction.
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")

# 2. Once all p-values are collected, apply the FDR correction
p_values = [pval for _name, pval in pvals_data]  # just the numeric p-values

if not p_values:
    # exit() is deliberately avoided: in a notebook it shuts down the kernel
    # rather than just ending this cell. Skipping the else-branch is enough.
    print("No p-values found. Skipping FDR correction.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Print every protein; write to file only those rejected at alpha = 0.05
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

O95243.1: 'Likelihood ratio test' line not found or no p-value in O95243.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P23025.1: 'Likelihood ratio test' line not found or no p-value in P23025.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P36639.4: 'Likelihood ratio test' line not found or no p-value in P36639.4_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P49642.1: 'Likelihood ratio test' line not found or no p-value in P49642.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q9NTJ3.2: 'Likelihood ratio test' line not found or no p-value in Q9NTJ3.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
NP_001131026.1: raw p = 0.2618, FDR p = 0.5 (not significant)
O00141.2: raw p = 0.5, FDR p = 0.5 (not significant)
O00571.3: raw p = 0.5, FDR p = 0.5 (not significant)
O43264.3: raw p = 0.5, FDR p = 0.5 (not significant)
O43543.1: raw p = 0.5, FDR p = 0.5 (not significant)
O43823.1: raw p = 0.5, FDR p = 0.5 (not significant)
O60216.2

In [7]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the .log files for this gene set
log_dir = 'INS_bustede'

# Output file: receives one protein name per line, FDR-significant hits only
output_file = 'INS_bustede1_FDR.txt'

# All (protein_name, p_value) pairs are collected first, because the
# Benjamini-Hochberg correction below needs every p-value at once.
pvals_data = []

# 1. Parse each .log file and extract its p-value.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# printed report reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # Derive the protein name from the filename. Names starting with NP/XP
    # contain an underscore themselves (e.g. NP_000589.2), so they span
    # the first two '_'-separated fields; any other name is the first field.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    found_pval = False
    with open(filepath, 'r') as log_f:
        # Stream the log line by line instead of loading it whole.
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' not in line:
                continue
            # Only the first matching line is inspected (see the break below).
            match = re.search(r'p =\s*([\d\.eE+-]+)', line)
            if match:
                p_value_str = match.group(1)
                try:
                    p_value = float(p_value_str)
                except ValueError:
                    print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
                else:
                    # Store this (protein, p-value)
                    pvals_data.append((protein_name, p_value))
                    found_pval = True
            else:
                print(f"{protein_name}: p-value not found in line: {line.strip()}")
            break  # Stop after finding the line
    if not found_pval:
        # Files without a usable p-value are reported and left out of the correction.
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")

# 2. Once all p-values are collected, apply the FDR correction
p_values = [pval for _name, pval in pvals_data]  # just the numeric p-values

if not p_values:
    # exit() is deliberately avoided: in a notebook it shuts down the kernel
    # rather than just ending this cell. Skipping the else-branch is enough.
    print("No p-values found. Skipping FDR correction.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Print every protein; write to file only those rejected at alpha = 0.05
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

NP_005535.1: 'Likelihood ratio test' line not found or no p-value in NP_005535.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
O00443.2: 'Likelihood ratio test' line not found or no p-value in O00443.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P23229.5: 'Likelihood ratio test' line not found or no p-value in P23229.5_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P29353.4: 'Likelihood ratio test' line not found or no p-value in P29353.4_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P43490.1: 'Likelihood ratio test' line not found or no p-value in P43490.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P98164.3: 'Likelihood ratio test' line not found or no p-value in P98164.3_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q92572.1: 'Likelihood ratio test' line not found or no p-value in Q92572.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
NP_000589.2: raw p = 0.0241, FDR p = 0.233 (not significant)
NP

In [8]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the .log files for this gene set
log_dir = 'chromremodel_bustede'

# Output file: receives one protein name per line, FDR-significant hits only
output_file = 'chromremodel_bustede1_FDR.txt'

# All (protein_name, p_value) pairs are collected first, because the
# Benjamini-Hochberg correction below needs every p-value at once.
pvals_data = []

# 1. Parse each .log file and extract its p-value.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# printed report reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # Derive the protein name from the filename. Names starting with NP/XP
    # contain an underscore themselves (e.g. NP_004955.2), so they span
    # the first two '_'-separated fields; any other name is the first field.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    found_pval = False
    with open(filepath, 'r') as log_f:
        # Stream the log line by line instead of loading it whole.
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' not in line:
                continue
            # Only the first matching line is inspected (see the break below).
            match = re.search(r'p =\s*([\d\.eE+-]+)', line)
            if match:
                p_value_str = match.group(1)
                try:
                    p_value = float(p_value_str)
                except ValueError:
                    print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
                else:
                    # Store this (protein, p-value)
                    pvals_data.append((protein_name, p_value))
                    found_pval = True
            else:
                print(f"{protein_name}: p-value not found in line: {line.strip()}")
            break  # Stop after finding the line
    if not found_pval:
        # Files without a usable p-value are reported and left out of the correction.
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")

# 2. Once all p-values are collected, apply the FDR correction
p_values = [pval for _name, pval in pvals_data]  # just the numeric p-values

if not p_values:
    # exit() is deliberately avoided: in a notebook it shuts down the kernel
    # rather than just ending this cell. Skipping the else-branch is enough.
    print("No p-values found. Skipping FDR correction.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Print every protein; write to file only those rejected at alpha = 0.05
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

A6NHR9.2: 'Likelihood ratio test' line not found or no p-value in A6NHR9.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
NP_004955.2: 'Likelihood ratio test' line not found or no p-value in NP_004955.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P07910.4: 'Likelihood ratio test' line not found or no p-value in P07910.4_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q92769.2: 'Likelihood ratio test' line not found or no p-value in Q92769.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
O15054.4: raw p = 0.5, FDR p = 0.5 (not significant)
O95863.2: raw p = 0.5, FDR p = 0.5 (not significant)
P06400.2: raw p = 0.2928, FDR p = 0.5 (not significant)
P11387.2: raw p = 0.5, FDR p = 0.5 (not significant)
P25440.2: raw p = 0.4989, FDR p = 0.5 (not significant)
P42898.3: raw p = 0.5, FDR p = 0.5 (not significant)
P51532.2: raw p = 0.5, FDR p = 0.5 (not significant)
P55209.1: raw p = 0.4037, FDR p = 0.5 (not significant)
P62805.2: raw p = 0.5, FDR p 

In [9]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the .log files for this gene set
log_dir = 'histalt_bustede'

# Output file: receives one protein name per line, FDR-significant hits only
output_file = 'histalt_bustede1_FDR.txt'

# All (protein_name, p_value) pairs are collected first, because the
# Benjamini-Hochberg correction below needs every p-value at once.
pvals_data = []

# 1. Parse each .log file and extract its p-value.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# printed report reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # Derive the protein name from the filename. Names starting with NP/XP
    # contain an underscore themselves (e.g. NP_056153.2), so they span
    # the first two '_'-separated fields; any other name is the first field.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    found_pval = False
    with open(filepath, 'r') as log_f:
        # Stream the log line by line instead of loading it whole.
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' not in line:
                continue
            # Only the first matching line is inspected (see the break below).
            match = re.search(r'p =\s*([\d\.eE+-]+)', line)
            if match:
                p_value_str = match.group(1)
                try:
                    p_value = float(p_value_str)
                except ValueError:
                    print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
                else:
                    # Store this (protein, p-value)
                    pvals_data.append((protein_name, p_value))
                    found_pval = True
            else:
                print(f"{protein_name}: p-value not found in line: {line.strip()}")
            break  # Stop after finding the line
    if not found_pval:
        # Files without a usable p-value are reported and left out of the correction.
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")

# 2. Once all p-values are collected, apply the FDR correction
p_values = [pval for _name, pval in pvals_data]  # just the numeric p-values

if not p_values:
    # exit() is deliberately avoided: in a notebook it shuts down the kernel
    # rather than just ending this cell. Skipping the else-branch is enough.
    print("No p-values found. Skipping FDR correction.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Print every protein; write to file only those rejected at alpha = 0.05
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

O43318.1: 'Likelihood ratio test' line not found or no p-value in O43318.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P23246.2: 'Likelihood ratio test' line not found or no p-value in P23246.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q13620.4: 'Likelihood ratio test' line not found or no p-value in Q13620.4_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
NP_001309133.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_056153.2: raw p = 0.4765, FDR p = 0.5 (not significant)
NP_085150.1: raw p = 0.5, FDR p = 0.5 (not significant)
O00213.2: raw p = 0.5, FDR p = 0.5 (not significant)
O15516.1: raw p = 0.5, FDR p = 0.5 (not significant)
O60341.2: raw p = 0.5, FDR p = 0.5 (not significant)
O60907.3: raw p = 0.5, FDR p = 0.5 (not significant)
O75582.1: raw p = 0.5, FDR p = 0.5 (not significant)
O95696.1: raw p = 0.5, FDR p = 0.5 (not significant)
P15336.4: raw p = 0.5, FDR p = 0.5 (not significant)
P25208.2: raw p = 0.5, FDR p = 0.5 (not significant)
P63

In [2]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the .log files for this gene set
log_dir = 'intercom_busted'

# Output file: receives one protein name per line, FDR-significant hits only
output_file = 'intercom_bustede1_FDR.txt'

# All (protein_name, p_value) pairs are collected first, because the
# Benjamini-Hochberg correction below needs every p-value at once.
pvals_data = []

# 1. Parse each .log file and extract its p-value.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# printed report reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # Derive the protein name from the filename. Names starting with NP/XP
    # contain an underscore themselves (e.g. NP_002515.1), so they span
    # the first two '_'-separated fields; any other name is the first field.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    found_pval = False
    with open(filepath, 'r') as log_f:
        # Stream the log line by line instead of loading it whole.
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' not in line:
                continue
            # Only the first matching line is inspected (see the break below).
            match = re.search(r'p =\s*([\d\.eE+-]+)', line)
            if match:
                p_value_str = match.group(1)
                try:
                    p_value = float(p_value_str)
                except ValueError:
                    print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
                else:
                    # Store this (protein, p-value)
                    pvals_data.append((protein_name, p_value))
                    found_pval = True
            else:
                print(f"{protein_name}: p-value not found in line: {line.strip()}")
            break  # Stop after finding the line
    if not found_pval:
        # Files without a usable p-value are reported and left out of the correction.
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")

# 2. Once all p-values are collected, apply the FDR correction
p_values = [pval for _name, pval in pvals_data]  # just the numeric p-values

if not p_values:
    # exit() is deliberately avoided: in a notebook it shuts down the kernel
    # rather than just ending this cell. Skipping the else-branch is enough.
    print("No p-values found. Skipping FDR correction.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Print every protein; write to file only those rejected at alpha = 0.05
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

O95399.1: 'Likelihood ratio test' line not found or no p-value in O95399.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P01210.1: 'Likelihood ratio test' line not found or no p-value in P01210.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P09651.5: 'Likelihood ratio test' line not found or no p-value in P09651.5_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P13473.2: 'Likelihood ratio test' line not found or no p-value in P13473.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q12959.2: 'Likelihood ratio test' line not found or no p-value in Q12959.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q9UPR0.2: 'Likelihood ratio test' line not found or no p-value in Q9UPR0.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
NP_001186256.3: raw p = 0.5, FDR p = 0.5 (not significant)
NP_001422887.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_002515.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_689867.1: raw p = 0.286, 

In [6]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the .log files for this gene set
log_dir = 'ampk_busted'

# Output file: receives one protein name per line, FDR-significant hits only
output_file = 'ampk_bustede1_FDR.txt'

# All (protein_name, p_value) pairs are collected first, because the
# Benjamini-Hochberg correction below needs every p-value at once.
pvals_data = []

# 1. Parse each .log file and extract its p-value.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# printed report reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # Derive the protein name from the filename. Names starting with NP/XP
    # are rebuilt from the first two '_'-separated fields (the name itself
    # contains an underscore); any other name is just the first field.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    found_pval = False
    with open(filepath, 'r') as log_f:
        # Stream the log line by line instead of loading it whole.
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' not in line:
                continue
            # Only the first matching line is inspected (see the break below).
            match = re.search(r'p =\s*([\d\.eE+-]+)', line)
            if match:
                p_value_str = match.group(1)
                try:
                    p_value = float(p_value_str)
                except ValueError:
                    print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
                else:
                    # Store this (protein, p-value)
                    pvals_data.append((protein_name, p_value))
                    found_pval = True
            else:
                print(f"{protein_name}: p-value not found in line: {line.strip()}")
            break  # Stop after finding the line
    if not found_pval:
        # Files without a usable p-value are reported and left out of the correction.
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")

# 2. Once all p-values are collected, apply the FDR correction
p_values = [pval for _name, pval in pvals_data]  # just the numeric p-values

if not p_values:
    # exit() is deliberately avoided: in a notebook it shuts down the kernel
    # rather than just ending this cell. Skipping the else-branch is enough.
    print("No p-values found. Skipping FDR correction.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Print every protein; write to file only those rejected at alpha = 0.05
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

O75844.2: raw p = 0.5, FDR p = 0.5 (not significant)
P07550.3: raw p = 0.5, FDR p = 0.5 (not significant)
P10644.1: raw p = 0.5, FDR p = 0.5 (not significant)
P13861.2: raw p = 0.5, FDR p = 0.5 (not significant)
P22694.2: raw p = 0.5, FDR p = 0.5 (not significant)
P31323.3: raw p = 0.5, FDR p = 0.5 (not significant)
P54619.1: raw p = 0.5, FDR p = 0.5 (not significant)
Q12802.2: raw p = 0.5, FDR p = 0.5 (not significant)
Q15848.1: raw p = 0.4683, FDR p = 0.5 (not significant)
Q96EB6.2: raw p = 0.5, FDR p = 0.5 (not significant)


In [None]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the .log files for this gene set
log_dir = 'stem_busted'

# Output file: receives one protein name per line, FDR-significant hits only
output_file = 'stem_bustede1_FDR.txt'

# All (protein_name, p_value) pairs are collected first, because the
# Benjamini-Hochberg correction below needs every p-value at once.
pvals_data = []

# 1. Parse each .log file and extract its p-value.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# printed report reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # Derive the protein name from the filename. Names starting with NP/XP
    # contain an underscore themselves (e.g. NP_976028.2), so they span
    # the first two '_'-separated fields; any other name is the first field.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    found_pval = False
    with open(filepath, 'r') as log_f:
        # Stream the log line by line instead of loading it whole.
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' not in line:
                continue
            # Only the first matching line is inspected (see the break below).
            match = re.search(r'p =\s*([\d\.eE+-]+)', line)
            if match:
                p_value_str = match.group(1)
                try:
                    p_value = float(p_value_str)
                except ValueError:
                    print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
                else:
                    # Store this (protein, p-value)
                    pvals_data.append((protein_name, p_value))
                    found_pval = True
            else:
                print(f"{protein_name}: p-value not found in line: {line.strip()}")
            break  # Stop after finding the line
    if not found_pval:
        # Files without a usable p-value are reported and left out of the correction.
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")

# 2. Once all p-values are collected, apply the FDR correction
p_values = [pval for _name, pval in pvals_data]  # just the numeric p-values

if not p_values:
    # exit() is deliberately avoided: in a notebook it shuts down the kernel
    # rather than just ending this cell. Skipping the else-branch is enough.
    print("No p-values found. Skipping FDR correction.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Print every protein; write to file only those rejected at alpha = 0.05
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

NP_976028.2: 'Likelihood ratio test' line not found or no p-value in NP_976028.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P35222.1: 'Likelihood ratio test' line not found or no p-value in P35222.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
NP_001269030.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_001365403.1: raw p = 0.2997, FDR p = 0.5 (not significant)
NP_001371922.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_001399735.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_004616.2: raw p = 0.5, FDR p = 0.5 (not significant)
O14905.3: raw p = 0.37, FDR p = 0.5 (not significant)
O43474.3: raw p = 0.4598, FDR p = 0.5 (not significant)
O43524.1: raw p = 0.5, FDR p = 0.5 (not significant)
O75084.2: raw p = 0.4559, FDR p = 0.5 (not significant)
P01023.3: raw p = 0.4794, FDR p = 0.5 (not significant)
P03372.2: raw p = 0.5, FDR p = 0.5 (not significant)
P10415.2: raw p = 0.5, FDR p = 0.5 (not significant)
P12821.1: raw p = 0.5, FDR p = 0.5 (not significant)

In [8]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing the .log files for this gene set
log_dir = 'degrprot_busted'

# Output file: receives one protein name per line, FDR-significant hits only
output_file = 'degrprot_bustede1_FDR.txt'

# All (protein_name, p_value) pairs are collected first, because the
# Benjamini-Hochberg correction below needs every p-value at once.
pvals_data = []

# 1. Parse each .log file and extract its p-value.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# printed report reproducible from run to run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # Derive the protein name from the filename. Names starting with NP/XP
    # are rebuilt from the first two '_'-separated fields (the name itself
    # contains an underscore); any other name is just the first field.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    found_pval = False
    with open(filepath, 'r') as log_f:
        # Stream the log line by line instead of loading it whole.
        for line in log_f:
            if 'Likelihood ratio test for episodic diversifying positive selection' not in line:
                continue
            # Only the first matching line is inspected (see the break below).
            match = re.search(r'p =\s*([\d\.eE+-]+)', line)
            if match:
                p_value_str = match.group(1)
                try:
                    p_value = float(p_value_str)
                except ValueError:
                    print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
                else:
                    # Store this (protein, p-value)
                    pvals_data.append((protein_name, p_value))
                    found_pval = True
            else:
                print(f"{protein_name}: p-value not found in line: {line.strip()}")
            break  # Stop after finding the line
    if not found_pval:
        # Files without a usable p-value are reported and left out of the correction.
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")

# 2. Once all p-values are collected, apply the FDR correction
p_values = [pval for _name, pval in pvals_data]  # just the numeric p-values

if not p_values:
    # exit() is deliberately avoided: in a notebook it shuts down the kernel
    # rather than just ending this cell. Skipping the else-branch is enough.
    print("No p-values found. Skipping FDR correction.")
else:
    # Benjamini-Hochberg procedure (FDR)
    # returns: reject (boolean array), pvals_corrected, alphacSidak, alphacBonf
    reject_array, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # 3. Print every protein; write to file only those rejected at alpha = 0.05
    with open(output_file, 'w') as out_f:
        for (protein_name, original_pval), fdr_pval, is_significant in zip(
                pvals_data, pvals_corrected, reject_array):
            if is_significant:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

O94782.1: 'Likelihood ratio test' line not found or no p-value in O94782.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P09871.1: 'Likelihood ratio test' line not found or no p-value in P09871.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P17655.6: 'Likelihood ratio test' line not found or no p-value in P17655.6_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P29144.4: 'Likelihood ratio test' line not found or no p-value in P29144.4_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P54578.3: 'Likelihood ratio test' line not found or no p-value in P54578.3_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q04609.1: 'Likelihood ratio test' line not found or no p-value in Q04609.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q4J6C6.1: 'Likelihood ratio test' line not found or no p-value in Q4J6C6.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q92643.2: 'Likelihood ratio test' line not found or no p-value in Q92

In [1]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing your .log files
log_dir = 'altmethyl_busted'

# Output file
output_file = 'altmethyl_bustede1_FDR.txt'

# We will store all (protein_name, p_value) in a list for later correction
pvals_data = []

# Text identifying the log line that carries the test's p-value.
lrt_marker = 'Likelihood ratio test for episodic diversifying positive selection'

# Strict numeric pattern after "p =": optional sign, digits with an optional
# decimal part (or a bare ".digits"), optional exponent. Unlike a loose
# character class such as [\d\.eE+-]+, it cannot swallow punctuation that
# follows the number (e.g. a sentence-ending '.'), which would make float()
# fail and silently drop the protein from the correction.
pval_pattern = re.compile(r'p =\s*([+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)')

# 1. Parse each .log file and extract p-values
# os.listdir() returns names in arbitrary order; sorting makes the printed
# report and the output file reproducible between runs and machines.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # Extract the protein name from the filename. Names starting with NP/XP
    # contain an underscore themselves (e.g. NP_004125.3), so they need the
    # first two '_'-separated pieces; everything else uses the first piece.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    # Scan the log lazily; only the first marker line is considered.
    found_pval = False
    with open(filepath, 'r') as log_f:
        for line in log_f:
            if lrt_marker not in line:
                continue
            match = pval_pattern.search(line)
            if match:
                p_value_str = match.group(1)
                try:
                    p_value = float(p_value_str)
                except ValueError:
                    print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
                else:
                    # Store this (protein, p-value)
                    pvals_data.append((protein_name, p_value))
                    found_pval = True
            else:
                print(f"{protein_name}: p-value not found in line: {line.strip()}")
            break  # Stop after finding the line
    if not found_pval:
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")

# 2. Once we've collected all p-values, apply FDR correction
p_values = [x[1] for x in pvals_data]  # just the numeric p-values

if len(p_values) == 0:
    # Deliberately no exit() here: inside a notebook exit() asks the kernel to
    # shut down, which discards all state and aborts "Run All". Steps 2 and 3
    # are simply skipped instead; nothing else follows them in this cell.
    print("No p-values found. Exiting.")
else:
    # Benjamini-Hochberg procedure (FDR).
    # multipletests returns (reject, pvals_corrected, alphacSidak, alphacBonf);
    # only the first two are needed, and slicing avoids assigning to `_`,
    # which IPython uses for the last cell output.
    reject_array, pvals_corrected = multipletests(p_values, alpha=0.05, method='fdr_bh')[:2]

    # 3. Write results to file: one protein name per line, only for tests that
    #    multipletests rejects at alpha = 0.05. Every protein is also printed
    #    with its raw and adjusted p-value.
    with open(output_file, 'w') as out_f:
        # The three sequences are index-aligned because p_values was built
        # from pvals_data in order.
        for (protein_name, original_pval), fdr_pval, is_rejected in zip(pvals_data, pvals_corrected, reject_array):
            if is_rejected:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

Q13129.2: 'Likelihood ratio test' line not found or no p-value in Q13129.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
O15297.1: raw p = 0.5, FDR p = 0.5 (not significant)
O60264.1: raw p = 0.5, FDR p = 0.5 (not significant)
O94776.1: raw p = 0.5, FDR p = 0.5 (not significant)
P01100.1: raw p = 0.3754, FDR p = 0.5 (not significant)
P01106.2: raw p = 0.5, FDR p = 0.5 (not significant)
P35226.2: raw p = 0.5, FDR p = 0.5 (not significant)
P42336.2: raw p = 0.5, FDR p = 0.5 (not significant)
P46100.6: raw p = 0.5, FDR p = 0.5 (not significant)
P49840.2: raw p = 0.4991, FDR p = 0.5 (not significant)
Q15910.2: raw p = 0.5, FDR p = 0.5 (not significant)
Q5T5X7.1: raw p = 0.2406, FDR p = 0.5 (not significant)
Q99549.2: raw p = 0.4925, FDR p = 0.5 (not significant)
Q9H9B1.4: raw p = 0, FDR p = 0 (SIGNIFICANT)
Q9NRZ9.1: raw p = 0.5, FDR p = 0.5 (not significant)
Q9UIF9.4: raw p = 0.5, FDR p = 0.5 (not significant)


In [2]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing your .log files
log_dir = 'impfold_busted'

# Output file
output_file = 'impfold_bustede1_FDR.txt'

# We will store all (protein_name, p_value) in a list for later correction
pvals_data = []

# Text identifying the log line that carries the test's p-value.
lrt_marker = 'Likelihood ratio test for episodic diversifying positive selection'

# Strict numeric pattern after "p =": optional sign, digits with an optional
# decimal part (or a bare ".digits"), optional exponent. Unlike a loose
# character class such as [\d\.eE+-]+, it cannot swallow punctuation that
# follows the number (e.g. a sentence-ending '.'), which would make float()
# fail and silently drop the protein from the correction.
pval_pattern = re.compile(r'p =\s*([+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)')

# 1. Parse each .log file and extract p-values
# os.listdir() returns names in arbitrary order; sorting makes the printed
# report and the output file reproducible between runs and machines.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # Extract the protein name from the filename. Names starting with NP/XP
    # contain an underscore themselves (e.g. NP_004125.3), so they need the
    # first two '_'-separated pieces; everything else uses the first piece.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    # Scan the log lazily; only the first marker line is considered.
    found_pval = False
    with open(filepath, 'r') as log_f:
        for line in log_f:
            if lrt_marker not in line:
                continue
            match = pval_pattern.search(line)
            if match:
                p_value_str = match.group(1)
                try:
                    p_value = float(p_value_str)
                except ValueError:
                    print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
                else:
                    # Store this (protein, p-value)
                    pvals_data.append((protein_name, p_value))
                    found_pval = True
            else:
                print(f"{protein_name}: p-value not found in line: {line.strip()}")
            break  # Stop after finding the line
    if not found_pval:
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")

# 2. Once we've collected all p-values, apply FDR correction
p_values = [x[1] for x in pvals_data]  # just the numeric p-values

if len(p_values) == 0:
    # Deliberately no exit() here: inside a notebook exit() asks the kernel to
    # shut down, which discards all state and aborts "Run All". Steps 2 and 3
    # are simply skipped instead; nothing else follows them in this cell.
    print("No p-values found. Exiting.")
else:
    # Benjamini-Hochberg procedure (FDR).
    # multipletests returns (reject, pvals_corrected, alphacSidak, alphacBonf);
    # only the first two are needed, and slicing avoids assigning to `_`,
    # which IPython uses for the last cell output.
    reject_array, pvals_corrected = multipletests(p_values, alpha=0.05, method='fdr_bh')[:2]

    # 3. Write results to file: one protein name per line, only for tests that
    #    multipletests rejects at alpha = 0.05. Every protein is also printed
    #    with its raw and adjusted p-value.
    with open(output_file, 'w') as out_f:
        # The three sequences are index-aligned because p_values was built
        # from pvals_data in order.
        for (protein_name, original_pval), fdr_pval, is_rejected in zip(pvals_data, pvals_corrected, reject_array):
            if is_rejected:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

NP_004125.3: 'Likelihood ratio test' line not found or no p-value in NP_004125.3_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P14625.1: 'Likelihood ratio test' line not found or no p-value in P14625.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
P48723.1: 'Likelihood ratio test' line not found or no p-value in P48723.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
Q0VDF9.1: 'Likelihood ratio test' line not found or no p-value in Q0VDF9.1_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
O43516.3: raw p = 0.5, FDR p = 0.5 (not significant)
O76031.2: raw p = 0.5, FDR p = 0.5 (not significant)
P07900.5: raw p = 0.5, FDR p = 0.5 (not significant)
P08238.4: raw p = 0.5, FDR p = 0.5 (not significant)
P10809.2: raw p = 0.5, FDR p = 0.5 (not significant)
P10909.1: raw p = 0.5, FDR p = 0.5 (not significant)
P50502.2: raw p = 0.5, FDR p = 0.5 (not significant)
P50542.3: raw p = 0.5, FDR p = 0.5 (not significant)
P54652.1: raw p = 0.5, FDR p = 0.5 (no

In [3]:
#!/usr/bin/env python3

import os
import re
from statsmodels.stats.multitest import multipletests

# Directory containing your .log files
log_dir = 'dismacro_busted'

# Output file
output_file = 'dismacro_bustede1_FDR.txt'

# We will store all (protein_name, p_value) in a list for later correction
pvals_data = []

# Text identifying the log line that carries the test's p-value.
lrt_marker = 'Likelihood ratio test for episodic diversifying positive selection'

# Strict numeric pattern after "p =": optional sign, digits with an optional
# decimal part (or a bare ".digits"), optional exponent. Unlike a loose
# character class such as [\d\.eE+-]+, it cannot swallow punctuation that
# follows the number (e.g. a sentence-ending '.'), which would make float()
# fail and silently drop the protein from the correction.
pval_pattern = re.compile(r'p =\s*([+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)')

# 1. Parse each .log file and extract p-values
# os.listdir() returns names in arbitrary order; sorting makes the printed
# report and the output file reproducible between runs and machines.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith('.log'):
        continue
    filepath = os.path.join(log_dir, filename)

    # Extract the protein name from the filename. Names starting with NP/XP
    # contain an underscore themselves (e.g. NP_001307809.1), so they need the
    # first two '_'-separated pieces; everything else uses the first piece.
    parts = filename.split('_')
    if parts[0].startswith(('NP', 'XP')):
        protein_name = '_'.join(parts[:2])
    else:
        protein_name = parts[0]

    # Scan the log lazily; only the first marker line is considered.
    found_pval = False
    with open(filepath, 'r') as log_f:
        for line in log_f:
            if lrt_marker not in line:
                continue
            match = pval_pattern.search(line)
            if match:
                p_value_str = match.group(1)
                try:
                    p_value = float(p_value_str)
                except ValueError:
                    print(f"{protein_name}: Could not convert p-value '{p_value_str}' to float")
                else:
                    # Store this (protein, p-value)
                    pvals_data.append((protein_name, p_value))
                    found_pval = True
            else:
                print(f"{protein_name}: p-value not found in line: {line.strip()}")
            break  # Stop after finding the line
    if not found_pval:
        print(f"{protein_name}: 'Likelihood ratio test' line not found or no p-value in {filename}")

# 2. Once we've collected all p-values, apply FDR correction
p_values = [x[1] for x in pvals_data]  # just the numeric p-values

if len(p_values) == 0:
    # Deliberately no exit() here: inside a notebook exit() asks the kernel to
    # shut down, which discards all state and aborts "Run All". Steps 2 and 3
    # are simply skipped instead; nothing else follows them in this cell.
    print("No p-values found. Exiting.")
else:
    # Benjamini-Hochberg procedure (FDR).
    # multipletests returns (reject, pvals_corrected, alphacSidak, alphacBonf);
    # only the first two are needed, and slicing avoids assigning to `_`,
    # which IPython uses for the last cell output.
    reject_array, pvals_corrected = multipletests(p_values, alpha=0.05, method='fdr_bh')[:2]

    # 3. Write results to file: one protein name per line, only for tests that
    #    multipletests rejects at alpha = 0.05. Every protein is also printed
    #    with its raw and adjusted p-value.
    with open(output_file, 'w') as out_f:
        # The three sequences are index-aligned because p_values was built
        # from pvals_data in order.
        for (protein_name, original_pval), fdr_pval, is_rejected in zip(pvals_data, pvals_corrected, reject_array):
            if is_rejected:
                out_f.write(protein_name + '\n')
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (SIGNIFICANT)")
            else:
                print(f"{protein_name}: raw p = {original_pval:.4g}, FDR p = {fdr_pval:.4g} (not significant)")

Q9UKA2.2: 'Likelihood ratio test' line not found or no p-value in Q9UKA2.2_nucleotide.fasta_aligned_codon_alignment.fasta_busted.log
NP_001307809.1: raw p = 0.5, FDR p = 0.5 (not significant)
NP_057658.2: raw p = 0.5, FDR p = 0.5 (not significant)
O15294.3: raw p = 0.5, FDR p = 0.5 (not significant)
P05109.1: raw p = 0.1215, FDR p = 0.5 (not significant)
P06702.1: raw p = 0.5, FDR p = 0.5 (not significant)
P09429.3: raw p = 0.5, FDR p = 0.5 (not significant)
P16615.1: raw p = 0.5, FDR p = 0.5 (not significant)
P42338.1: raw p = 0.3682, FDR p = 0.5 (not significant)
P49768.1: raw p = 0.5, FDR p = 0.5 (not significant)
P55061.2: raw p = 0.4861, FDR p = 0.5 (not significant)
P55072.4: raw p = 0.5, FDR p = 0.5 (not significant)
P62820.3: raw p = 0.4502, FDR p = 0.5 (not significant)
Q12778.2: raw p = 0.5, FDR p = 0.5 (not significant)
Q12983.4: raw p = 0.5, FDR p = 0.5 (not significant)
Q13501.1: raw p = 0.281, FDR p = 0.5 (not significant)
Q14457.2: raw p = 0.4936, FDR p = 0.5 (not signif