In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, lzma
from scipy.stats import pmean

In [2]:
# Notebook-wide matplotlib styling: tight layout on save, editable text in
# SVG exports, Arial 12 pt, and de-emphasised box-plot outliers.
_rc_overrides = {
    'figure.autolayout': True,
    'savefig.bbox': 'tight',
    'svg.fonttype': 'none',
    'font.sans-serif': 'Arial',
    'font.size': 12,
    'boxplot.medianprops.linewidth': 2,
    'boxplot.flierprops.markeredgecolor': 'none',
    'boxplot.flierprops.markersize': 5,
}
plt.rcParams.update(_rc_overrides)

Functions

In [3]:
def adjMatv2(edges_ogs, edges_genomes):
    """Build a gene-by-genome count matrix from two parallel edge lists.

    Parameters
    ----------
    edges_ogs : array-like (1D)
        Gene (orthologous group) label of every edge.
    edges_genomes : array-like (1D)
        Genome label of every edge, aligned element-wise with `edges_ogs`.

    Returns
    -------
    adj : ndarray (2D)
        Matrix of shape (n_genes, n_genomes); cell (i, j) holds how many
        edges connect gene `ogs[i]` to genome `genomes[j]`.
    genomes : ndarray (1D)
        Sorted unique genome labels (column order of `adj`).
    ogs : ndarray (1D)
        Sorted unique gene labels (row order of `adj`).

    """
    # Sorted unique labels plus, for every edge, its row / column position
    ogs, og_idx = np.unique(edges_ogs, return_inverse = True)
    genomes, genome_idx = np.unique(edges_genomes, return_inverse = True)
    n_ogs, n_genomes = len(ogs), len(genomes)
    # Encode every (gene, genome) pair as its row-major flat cell index
    flat_cells = og_idx * n_genomes + genome_idx
    # Count edges per cell, then fold the flat counts back into a matrix
    cell_counts = np.bincount(flat_cells, minlength = n_ogs * n_genomes)
    adj = cell_counts.reshape(n_ogs, n_genomes)
    return adj, genomes, ogs

In [4]:
def save_genes(genes, fOut):
    """Write one identifier per line to a text file.

    Parameters
    ----------
    genes : iterable
        Identifiers to write; each one is formatted with `str.format`
        semantics and followed by a newline.
    fOut : str or path-like
        Destination file; it is created or overwritten.

    """
    with open(fOut, 'w') as handle:
        handle.writelines(f'{gene}\n' for gene in genes)

In [5]:
def remove_genes(adj_mod, min_genomes = 4):
    """Return the row indices of genes whose row total is below a cutoff.

    Parameters
    ----------
    adj_mod : ndarray
        Gene-by-genome matrix. For a presence/absence matrix the row total
        is the number of genomes that carry the gene.
    min_genomes : int, optional
        Rows whose total is strictly smaller than this value are reported.
        Defaults to 4.

    Returns
    -------
    ndarray (1D) of int
        Indices of the rows to drop, in ascending order. The array always
        has an integer dtype, even when it is empty, so it can be passed
        straight to `np.delete` (which rejects float index arrays).

    """
    adj_mod = np.asarray(adj_mod)
    # Sum over every axis except the first so that each row yields one total
    row_totals = adj_mod.sum(axis = tuple(range(1, adj_mod.ndim)))
    return np.flatnonzero(row_totals < min_genomes)

In [6]:
def greedy_power_mean_sample_mask(data, k, p, pseudocount):
    """Greedily pick rows so that a per-row summary of the running column
    totals is maximized at every step.

    At each step every still-available row is tentatively added to the
    running per-column totals; the row whose resulting totals score highest
    (power mean, minimum or maximum across columns) is selected. Ties are
    resolved in favour of the lowest row index.

    Parameters
    ----------
    data : ndarray of shape (2D)
        Input data matrix (gene by species).
    k : int
        Number of rows (genes) to select. Must be smaller than the number
        of rows in `data`.
    p : int, float or str
        Exponent of the generalized (power) mean, or one of the special
        values "min" or "max".
    pseudocount : float
        Pseudocount added to each cell value before selection. Because the
        running totals are built from the pseudocounted rows, it is
        accumulated once per selected row.

    Returns
    -------
    ndarray (1D) of int
        Row indices selected in order. The first `j` entries are exactly
        what a call with `k = j` would return.

    Raises
    ------
    ValueError
        If the matrix is empty or all zeroes, if `k` is not smaller than
        the number of rows, or if `p` is not a number, "min" or "max".

    """

    # numbers of genes (n) and species (m)
    n, m = data.shape

    # matrix is empty
    if n == 0 or m == 0:
        raise ValueError('Matrix is empty!')

    # matrix contains only zeroes
    if (data == 0).all():
        raise ValueError('Matrix only contains 0\'s')

    if k >= n:
        raise ValueError(f'k should be smaller than {n}')

    # Resolve the scoring rule once instead of on every iteration
    if isinstance(p, str):
        if p == 'min':
            def score(sums):
                return sums.min(axis = 1)
        elif p == 'max':
            def score(sums):
                return sums.max(axis = 1)
        else:
            raise ValueError(f'Invalid p: {p}.')
    elif isinstance(p, (int, np.integer)):
        # pmean only accepts built-in numbers, so unwrap NumPy integers
        exponent = int(p)
        def score(sums):
            return pmean(sums, exponent, axis = 1)
    elif isinstance(p, (float, np.floating)):
        exponent = float(p)
        def score(sums):
            return pmean(sums, exponent, axis = 1)
    else:
        raise ValueError(f'Invalid p: {p}.')

    # Add pseudocount
    data = data.astype(np.float64, copy = False) + pseudocount

    # cumulative gene counts
    counts = np.zeros(m, dtype = np.float64)

    # Boolean mask to keep track of available genes
    available_row_mask = np.ones(n, dtype = bool)

    # indices of selected genes
    selected = np.empty(k, dtype = int)

    # iteratively select k genes
    for i in range(k):

        # original row indices of the genes that can still be chosen
        candidate_rows = np.flatnonzero(available_row_mask)

        # per-species counts after tentatively adding each candidate gene
        sums_ = counts + data[candidate_rows, :]

        # position (within the candidates) of the best-scoring gene
        choice = score(sums_).argmax()
        chosen_original_idx = candidate_rows[choice]

        # record the gene and adopt its totals as the new running counts
        selected[i] = chosen_original_idx
        counts = sums_[choice, :]

        # the gene can no longer be selected
        available_row_mask[chosen_original_idx] = False

    return selected

In [7]:
def load_annotations_eggnog(file, genomes_sampled):
    """Load eggNOG annotations of the sampled genomes from an xz-compressed,
    tab-separated file.

    Each line is read positionally: column 0 is the ORF identifier, column 3
    the score and column 4 the orthologous-group field. The genome is the
    part of the ORF identifier before the first underscore, and the
    orthologous group is the part of column 4 before the first '|', with
    anything from '@' onwards removed.

    Parameters
    ----------
    file : str or path-like
        Path to the xz-compressed annotation table.
    genomes_sampled : array-like of str
        Genome identifiers to keep; all other lines are skipped.

    Returns
    -------
    DataFrame
        Indexed by 'orf', with columns 'score' (float), 'og' and 'genome'.

    """
    # Hash-based membership: testing against a NumPy array would rescan the
    # whole array for every line of the file
    if isinstance(genomes_sampled, (set, frozenset)):
        wanted = genomes_sampled
    else:
        wanted = set(np.atleast_1d(genomes_sampled).ravel().tolist())

    records = []
    with lzma.open(file, mode = 'rt') as f:
        for line in f:
            row = line.strip().split('\t')
            orf, score, og = row[0], row[3], row[4].split('|')[0]
            genome = orf.split('_')[0]
            if genome in wanted:
                records.append([orf, float(score), og.split('@')[0], genome])

    df_eggnog = pd.DataFrame(records, columns = ['orf', 'score', 'og', 'genome'])
    return df_eggnog.set_index('orf')

In [8]:
def load_annotations_kegg(file, genomes_sampled):
    """Load KEGG annotations of the sampled genomes from an xz-compressed,
    tab-separated file.

    Each line is read positionally: column 0 is the ORF identifier, column 1
    the KO identifier and column 3 the score. The genome is the part of the
    ORF identifier before the first underscore.

    Parameters
    ----------
    file : str or path-like
        Path to the xz-compressed annotation table.
    genomes_sampled : array-like of str
        Genome identifiers to keep; all other lines are skipped.

    Returns
    -------
    DataFrame
        Indexed by 'orf', with columns 'score' (float), 'ko' and 'genome'.

    """
    # Hash-based membership: testing against a NumPy array would rescan the
    # whole array for every line of the file
    if isinstance(genomes_sampled, (set, frozenset)):
        wanted = genomes_sampled
    else:
        wanted = set(np.atleast_1d(genomes_sampled).ravel().tolist())

    records = []
    with lzma.open(file, mode = 'rt') as f:
        for line in f:
            row = line.strip().split('\t')
            orf, ko, score = row[0], row[1], row[3]
            genome = orf.split('_')[0]
            if genome in wanted:
                records.append([orf, float(score), ko, genome])

    df_kegg = pd.DataFrame(records, columns = ['orf', 'score', 'ko', 'genome'])
    return df_kegg.set_index('orf')

Select markers

In [9]:
# Indices of the genome-sampling replicates to process (replicate_0 ... replicate_9)
replicates = np.arange(10)
# Marker-set sizes to export; selection is greedy, so smaller sets are
# prefixes of larger ones
ks = [50, 100, 200, 400]
# Exponent of the power mean used as selection criterion (scipy's pmean
# treats 0 as the geometric mean)
p = 0
# Root directory holding one `replicate_<i>` sub-directory per replicate;
# results are written back into the same tree
dataPathIn = './out'
# xz-compressed annotation tables covering all genomes
file_annotations_eggnog = '../markers_selection/input_data/wol2/emapper.annotations.xz'
file_annotations_kegg = '../markers_selection/input_data/wol2/kofamscan.tsv.xz'

In [10]:
# Fraction of the best score a hit must reach to be kept; 1.0 keeps only the
# top-scoring hit(s) of every genome / orthologous group combination
threshold = 1.0
# Largest requested marker set. The greedy selection is deterministic and each
# step depends only on the previous picks, so every smaller set is a prefix.
k_max = max(ks)
for rep in replicates:
    print(f'At replicate {rep}')
    genomes_sampled = np.loadtxt(f'{dataPathIn}/replicate_{rep}/genomes/genomes.txt', dtype = str)
    df_eggnog = load_annotations_eggnog(file_annotations_eggnog, genomes_sampled)
    # Best score of every genome / orthologous group combination, aligned to the rows
    max_scores = df_eggnog.groupby(['genome', 'og'])['score'].transform('max')
    # Keep rows where the score is greater than or equal to max_score * threshold
    filtered_df = df_eggnog[df_eggnog['score'] >= max_scores * threshold]
    # Convert to edges
    edges_genomes = filtered_df['genome'].values
    edges_ogs = filtered_df['og'].values
    # Build copy number matrix
    adj, genomes, ogs = adjMatv2(edges_ogs, edges_genomes)
    print(f'\tCopy number matrix shape: {adj.shape}')
    # adj2 keeps the copy numbers; adj1 is the SAME array as adj, which is
    # binarized in place into a presence/absence matrix
    adj1 = adj
    adj2 = adj1.copy()
    adj1[adj1 > 1] = 1
    # Genes present in fewer than 4 genomes. The cast guarantees an integer
    # index even when nothing is removed (np.delete rejects float index arrays).
    remove = np.asarray(remove_genes(adj1), dtype = np.intp)
    # np.delete returns new arrays, so no explicit copies are needed
    adj_mod = np.delete(adj1, remove, axis = 0)
    ogs_mod = np.delete(ogs, remove, axis = 0)
    adj_mod2 = np.delete(adj2, remove, axis = 0)
    print(f'\tCopy number matrix after filtering: {adj_mod2.shape}')
    # Select markers once for the largest k and slice the nested smaller sets
    ranked_genes = greedy_power_mean_sample_mask(adj_mod2, k = k_max, p = p, pseudocount = 0.1)
    selected_genes_mod = {k: ranked_genes[:k] for k in ks}
    # Save markers
    os.makedirs(f'{dataPathIn}/replicate_{rep}/markers/eggnog', exist_ok = True)
    for k, markers in selected_genes_mod.items():
        save_genes(ogs_mod[markers], f'{dataPathIn}/replicate_{rep}/markers/eggnog/select_k_{k}_p_{p}.txt')
    # Save ORFs
    os.makedirs(f'{dataPathIn}/replicate_{rep}/orfs/eggnog', exist_ok = True)
    # Because smaller sets are nested in larger sets of selected genes, we only need to save the largest set
    largest_set = ogs_mod[selected_genes_mod[k_max]]
    # Group the ORFs of the selected genes in one pass instead of scanning
    # the whole table once per gene
    is_selected = filtered_df['og'].isin(largest_set)
    orfs_by_gene = filtered_df[is_selected].groupby('og').groups
    for gene in largest_set:
        save_genes(orfs_by_gene[gene], f'{dataPathIn}/replicate_{rep}/orfs/eggnog/{gene}.txt')
    

At replicate 0
	Copy number matrix shape: (32736, 568)
	Copy number matrix after filtering: (10576, 568)
At replicate 1
	Copy number matrix shape: (37283, 569)
	Copy number matrix after filtering: (11594, 569)
At replicate 2
	Copy number matrix shape: (21263, 567)
	Copy number matrix after filtering: (6746, 567)
At replicate 3
	Copy number matrix shape: (46255, 566)
	Copy number matrix after filtering: (16343, 566)
At replicate 4
	Copy number matrix shape: (27550, 567)
	Copy number matrix after filtering: (9462, 567)
At replicate 5
	Copy number matrix shape: (25034, 568)
	Copy number matrix after filtering: (10182, 568)
At replicate 6
	Copy number matrix shape: (50106, 568)
	Copy number matrix after filtering: (16570, 568)
At replicate 7
	Copy number matrix shape: (37750, 567)
	Copy number matrix after filtering: (12872, 567)
At replicate 8
	Copy number matrix shape: (40651, 568)
	Copy number matrix after filtering: (12770, 568)
At replicate 9
	Copy number matrix shape: (42685, 568)
	C

In [11]:
# Fraction of the best score a hit must reach to be kept; 1.0 keeps only the
# top-scoring hit(s) of every genome / KO combination
threshold = 1.0
# Largest requested marker set. The greedy selection is deterministic and each
# step depends only on the previous picks, so every smaller set is a prefix.
k_max = max(ks)
for rep in replicates:
    print(f'At replicate {rep}')
    genomes_sampled = np.loadtxt(f'{dataPathIn}/replicate_{rep}/genomes/genomes.txt', dtype = str)
    df_kegg = load_annotations_kegg(file_annotations_kegg, genomes_sampled)
    # Best score of every genome / KO combination, aligned to the rows
    max_scores = df_kegg.groupby(['genome', 'ko'])['score'].transform('max')
    # Keep rows where the score is greater than or equal to max_score * threshold
    filtered_df = df_kegg[df_kegg['score'] >= max_scores * threshold]
    # Convert to edges
    edges_genomes = filtered_df['genome'].values
    edges_ogs = filtered_df['ko'].values
    # Build copy number matrix
    adj, genomes, ogs = adjMatv2(edges_ogs, edges_genomes)
    print(f'\tCopy number matrix shape: {adj.shape}')
    # adj2 keeps the copy numbers; adj1 is the SAME array as adj, which is
    # binarized in place into a presence/absence matrix
    adj1 = adj
    adj2 = adj1.copy()
    adj1[adj1 > 1] = 1
    # Genes present in fewer than 4 genomes. The cast guarantees an integer
    # index even when nothing is removed (np.delete rejects float index arrays).
    remove = np.asarray(remove_genes(adj1), dtype = np.intp)
    # np.delete returns new arrays, so no explicit copies are needed
    adj_mod = np.delete(adj1, remove, axis = 0)
    ogs_mod = np.delete(ogs, remove, axis = 0)
    adj_mod2 = np.delete(adj2, remove, axis = 0)
    print(f'\tCopy number matrix after filtering: {adj_mod2.shape}')
    # Select markers once for the largest k and slice the nested smaller sets
    ranked_genes = greedy_power_mean_sample_mask(adj_mod2, k = k_max, p = p, pseudocount = 0.1)
    selected_genes_mod = {k: ranked_genes[:k] for k in ks}
    # Save markers
    os.makedirs(f'{dataPathIn}/replicate_{rep}/markers/kegg', exist_ok = True)
    for k, markers in selected_genes_mod.items():
        save_genes(ogs_mod[markers], f'{dataPathIn}/replicate_{rep}/markers/kegg/select_k_{k}_p_{p}.txt')
    # Save ORFs
    os.makedirs(f'{dataPathIn}/replicate_{rep}/orfs/kegg', exist_ok = True)
    # Because smaller sets are nested in larger sets of selected genes, we only need to save the largest set
    largest_set = ogs_mod[selected_genes_mod[k_max]]
    # Group the ORFs of the selected genes in one pass instead of scanning
    # the whole table once per gene
    is_selected = filtered_df['ko'].isin(largest_set)
    orfs_by_gene = filtered_df[is_selected].groupby('ko').groups
    for gene in largest_set:
        save_genes(orfs_by_gene[gene], f'{dataPathIn}/replicate_{rep}/orfs/kegg/{gene}.txt')
    

At replicate 0
	Copy number matrix shape: (7532, 568)
	Copy number matrix after filtering: (6001, 568)
At replicate 1
	Copy number matrix shape: (7065, 569)
	Copy number matrix after filtering: (5854, 569)
At replicate 2
	Copy number matrix shape: (5856, 567)
	Copy number matrix after filtering: (4343, 567)
At replicate 3
	Copy number matrix shape: (7150, 566)
	Copy number matrix after filtering: (6051, 566)
At replicate 4
	Copy number matrix shape: (6629, 567)
	Copy number matrix after filtering: (5196, 567)
At replicate 5
	Copy number matrix shape: (6322, 568)
	Copy number matrix after filtering: (4961, 568)
At replicate 6
	Copy number matrix shape: (7204, 568)
	Copy number matrix after filtering: (5953, 568)
At replicate 7
	Copy number matrix shape: (6606, 567)
	Copy number matrix after filtering: (5464, 567)
At replicate 8
	Copy number matrix shape: (7306, 568)
	Copy number matrix after filtering: (6072, 568)
At replicate 9
	Copy number matrix shape: (6918, 568)
	Copy number matrix