Gene Expression Data Aggregation

Overview: This notebook aggregates gene expression statistics across multiple samples, producing summary tables for selected metrics. It is designed to work with output data from single-cell RNA sequencing (scRNA-seq) preprocessing.

The workflow involves:
1. Loading each sample file to identify and collect all unique genes across samples.
2. Prompting the user to specify the number of genes to process. The user may choose either all genes or a random subset for testing.
3. Processing each sample in parallel, reindexing to ensure consistent gene representation across samples.
4. Aggregating each gene’s statistics across samples into three tables: sum, mean, and variance, with genes as rows and samples as columns.
5. Saving the aggregated tables as CSV files in the 'aggregated_gene_statistics' output directory for further analysis.

In [32]:
# Required Libraries
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import random

In [33]:
# Function to load gene statistics data from a CSV file 
def load_gene_statistics(file_path):
    """
    Loads gene statistics from a CSV file into a DataFrame.
    
    Parameters:
    - file_path: str, path to the gene statistics file
    
    Returns:
    - df: DataFrame, with 'Gene' as index and columns for 'Sum', 'Mean', and 'Variance'
    """
    try:
        df = pd.read_csv(file_path, index_col='Gene')
        return df
    except Exception as e:
        print(f"Error loading file {file_path}: {e}")
        return None

In [34]:
# Function to process a single file and update the dictionaries
def process_file(file_path, selected_genes, sample_name):
    """
    Processes a single gene statistics file, ensuring that only selected genes are included.
    
    Parameters:
    - file_path: str, path to the gene statistics file
    - selected_genes: set, set of selected genes to process
    - sample_name: str, name of the sample derived from the file name
    
    Returns:
    - sample_name: str, name of the sample
    - sum_series, mean_series, variance_series: Series, containing Sum, Mean, and Variance values
    """
    df = load_gene_statistics(file_path)
    if df is not None:
        # Reindex to include only selected genes, filling any missing genes with 0
        df = df.reindex(selected_genes, fill_value=0)
        return sample_name, df['Sum'], df['Mean'], df['Variance']
    else:
        print(f"Skipping sample {sample_name} due to loading error.")
        return sample_name, None, None, None

In [35]:
# Main function to aggregate gene statistics across samples
def aggregate_gene_statistics(input_dir, num_genes, num_workers=4):
    sum_dict = {}
    mean_dict = {}
    variance_dict = {}
    all_genes = set()
    sample_order = []  # List to store samples in input order

    # Collect all unique genes across samples
    print("Identifying all unique genes...")
    for file_name in os.listdir(input_dir):
        if file_name.endswith(".csv"):
            file_path = os.path.join(input_dir, file_name)
            print(f"Found CSV file: {file_name}")
            df = load_gene_statistics(file_path)
            if df is not None:
                all_genes.update(df.index)

    # Determine selected genes based on user input
    if num_genes == "all":
        selected_genes = all_genes  # Process all genes
        print(f"Processing all {len(selected_genes)} genes.")
    else:
        selected_genes = set(random.sample(list(all_genes), int(num_genes)))  # Process a subset of genes
        print(f"Processing {num_genes} random genes: {selected_genes}")

    # Process files in parallel
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        for file_name in sorted(os.listdir(input_dir)):  # Sorted to maintain order
            if file_name.endswith(".csv"):
                sample_name = file_name.replace('.csv', '')
                sample_order.append(sample_name)  # Add sample to list in input order
                file_path = os.path.join(input_dir, file_name)
                print(f"Processing sample: {sample_name}")
                futures.append(executor.submit(process_file, file_path, selected_genes, sample_name))
        
        # Collect results as they complete
        for future in as_completed(futures):
            sample_name, sum_series, mean_series, variance_series = future.result()
            if sum_series is not None:
                sum_dict[sample_name] = sum_series
                mean_dict[sample_name] = mean_series
                variance_dict[sample_name] = variance_series

    print(f"Total samples processed: {len(sum_dict)}")

    # Create DataFrames from dictionaries
    sum_df = pd.DataFrame(sum_dict).fillna(0)
    mean_df = pd.DataFrame(mean_dict).fillna(0)
    variance_df = pd.DataFrame(variance_dict).fillna(0)

    # Reorder columns based on sample_order list
    sum_df = sum_df[sample_order]
    mean_df = mean_df[sample_order]
    variance_df = variance_df[sample_order]
    
    return sum_df, mean_df, variance_df

In [36]:
# Main script to aggregate gene statistics and save output
try:
    # Specify the input and output directories
    input_dir = 'gene_statistics'
    output_dir = 'gene_statistics_across_samples'

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Ask user for the number of genes to process
    num_genes = input("Enter the number of genes to process (type 'all' for all genes): ").strip()

    # Call the aggregation function with parallel processing
    sum_df, mean_df, variance_df = aggregate_gene_statistics(input_dir, num_genes=num_genes, num_workers=8)

    # Save each table to a CSV file in the specified output directory
    sum_df.to_csv(os.path.join(output_dir, 'gene_sums_across_samples.csv'))
    mean_df.to_csv(os.path.join(output_dir, 'gene_means_across_samples.csv'))
    variance_df.to_csv(os.path.join(output_dir, 'gene_variances_across_samples.csv'))

    # Print a success message
    print("Aggregated tables created and saved successfully in the 'gene_statistics_across_samples' folder.")

except Exception as e:
    # Catch any errors during processing
    print(f"An error occurred during processing: {e}")

Identifying all unique genes...
Found CSV file: C3N-02783_1.csv
Found CSV file: C3N-01904_1.csv
Found CSV file: C3N-02188_1.csv
Found CSV file: C3N-03188_1.csv
Found CSV file: C3L-00606_3.csv
Found CSV file: C3L-00606_2.csv
Found CSV file: C3L-00606_1.csv
Found CSV file: C3N-02190_1.csv
Found CSV file: C3N-02784_1.csv
Found CSV file: C3L-02858_1.csv
Found CSV file: C3L-01953_1.csv
Found CSV file: C3N-01175_1.csv
Found CSV file: C3N-00148_1.csv
Found CSV file: C3N-01816_1.csv
Found CSV file: C3L-02705_1.csv
Found CSV file: C3N-01814_1.csv
Found CSV file: C3N-00148_3.csv
Found CSV file: C3N-00148_2.csv
Found CSV file: C3N-02181_1.csv
Found CSV file: C3L-01287_1.csv
Found CSV file: C3N-01270_1.csv
Found CSV file: C3N-00148_4.csv
Found CSV file: C3L-01287_2.csv
Found CSV file: C3N-01798_1.csv
Found CSV file: C3N-01815_1.csv
Found CSV file: C3N-00149_3.csv
Found CSV file: C3L-03405_1.csv
Found CSV file: C3N-03184_1.csv
Found CSV file: C3N-00149_2.csv
Found CSV file: C3N-00149_1.csv
Found CS