In [2]:
import pandas as pd
import glob
import os

def extract_sample_name(filename):
    """Extract the SRR ID from the filename."""
    base = os.path.basename(filename)
    # Extract everything before 'coord_sorted'
    return base.split('coord_sorted')[0]

def get_global_metrics(df):
    """Extract global metrics from the dataframe."""
    global_metrics = {}
    for index, row in df.iterrows():
        if '_global' in row['Metric']:
            # Remove '_global' from the metric name to make cleaner column headers
            metric_name = row['Metric'].replace('_global', '')
            global_metrics[metric_name] = row['Score']
    return global_metrics

# Create an empty list to store results
results = []

# Get all CSV files in the directory
csv_files = glob.glob('/data1/Jack/projects/riboseqorg-nf/data/Processed/RiboMetric/*RiboMetric.csv')

print(csv_files)
# Process each file
for csv_file in csv_files:        # Read the CSV file
    df = pd.read_csv(csv_file)
    
    # Get sample name
    sample_name = extract_sample_name(csv_file)
    
    # Get global metrics
    global_metrics = get_global_metrics(df)
    
    # Add sample name to metrics
    global_metrics['Sample'] = sample_name
    
    # Append to results
    results.append(global_metrics)
        
# Create final dataframe
final_df = pd.DataFrame(results)

# Set Sample as index
final_df.set_index('Sample', inplace=True)

# Sort index for better organization
final_df.sort_index(inplace=True)

# Save to CSV
final_df.to_csv('global_metrics_summary.csv')

# Display first few rows of the dataframe
print("\nFirst few rows of the processed data:")
print(final_df.head())

# Display shape of the dataframe
print(f"\nTotal number of samples: {final_df.shape[0]}")
print(f"Total number of metrics: {final_df.shape[1]}")

['/data1/Jack/projects/riboseqorg-nf/data/Processed/RiboMetric/SRR11005887coord_sorted_RiboMetric.csv', '/data1/Jack/projects/riboseqorg-nf/data/Processed/RiboMetric/SRR7241913coord_sorted_RiboMetric.csv', '/data1/Jack/projects/riboseqorg-nf/data/Processed/RiboMetric/SRR6337304coord_sorted_RiboMetric.csv', '/data1/Jack/projects/riboseqorg-nf/data/Processed/RiboMetric/SRR8197271coord_sorted_RiboMetric.csv', '/data1/Jack/projects/riboseqorg-nf/data/Processed/RiboMetric/SRR1585523coord_sorted_RiboMetric.csv', '/data1/Jack/projects/riboseqorg-nf/data/Processed/RiboMetric/SRR13884549coord_sorted_RiboMetric.csv', '/data1/Jack/projects/riboseqorg-nf/data/Processed/RiboMetric/SRR11294609coord_sorted_RiboMetric.csv', '/data1/Jack/projects/riboseqorg-nf/data/Processed/RiboMetric/DRR277031coord_sorted_RiboMetric.csv', '/data1/Jack/projects/riboseqorg-nf/data/Processed/RiboMetric/SRR10868252coord_sorted_RiboMetric.csv', '/data1/Jack/projects/riboseqorg-nf/data/Processed/RiboMetric/SRR10957156coord

In [5]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

# Read the data
df = pd.read_csv('global_metrics_summary.csv', index_col='Sample')

df.drop(columns="uniformity_theil_index")
# Set up the style
sns.set_palette("husl")

# 1. Distribution Plots
def plot_distributions(df):
    """Create distribution plots for all metrics"""
    n_metrics = df.shape[1]
    n_cols = 3
    n_rows = (n_metrics + n_cols - 1) // n_cols
    
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4*n_rows))
    axes = axes.flatten()
    
    for i, column in enumerate(df.columns):
        if column in ['uniformity_theil_index']:
            continue
        sns.histplot(data=df, x=column, kde=True, ax=axes[i])
        axes[i].set_title(f'Distribution of {column}')
        axes[i].tick_params(axis='x', rotation=45)
    
    # Remove empty subplots if any
    for j in range(i+1, len(axes)):
        fig.delaxes(axes[j])
    
    plt.tight_layout()
    plt.savefig('metric_distributions.png')
    plt.close()

# 2. Correlation Heatmap
def plot_correlation_heatmap(df):
    """Create a correlation heatmap"""
    plt.figure(figsize=(12, 10))
    
    # Calculate correlations
    corr = df.corr()
    
    # Create mask for upper triangle
    mask = np.triu(np.ones_like(corr, dtype=bool))
    
    # Create heatmap
    sns.heatmap(corr, 
                mask=mask,
                cmap='RdBu_r',
                vmin=-1, 
                vmax=1, 
                center=0,
                square=True,
                annot=True,
                fmt='.2f',
                cbar_kws={"shrink": .5})
    
    plt.title('Correlation between Metrics')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig('correlation_heatmap.png')
    plt.close()

# 3. Pairplot for selected metrics
def plot_selected_pairs(df):
    """Create pairplot for selected important metrics"""
    # Select metrics that are likely most important
    selected_metrics = [
        'prop_reads_CDS',
        'prop_reads_leader',
        'prop_reads_trailer',
        'periodicity_dominance',
        'periodicity_trips-viz'
    ]
    
    sns.pairplot(df[selected_metrics], 
                 diag_kind='kde',
                 plot_kws={'alpha': 0.6})
    plt.tight_layout()
    plt.savefig('selected_pairs_plot.png')
    plt.close()

# 4. Box plots
def plot_boxplots(df):
    """Create box plots for all metrics"""
    plt.figure(figsize=(15, 6))
    
    # Melt the dataframe for boxplot
    df_melted = df.melt()
    
    sns.boxplot(x='variable', y='value', data=df_melted)
    plt.xticks(rotation=90)
    plt.xlabel('Metrics')
    plt.ylabel('Values')
    plt.title('Distribution of Values Across Metrics')
    plt.tight_layout()
    plt.savefig('metric_boxplots.png')
    plt.close()

# 5. Scatter plot matrix of related metrics
def plot_related_metrics(df):
    """Create scatter plots for related metric groups"""
    # Define groups of related metrics
    metric_groups = {
        'Proportion Metrics': ['prop_reads_CDS', 'prop_reads_leader', 'prop_reads_trailer'],
        'Ratio Metrics': ['ratio_cds:leader', 'ratio_cds:trailer', 'ratio_leader:trailer'],
        'Periodicity Metrics': ['periodicity_autocorrelation', 'periodicity_fourier', 
                              'periodicity_information', 'periodicity_trips-viz', 'periodicity_dominance'],
        'Uniformity Metrics': ['uniformity_autocorrelation', 'uniformity_entropy', 
                             'uniformity_theil_index', 'uniformity_gini_index']
    }
    
    for group_name, metrics in metric_groups.items():
        if len(metrics) > 1:  # Only plot if there are at least 2 metrics in the group
            g = sns.PairGrid(df[metrics])
            g.map_upper(sns.scatterplot, alpha=0.4)
            g.map_lower(sns.kdeplot)
            g.map_diag(sns.histplot, kde=True)
            plt.suptitle(f'{group_name} Relationships', y=1.02)
            plt.tight_layout()
            plt.savefig(f'{group_name.lower().replace(" ", "_")}_relationships.png')
            plt.close()

# Create summary statistics
def generate_summary_stats(df):
    """Generate summary statistics for all metrics"""
    summary_stats = df.describe()
    summary_stats.to_csv('metric_summary_statistics.csv')
    
    # Calculate skewness and kurtosis
    skew_kurt = pd.DataFrame({
        'Skewness': df.skew(),
        'Kurtosis': df.kurtosis()
    })
    skew_kurt.to_csv('metric_skew_kurtosis.csv')

# Run all visualizations


In [6]:
print("Generating visualizations...")
plot_distributions(df)
print("✓ Distribution plots generated")

plot_correlation_heatmap(df)
print("✓ Correlation heatmap generated")

plot_selected_pairs(df)
print("✓ Selected pairs plot generated")

plot_boxplots(df)
print("✓ Box plots generated")

plot_related_metrics(df)
print("✓ Related metrics plots generated")

generate_summary_stats(df)
print("✓ Summary statistics generated")

print("\nAll visualizations have been saved as PNG files.")

Generating visualizations...
✓ Distribution plots generated
✓ Correlation heatmap generated
✓ Selected pairs plot generated
✓ Box plots generated


: 