# Notebook cell In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gzip

def read_mapped_bed(input_file):
    """Load a gzipped, tab-separated mapping file and derive V-plot coordinates.

    The columns at zero-based positions 2, 3, 8 and 9 are read as the start
    and end of the binding site and the start and end of the fragment.

    Args:
        input_file: Path to the gzip-compressed, header-less input file.

    Returns:
        A DataFrame with two columns, or ``None`` if the file is missing or
        cannot be processed (the reason is printed):
            X: fragment center minus binding-site center.
            Y: fragment length (``frag_end - frag_start``); rows with a
               negative length are dropped.
    """
    column_names = ['chr1_start', 'chr1_end', 'frag_start', 'frag_end']

    try:
        # Fail early with a readable message when the path does not exist.
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Error: Input file '{input_file}' not found.")

        # Decompress on the fly and parse only the four coordinate columns.
        with gzip.open(input_file, 'rt') as handle:
            table = pd.read_csv(
                handle,
                sep='\t',
                header=None,
                usecols=[2, 3, 8, 9],
                names=column_names,
            )

        # Midpoints of the binding site and of the fragment.
        site_center = (table['chr1_start'] + table['chr1_end']) / 2
        fragment_center = (table['frag_start'] + table['frag_end']) / 2

        coords = pd.DataFrame({
            'X': fragment_center - site_center,
            'Y': table['frag_end'] - table['frag_start'],
        })

        # Keep only rows whose fragment length is non-negative.
        return coords[coords['Y'] >= 0]

    except FileNotFoundError as err:
        print(err)
    except Exception as err:
        print(f"Error processing file: {str(err)}")

    # Reached only after one of the handlers above has reported a failure.
    return None

def create_vplot(data, output_file):
    """Render a V-plot heatmap of fragment counts and save it as an image.

    Fragments are binned by relative position ``X`` (-1000 to 1000 bp,
    10 bp bins) and fragment length ``Y`` (0 to 500 bp, 5 bp bins). Points
    outside those ranges are not counted by the histogram.

    Args:
        data: DataFrame with numeric ``X`` and ``Y`` columns, or ``None``.
        output_file: Path the figure is written to (saved at 300 dpi).

    Returns:
        None. When ``data`` is ``None`` or empty, prints "No data to plot."
        and writes nothing.
    """
    if data is None or data.empty:
        print("No data to plot.")
        return

    # 2D histogram of (X, Y); hist has shape (len(x_bins) - 1, len(y_bins) - 1).
    x_bins = np.linspace(-1000, 1000, 201)  # 200 bins of 10 bp
    y_bins = np.linspace(0, 500, 101)       # 100 bins of 5 bp
    hist, x_edges, y_edges = np.histogram2d(
        data['X'], data['Y'],
        bins=[x_bins, y_bins]
    )

    # Use an explicit figure/axes pair instead of pyplot's implicit
    # "current axes", and guarantee the figure is released even if drawing
    # or saving fails (otherwise figures accumulate across calls).
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.heatmap(hist.T,  # Transpose so Y runs along the vertical axis
                    cmap='viridis',
                    norm=plt.Normalize(vmin=0, vmax=np.max(hist)),  # Scale colors
                    cbar_kws={'label': 'Fragment Count (Z)'},
                    ax=ax)

        ax.set_xlabel('Relative Position (X, bp)')
        ax.set_ylabel('Fragment Length (Y, bp)')
        ax.set_title('V-Plot: Fragment Distribution Around Protein Binding Sites')

        # seaborn draws heatmaps with row 0 at the top; inverting again puts
        # fragment length 0 at the bottom so length increases upwards.
        ax.invert_yaxis()

        # Heatmap cell i spans [i, i + 1], so position i is bin edge i.
        x_tick_positions = np.arange(0, len(x_edges), 20)
        ax.set_xticks(x_tick_positions)
        ax.set_xticklabels([int(x_edges[i]) for i in x_tick_positions],
                           rotation=45)
        y_tick_positions = np.arange(0, len(y_edges), 10)
        ax.set_yticks(y_tick_positions)
        ax.set_yticklabels([int(y_edges[i]) for i in y_tick_positions])

        fig.tight_layout()
        fig.savefig(output_file, dpi=300)
    finally:
        plt.close(fig)

    print(f"V-plot saved to '{output_file}'.")

def main():
    """Read 'mapped.bed.gz' and write its V-plot to 'vplot.png'."""
    # read_mapped_bed returns None on failure; create_vplot handles that
    # case by printing a message instead of plotting.
    fragments = read_mapped_bed('mapped.bed.gz')
    create_vplot(fragments, 'vplot.png')

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()

# Output: V-plot saved to 'vplot.png'.
