# Data Exploration: ClinVar Non-coding Variants

This notebook explores the ClinVar non-coding variant dataset used for benchmarking genomic foundation models.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline
%load_ext autoreload
%autoreload 2

## Load Datasets

In [None]:
# Positive set: read the tab-separated pathogenic variant table,
# report its row count, and preview the first rows.
positive_df = pd.read_table('../data/processed/positive_noncoding_annotated.tsv')
print(f"Positive variants: {len(positive_df)}")
positive_df.head()

In [None]:
# Negative sets, both tab-separated: N1 (benign) and N3 (matched random).
negative_n1_df, negative_n3_df = (
    pd.read_table(tsv_path)
    for tsv_path in (
        '../data/processed/negative_N1_benign_annotated.tsv',
        '../data/processed/negative_N3_matched_random_annotated.tsv',
    )
)

# Row counts for a quick sanity check of each set.
print(f"Negative N1 (benign): {len(negative_n1_df)}")
print(f"Negative N3 (matched random): {len(negative_n3_df)}")

## Dataset Statistics

In [None]:
# Chromosomal distribution
def _chrom_sort_key(chrom):
    """Return a sort key that places chromosome labels in karyotype order.

    Labels may be ``str`` or numeric and may carry a ``chr`` prefix.
    Numbered chromosomes sort numerically first, then X, Y and M/MT,
    then any remaining label alphabetically.
    """
    # Integer-valued floats (e.g. 1.0) are treated as their integer label.
    if isinstance(chrom, float) and chrom.is_integer():
        chrom = int(chrom)
    label = str(chrom)
    if label.lower().startswith('chr'):
        label = label[3:]
    if label.isdigit():
        return (0, int(label), '')
    named_rank = {'X': 0, 'Y': 1, 'M': 2, 'MT': 2}
    upper = label.upper()
    if upper in named_rank:
        return (1, named_rank[upper], '')
    return (2, 0, label)


fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, (df, title) in zip(axes, [
    (positive_df, 'Positive (Pathogenic)'),
    (negative_n1_df, 'Negative N1 (Benign)'),
    (negative_n3_df, 'Negative N3 (Matched Random)')
]):
    chrom_counts = df['chrom'].value_counts()
    # sort_index() would order string labels lexicographically
    # (chr1, chr10, ..., chr2) and can raise TypeError on mixed int/str
    # labels, so order the bars by karyotype instead.
    chrom_counts = chrom_counts.reindex(
        sorted(chrom_counts.index, key=_chrom_sort_key)
    )
    chrom_counts.plot(kind='bar', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Chromosome')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Region category distribution (skipped when the column is absent)
if 'region_category' in positive_df:
    region_counts = positive_df['region_category'].value_counts()

    # Horizontal bars, one per region category.
    fig, ax = plt.subplots(figsize=(10, 6))
    region_counts.plot.barh(ax=ax, color='steelblue')
    ax.set(
        title='Distribution of Pathogenic Variants by Non-coding Region',
        xlabel='Number of Variants',
        ylabel='Region Category',
    )

    fig.tight_layout()
    plt.show()

    # Same counts as text, below the figure.
    print("\nRegion distribution:")
    print(region_counts)

## Reference and Alternate Allele Distribution

In [None]:
# Counts of each reference and alternate allele value in the positive set,
# drawn as two side-by-side bar charts.
ref_counts = positive_df['ref'].value_counts()
alt_counts = positive_df['alt'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = [
    (axes[0], ref_counts, 'coral', 'Reference Allele Distribution'),
    (axes[1], alt_counts, 'lightblue', 'Alternate Allele Distribution'),
]
for ax, counts, color, title in panels:
    counts.plot(kind='bar', ax=ax, color=color)
    ax.set_title(title)
    ax.set_xlabel('Nucleotide')
    ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

## Compare Positive vs Negative Sets

In [None]:
# Compare region categories between the pathogenic and matched-random sets.
# Runs only when both frames carry the 'region_category' column.
if all('region_category' in frame for frame in (positive_df, negative_n3_df)):
    # One single-column frame per dataset, tagged with its label.
    combined = [
        frame[['region_category']].assign(dataset=label)
        for frame, label in (
            (positive_df, 'Pathogenic'),
            (negative_n3_df, 'Matched Random'),
        )
    ]
    combined_df = pd.concat(combined, ignore_index=True)

    # Rows: region category; columns: dataset label; values: variant counts.
    region_comparison = pd.crosstab(combined_df['region_category'], combined_df['dataset'])

    # Grouped bar chart, one bar per dataset within each region category.
    fig, ax = plt.subplots(figsize=(12, 6))
    region_comparison.plot.bar(ax=ax)

    ax.set(
        title='Region Distribution: Pathogenic vs Matched Random',
        xlabel='Region Category',
        ylabel='Count',
    )
    ax.legend(title='Dataset')
    plt.xticks(rotation=45, ha='right')

    fig.tight_layout()
    plt.show()

## Summary Statistics

In [None]:
# Text summary: row counts of the three datasets, followed by the number of
# distinct region categories and chromosomes in the positive set.
print("=" * 60)
print("DATASET SUMMARY")
print("=" * 60)
print(f"\nPositive (pathogenic): {len(positive_df):,}")
print(f"Negative N1 (benign):  {len(negative_n1_df):,}")
print(f"Negative N3 (matched): {len(negative_n3_df):,}")

# Each statistic is guarded by its own column, so the chromosome count is
# still reported when 'region_category' is missing.
detail_lines = []
if 'region_category' in positive_df.columns:
    detail_lines.append(
        f"Unique region categories: {positive_df['region_category'].nunique()}"
    )
if 'chrom' in positive_df.columns:
    detail_lines.append(f"Unique chromosomes: {positive_df['chrom'].nunique()}")

if detail_lines:
    # Blank separator line, then one statistic per line.
    print()
    print("\n".join(detail_lines))

print("=" * 60)