# Variant Annotation Notebook

This notebook demonstrates annotation steps on synthetic genomic variant data.

**Note:** This uses synthetic data. For real data, you would use tools like ANNOVAR, VEP, or SnpEff.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' style to the figures drawn later in this notebook
sns.set_style('whitegrid')
# IPython line magic: render matplotlib figures inline in the notebook output
%matplotlib inline

## Load Synthetic Variant Data

In [None]:
# Read the synthetic variant table from CSV into a DataFrame
variants_path = '../data/synthetic/variants.csv'
variants_df = pd.read_csv(variants_path)

# Report the row count, then preview the first rows as the cell output
print(f"Loaded {len(variants_df)} variants")
variants_df.head()

## Variant Statistics

In [None]:
# Tally how many variants are recorded for each gene
print("\nVariants per gene:")
gene_counts = variants_df['gene'].value_counts()
print(gene_counts)

In [None]:
# Bar chart: number of variants carrying each pathogenicity label
fig, ax = plt.subplots(figsize=(10, 6))
pathogenicity_counts = variants_df['pathogenicity'].value_counts()
pathogenicity_counts.plot(kind='bar', ax=ax)

ax.set_title('Pathogenicity Distribution')
ax.set_xlabel('Pathogenicity')
ax.set_ylabel('Count')

# Tilt the category labels so longer names do not overlap
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

## Annotation Process

### Real-World Annotation Steps:

1. **Functional Annotation**
   - Gene name and transcript
   - Variant consequence (missense, nonsense, frameshift, etc.)
   - Amino acid change

2. **Population Frequency**
   - gnomAD allele frequencies
   - 1000 Genomes data
   - ExAC database (legacy; its data are now part of gnomAD)

3. **Pathogenicity Prediction**
   - CADD scores
   - PolyPhen-2
   - SIFT
   - REVEL

4. **Clinical Databases**
   - ClinVar pathogenicity
   - OMIM associations
   - HGMD mutations

5. **Conservation**
   - PhyloP scores
   - GERP scores

In [None]:
# Simulated annotation: add severity scores
def annotate_severity(row):
    """Compute an additive severity score for a single variant record.

    Points are awarded independently for three fields of ``row``:

    * ``pathogenicity``: 'pathogenic' -> 3, 'likely_pathogenic' -> 2,
      'VUS' -> 1, any other value -> 0
    * ``impact``: 'HIGH' -> 2, 'MODERATE' -> 1, any other value -> 0
    * ``cadd_score``: strictly greater than 20 -> 1, otherwise 0

    Args:
        row: Record supporting ``row[key]`` lookup for the three fields
            above (for example a DataFrame row passed by ``apply``).

    Returns:
        int: Sum of the awarded points, from 0 to 6.
    """
    pathogenicity_points = (
        ('pathogenic', 3),
        ('likely_pathogenic', 2),
        ('VUS', 1),
    )
    impact_points = (
        ('HIGH', 2),
        ('MODERATE', 1),
    )

    total = 0

    # Labels are checked in order and only the first match contributes.
    for label, points in pathogenicity_points:
        if row['pathogenicity'] == label:
            total += points
            break

    for label, points in impact_points:
        if row['impact'] == label:
            total += points
            break

    # One extra point when the CADD score is strictly above the cutoff.
    total += 1 if row['cadd_score'] > 20 else 0

    return total

# Score every variant row by row and store the result as a new column
variants_df['severity_score'] = variants_df.apply(annotate_severity, axis=1)

# Tabulate how many variants received each score, ordered by score value
print("\nSeverity score distribution:")
severity_counts = variants_df['severity_score'].value_counts()
print(severity_counts.sort_index())

## Filter Clinically Relevant Variants

In [None]:
# Keep only variants labelled pathogenic or likely pathogenic
clinically_relevant_labels = ['pathogenic', 'likely_pathogenic']
is_clinically_relevant = variants_df['pathogenicity'].isin(clinically_relevant_labels)
pathogenic_variants = variants_df[is_clinically_relevant]

# Report how many variants remain and how many distinct samples carry them
print(f"\nFound {len(pathogenic_variants)} pathogenic/likely pathogenic variants")
print(f"Affecting {pathogenic_variants['sample_id'].nunique()} samples")

pathogenic_variants.head()

## Key Hearing Loss Genes

In [None]:
# Restrict the analysis to a fixed list of hearing loss genes
key_genes = ['GJB2', 'SLC26A4', 'OTOF', 'MYO7A']
key_gene_variants = variants_df[variants_df['gene'].isin(key_genes)]

print(f"\nVariants in key hearing loss genes: {len(key_gene_variants)}")

# Derive ONE set of bin edges from the pooled key-gene scores. Passing
# bins=20 to each hist() call separately would let every gene pick edges from
# its own score range, so the overlaid bars would have different widths and
# their counts could not be compared.
cadd_bin_edges = np.histogram_bin_edges(
    key_gene_variants['cadd_score'].dropna(), bins=20
)

# Overlay one semi-transparent histogram per gene on the shared bins
plt.figure(figsize=(12, 6))
for gene in key_genes:
    # Reuse the already-filtered subset instead of re-scanning the full table
    gene_df = key_gene_variants[key_gene_variants['gene'] == gene]
    plt.hist(
        gene_df['cadd_score'].dropna(),
        bins=cadd_bin_edges,
        alpha=0.5,
        label=gene,
    )

plt.xlabel('CADD Score')
plt.ylabel('Count')
plt.title('CADD Score Distribution by Key Gene')
plt.legend()
plt.tight_layout()
plt.show()

## Export Annotated Variants

In [None]:
# Write the variant table to CSV; index=False leaves the row index out of the file
output_path = '../data/synthetic/annotated_variants.csv'
variants_df.to_csv(path_or_buf=output_path, index=False)

print(f"\nSaved annotated variants to {output_path}")

## Summary

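This notebook demonstrated:
- Loading synthetic variant data
- Summarizing variants per gene and by pathogenicity
- Simulating annotation with a rule-based severity score
- Filtering for clinically relevant (pathogenic/likely pathogenic) variants
- Examining CADD scores in key hearing loss genes
- Exporting the annotated variants to CSV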

For real data, use established annotation tools (e.g., ANNOVAR, VEP, or SnpEff) together with curated databases such as ClinVar and gnomAD.