# CAFA-6 Baseline EDA

Exploratory analysis of the protein function prediction dataset.

In [1]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Add project root to path
sys.path.insert(0, str(Path.cwd().parent / '05_model'))

from src.data.prepare import load_sequences, load_terms, load_taxonomy, load_ia_weights, load_go

%matplotlib inline
sns.set_style('whitegrid')

ModuleNotFoundError: No module named 'seaborn'

## Load Data

In [None]:
# Dataset location, resolved relative to the parent of the working directory.
data_root = Path.cwd().parent.joinpath('cafa-6-protein-function-prediction')
train_root = data_root.joinpath('Train')

# Read every input through the project loaders before reporting anything.
sequences = load_sequences(train_root.joinpath('train_sequences.fasta'))
terms_df = load_terms(train_root.joinpath('train_terms.tsv'))
taxonomy_df = load_taxonomy(train_root.joinpath('train_taxonomy.tsv'))
ia_weights = load_ia_weights(data_root.joinpath('IA.tsv'))
go_graph = load_go(train_root.joinpath('go-basic.obo'))

# One-line size summary per loaded object.
print(
    f"Sequences: {len(sequences):,}",
    f"Term annotations: {len(terms_df):,}",
    f"Taxonomy entries: {len(taxonomy_df):,}",
    f"IA weights: {len(ia_weights):,}",
    f"GO graph nodes: {len(go_graph.nodes):,}",
    sep="\n",
)

## Sequence Statistics

In [None]:
# Length of every loaded sequence (one entry per value in `sequences`).
seq_lengths = [len(seq) for seq in sequences.values()]

# Side-by-side views of the same data: histogram (left) and box plot (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# Histogram
axes[0].hist(seq_lengths, bins=50, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Sequence Length')
axes[0].set_ylabel('Count')
axes[0].set_title('Distribution of Protein Sequence Lengths')

# Box plot
# `vert=True` was dropped: vertical is already the default orientation, and the
# `vert` keyword is deprecated in recent Matplotlib in favour of `orientation`.
axes[1].boxplot(seq_lengths)
axes[1].set_ylabel('Sequence Length')
axes[1].set_title('Sequence Length Distribution')

plt.tight_layout()
plt.show()

# Summary statistics for the same lengths.
print(f"Min length: {min(seq_lengths)}")
print(f"Max length: {max(seq_lengths)}")
print(f"Mean length: {np.mean(seq_lengths):.1f}")
print(f"Median length: {np.median(seq_lengths):.1f}")

## Term Distribution by Ontology

In [None]:
# Number of annotation rows per value of the 'namespace' column.
ontology_counts = terms_df['namespace'].value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
# `rot=0` keeps the category labels horizontal. This replaces the previous
# set_xticklabels(get_xticklabels(), rotation=0) round trip, which re-installed
# the existing labels just to change their rotation.
ontology_counts.plot(kind='bar', ax=ax, edgecolor='black', alpha=0.7, rot=0)
ax.set_xlabel('Ontology')
ax.set_ylabel('Number of Annotations')
ax.set_title('Term Annotations by Ontology')
plt.tight_layout()
plt.show()

print("\nAnnotations per ontology:")
print(ontology_counts)

## Terms per Protein

In [None]:
# Count of annotation rows for each distinct 'protein_id'.
terms_per_protein = terms_df.groupby('protein_id').size()

# Histogram of those per-protein counts.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(terms_per_protein, bins=50, edgecolor='black', alpha=0.7)
ax.set(
    xlabel='Number of Terms per Protein',
    ylabel='Count',
    title='Distribution of Term Annotations per Protein',
)
plt.tight_layout()
plt.show()

# Summary statistics of the per-protein counts.
print(
    f"Min terms per protein: {terms_per_protein.min()}",
    f"Max terms per protein: {terms_per_protein.max()}",
    f"Mean terms per protein: {terms_per_protein.mean():.1f}",
    f"Median terms per protein: {terms_per_protein.median():.1f}",
    sep="\n",
)

## Most Frequent Terms

In [None]:
# The 20 most common values of 'term_id', in descending order of frequency.
top_terms = terms_df['term_id'].value_counts().head(20)

fig, ax = plt.subplots(figsize=(10, 6))
top_terms.plot(kind='barh', ax=ax, edgecolor='black', alpha=0.7)
# A horizontal bar chart draws the first row at the bottom; flip the axis so the
# most frequent term is at the top and the ranking reads top-down.
ax.invert_yaxis()
ax.set_xlabel('Number of Proteins')
ax.set_ylabel('GO Term ID')
ax.set_title('Top 20 Most Frequent GO Terms')
plt.tight_layout()
plt.show()

print("\nTop 10 most frequent terms:")
print(top_terms.head(10))

## Information Accretion Weights

In [None]:
# Histogram of the loaded IA weights.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(ia_weights.values, bins=50, edgecolor='black', alpha=0.7)
ax.set(
    xlabel='Information Accretion Weight',
    ylabel='Count',
    title='Distribution of IA Weights Across GO Terms',
)
plt.tight_layout()
plt.show()

# Summary statistics of the weights, to three decimals.
print(
    f"Min IA weight: {ia_weights.min():.3f}",
    f"Max IA weight: {ia_weights.max():.3f}",
    f"Mean IA weight: {ia_weights.mean():.3f}",
    f"Median IA weight: {ia_weights.median():.3f}",
    sep="\n",
)

## Taxonomy Distribution

In [None]:
# The 15 most common values of 'taxon_id', in descending order of frequency.
top_taxa = taxonomy_df['taxon_id'].value_counts().head(15)

fig, ax = plt.subplots(figsize=(10, 6))
top_taxa.plot(kind='barh', ax=ax, edgecolor='black', alpha=0.7)
# A horizontal bar chart draws the first row at the bottom; flip the axis so the
# most represented taxon is at the top and the ranking reads top-down.
ax.invert_yaxis()
ax.set_xlabel('Number of Proteins')
ax.set_ylabel('Taxon ID')
ax.set_title('Top 15 Most Represented Taxa')
plt.tight_layout()
plt.show()

print(f"\nTotal unique taxa: {taxonomy_df['taxon_id'].nunique()}")
print("\nTop 10 taxa:")
print(top_taxa.head(10))