In [None]:
# Feature Engineering for Protein Function Classifier
#
# Setup cell: imports the data/plotting stack and the project's feature
# helpers, then selects the Matplotlib style used by the figures below.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Put the parent directory on the import path so the `src` package can be
# resolved. The entry is relative to the current working directory and has to
# be added before the `src.features` import below.
# NOTE(review): assumes the kernel is started one level below the directory
# that contains `src/` — confirm.
sys.path.append('..')

# Project-local feature helpers. Of these, only create_feature_dataframe is
# called in the cells visible in this notebook section.
from src.features import (
    create_feature_dataframe,
    get_feature_names,
    compute_amino_acid_composition,
    compute_dipeptide_composition,
    compute_physicochemical_properties
)

# NOTE(review): the 'seaborn-v0_8-*' style names require a recent Matplotlib
# release (3.6+ per the Matplotlib docs) — confirm against the environment.
plt.style.use('seaborn-v0_8-whitegrid')
print("Imports complete!")

In [None]:
# Read the cleaned enzyme dataset from disk and report how many rows it has
cleaned_csv = '../data/processed/cleaned_enzymes.csv'
df = pd.read_csv(cleaned_csv)
n_sequences = len(df)
print(f"Loaded {n_sequences} sequences")

# Build the feature table from the 'sequence' column
# (presumably one output row per input row — verify in src.features)
feature_df = create_feature_dataframe(df, sequence_column='sequence')

# Attach the target. The raw array is assigned positionally (no index
# alignment), so feature_df rows are taken to be in the same order as df.
ec_labels = df['ec_class'].values
feature_df['ec_class'] = ec_labels

print(f"\nFeature matrix shape: {feature_df.shape}")
feature_df.head()

In [None]:
# Sanity-check the feature table before any analysis: missing values,
# infinities, overall value range, and the size of each feature group.
print("Feature Statistics:")

# Check for NaN values (every column, including the target)
nan_counts = feature_df.isnull().sum().sum()
print(f"Total NaN values: {nan_counts}")

# Select the numeric view once and reuse it for the checks below
numeric_df = feature_df.select_dtypes(include=[np.number])
numeric_cols = numeric_df.columns

# Check for infinite values
inf_counts = np.isinf(numeric_df).sum().sum()
print(f"Total infinite values: {inf_counts}")

# Basic statistics.
# The target label is left out of the range so that the reported min/max
# describe features only; errors='ignore' covers the case where 'ec_class'
# is not numeric and therefore not in numeric_cols to begin with.
value_cols = numeric_cols.drop('ec_class', errors='ignore')
print("\nFeature value ranges:")
print(f"  Min value: {feature_df[value_cols].min().min():.4f}")
print(f"  Max value: {feature_df[value_cols].max().max():.4f}")

# Display summary of different feature groups
print("\n\nFeature Groups Summary:")
print("-" * 50)

# Groups are identified by column-name prefix; anything numeric that is
# neither AAC_* nor DPC_* (and is not the target) counts as "other".
aac_cols = [c for c in feature_df.columns if c.startswith('AAC_')]
dpc_cols = [c for c in feature_df.columns if c.startswith('DPC_')]
other_cols = [c for c in value_cols if not c.startswith(('AAC_', 'DPC_'))]

print(f"Amino Acid Composition (AAC): {len(aac_cols)} features")
print(f"Dipeptide Composition (DPC): {len(dpc_cols)} features")
print(f"Other features: {len(other_cols)} features")

In [None]:
# Analyze physicochemical features correlation with target
physchem_features = [
    'avg_hydrophobicity', 'std_hydrophobicity',
    'avg_molecular_weight', 'total_molecular_weight',
    'frac_polar', 'frac_nonpolar',
    'net_charge', 'frac_positive', 'frac_negative',
    'log_length', 'avg_helix_prop', 'std_helix_prop',
    'avg_sheet_prop', 'std_sheet_prop',
    'frac_unique_aa', 'sequence_entropy', 'dipeptide_repetitiveness'
]

# Pairwise correlations between the selected features and the target column.
# NOTE(review): ec_class looks like a class label (EC 1-7); a linear
# correlation against a categorical code is only a rough guide — confirm
# this is the intended reading of the last row/column.
physchem_df = feature_df[physchem_features + ['ec_class']]
corr_matrix = physchem_df.corr()

# Create correlation heatmap for physicochemical features
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix: Physicochemical Features', fontsize=14)
fig.tight_layout()

# savefig does not create missing directories, so create the output folder
# first; exist_ok keeps re-runs of this cell harmless.
os.makedirs('../figures', exist_ok=True)
fig.savefig('../figures/physicochemical_correlation.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Box Plot - Clear comparison between classes
key_features = ['avg_hydrophobicity', 'frac_polar', 'log_length',
                'sequence_entropy', 'avg_helix_prop', 'avg_sheet_prop']

# 2x3 grid, flattened so each panel can be paired with one feature
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Seven HUSL colours (presumably one per EC class, 1-7)
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue`; the call is left as-is to stay compatible with the installed
# version — confirm before changing.
colors = sns.color_palette("husl", 7)

for ax, feature in zip(axes, key_features):
    # Create box plot of this feature grouped by EC class
    sns.boxplot(x='ec_class', y=feature, data=feature_df, ax=ax, palette=colors)

    ax.set_xlabel('EC Class')
    ax.set_ylabel(feature)
    ax.set_title(f'Distribution of {feature}')

plt.tight_layout()

# savefig does not create missing directories, so create the output folder
# first; exist_ok keeps re-runs of this cell harmless.
os.makedirs('../figures', exist_ok=True)
plt.savefig('../figures/feature_distributions_boxplot.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Violin Plot - Shows distribution shape + summary stats
key_features = ['avg_hydrophobicity', 'frac_polar', 'log_length',
                'sequence_entropy', 'avg_helix_prop', 'avg_sheet_prop']

# 2x3 grid, flattened so each panel can be paired with one feature
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Seven HUSL colours (presumably one per EC class, 1-7)
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue`; the call is left as-is to stay compatible with the installed
# version — confirm before changing.
colors = sns.color_palette("husl", 7)

for ax, feature in zip(axes, key_features):
    # Create violin plot of this feature grouped by EC class
    sns.violinplot(x='ec_class', y=feature, data=feature_df, ax=ax, palette=colors)

    ax.set_xlabel('EC Class')
    ax.set_ylabel(feature)
    ax.set_title(f'Distribution of {feature}')

plt.tight_layout()

# savefig does not create missing directories, so create the output folder
# first; exist_ok keeps re-runs of this cell harmless.
os.makedirs('../figures', exist_ok=True)
plt.savefig('../figures/feature_distributions_violin.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Persist the full table (features plus the ec_class column) as CSV
complete_csv = '../data/processed/features_complete.csv'
feature_df.to_csv(complete_csv, index=False)
print(" Saved complete features to data/processed/features_complete.csv")
print(f"  Shape: {feature_df.shape}")

# Split into label vector (y) and feature matrix (X: every column except
# ec_class) and store both as .npy files for quick loading
y = feature_df['ec_class'].values
X = feature_df.drop(columns=['ec_class']).values

for npy_path, array in (('../data/processed/X_features.npy', X),
                        ('../data/processed/y_labels.npy', y)):
    np.save(npy_path, array)

# Final recap of what was written
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
print(f"""
Summary:
- Total samples: {n_samples}
- Total features: {n_features}
- Classes: {n_classes} (EC 1-7)

""")