# Univariate, Bivariate, and Multivariate Analysis
## Genomics Sequence Classification Dataset

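This notebook derives numeric features from the DNA sequences in a genomics classification dataset and explores them with univariate histograms, a bivariate correlation matrix, and a multivariate PCA projection.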


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: seaborn "whitegrid" theme and a 12x6 default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the raw dataset, report its dimensions, then preview the first rows
df = pd.read_csv('../../data/genomics_data.csv')
print("Dataset shape: {}".format(df.shape))
print("\nFirst few rows:")
df.head()


## Feature Engineering
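Derive numeric features from each DNA sequence (length, GC content, per-base frequencies, AA/AT dinucleotide frequencies, and Shannon entropy) for use in the analyses below.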


In [None]:
def extract_sequence_features(sequences):
    """Extract composition features from DNA sequences.

    Each item is upper-cased and summarised by its length, GC content,
    single-base frequencies (A/T/G/C), the frequencies of the overlapping
    2-mers 'AA' and 'AT', and the Shannon entropy (in bits) of its character
    distribution. Characters other than A/C/G/T still count towards the
    length and the entropy, but not towards any base frequency.

    Args:
        sequences: Iterable of sequences. Items are converted with ``str``;
            missing values (``None``, ``NaN``, ``pd.NA``) are treated as
            empty sequences instead of being stringified.

    Returns:
        A ``pd.DataFrame`` with one row per input item, in input order, and
        the columns ``length``, ``gc_content``, ``a_freq``, ``t_freq``,
        ``g_freq``, ``c_freq``, ``aa_freq``, ``at_freq`` and ``entropy``.
        An empty sequence yields 0 for every feature.
    """
    from collections import Counter

    features = []

    for seq in sequences:
        # A missing value would otherwise be stringified ("NONE", "NAN",
        # "<NA>") and scored as if it were real bases, so treat it as an
        # empty sequence.
        is_missing = (
            seq is None
            or seq is pd.NA
            or (isinstance(seq, float) and np.isnan(seq))
        )
        seq = '' if is_missing else str(seq).upper()
        length = len(seq)

        # One pass over the sequence yields every single-character count;
        # a Counter returns 0 for characters that never occur.
        base_counts = Counter(seq)
        gc_content = (base_counts['G'] + base_counts['C']) / length if length > 0 else 0
        a_freq = base_counts['A'] / length if length > 0 else 0
        t_freq = base_counts['T'] / length if length > 0 else 0
        g_freq = base_counts['G'] / length if length > 0 else 0
        c_freq = base_counts['C'] / length if length > 0 else 0

        # Overlapping 2-mer frequencies. A sequence shorter than two
        # characters has no 2-mers, so divide by 1 to keep the result at 0.
        kmer_counts = Counter(seq[i:i + 2] for i in range(length - 1))
        total_kmers = sum(kmer_counts.values()) if kmer_counts else 1
        aa_freq = kmer_counts['AA'] / total_kmers
        at_freq = kmer_counts['AT'] / total_kmers

        # Shannon entropy in bits; an empty sequence has no counts, so the
        # sum is 0 and no division by the zero length takes place.
        entropy = -sum(
            (count / length) * np.log2(count / length)
            for count in base_counts.values()
        )

        features.append({
            'length': length, 'gc_content': gc_content,
            'a_freq': a_freq, 't_freq': t_freq, 'g_freq': g_freq, 'c_freq': c_freq,
            'aa_freq': aa_freq, 'at_freq': at_freq, 'entropy': entropy
        })

    return pd.DataFrame(features)

# Compute the per-sequence features, then place them alongside the raw columns
feature_df = extract_sequence_features(df['Sequences'])
df_features = pd.concat((df, feature_df), axis='columns')
print('Features extracted!')
df_features.head()


## 1. UNIVARIATE ANALYSIS
Analyzing individual variables in isolation


In [None]:
# Univariate Analysis - Continuous Variables
# One histogram per numeric feature, with the mean and median marked.
numeric_cols = ['length', 'gc_content', 'a_freq', 't_freq', 'g_freq', 'c_freq', 'entropy']

# A 2x4 grid provides 8 panels; panels without a feature are hidden below.
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.ravel()

for idx, col in enumerate(numeric_cols):
    ax = axes[idx]
    # Compute each summary statistic once and reuse it for the line and label.
    col_mean = df_features[col].mean()
    col_median = df_features[col].median()
    ax.hist(df_features[col], bins=30, alpha=0.7, edgecolor='black')
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.axvline(col_mean, color='red', linestyle='--', label=f'Mean: {col_mean:.3f}')
    ax.axvline(col_median, color='green', linestyle='--', label=f'Median: {col_median:.3f}')
    ax.legend()

# Hide panels that received no feature so the saved figure does not contain
# an empty framed subplot.
for unused_ax in axes[len(numeric_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.savefig('../../results/univariate_continuous.png', dpi=300, bbox_inches='tight')
plt.show()


## 2. BIVARIATE ANALYSIS
Analyzing relationships between pairs of variables


In [None]:
# Bivariate Analysis - Correlation Matrix
# Pairwise correlations between the numeric feature columns
correlation_matrix = df_features.loc[:, numeric_cols].corr()

# Heatmap appearance: annotated cells, diverging palette centred on zero
heatmap_style = {
    'annot': True,
    'fmt': '.3f',
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 1,
    'cbar_kws': {"shrink": 0.8},
}

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_style)
plt.title(
    'Correlation Matrix - Feature Relationships',
    fontsize=16,
    fontweight='bold',
    pad=20,
)
plt.tight_layout()
plt.savefig('../../results/bivariate_correlation.png', dpi=300, bbox_inches='tight')
plt.show()


## 3. MULTIVARIATE ANALYSIS
Analyzing relationships among multiple variables simultaneously


In [None]:
# Multivariate Analysis - PCA
# Feature matrix and label vector as arrays
X, y = df_features[numeric_cols].values, df_features['Labels'].values

# Standardize the features before the projection
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project the standardized features onto two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Exactly two ratios exist because n_components=2
pc1_ratio, pc2_ratio = pca.explained_variance_ratio_

# Scatter of the two components, coloured by label
plt.figure(figsize=(12, 8))
scatter = plt.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=y,
    cmap='viridis',
    alpha=0.6,
    edgecolors='black',
)
plt.colorbar(scatter, label='Label')
plt.xlabel(f'First Principal Component (Explained Variance: {pc1_ratio:.2%})')
plt.ylabel(f'Second Principal Component (Explained Variance: {pc2_ratio:.2%})')
plt.title('PCA - Multivariate Analysis', fontsize=16, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('../../results/multivariate_pca.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Total Explained Variance: {pc1_ratio + pc2_ratio:.2%}")
