# Feature Engineering - Temporal Features

This notebook extracts and analyzes temporal features from engagement time series data.

Features include:
- Rolling statistics (mean, std, min, max)
- Ratio features (likes/views, comments/views)
- Burst detection (peaks, max/mean ratio)
- Autocorrelation at different lags
- Entropy and regularity measures
- Trend features


In [None]:
%matplotlib inlineimport sysfrom pathlib import Path# add project root to pathproject_root = Path().resolve().parentsys.path.insert(0, str(project_root))# create output directory for plotsoutput_dir = project_root / "outputs" / "figures"output_dir.mkdir(parents=True, exist_ok=True)import pandas as pdimport numpy as npimport matplotlib.pyplot as pltimport seaborn as snsfrom datetime import datetimeimport warningswarnings.filterwarnings('ignore')# set plotting styletry:    plt.style.use('seaborn-v0_8-darkgrid')except OSError:    try:        plt.style.use('seaborn-darkgrid')    except OSError:        plt.style.use('default')sns.set_palette("husl")plt.rcParams['figure.figsize'] = (14, 8)plt.rcParams['font.size'] = 10plt.rcParams['axes.labelsize'] = 12plt.rcParams['axes.titlesize'] = 14plt.rcParams['figure.dpi'] = 100plt.rcParams['savefig.dpi'] = 150plt.rcParams['savefig.bbox'] = 'tight'# try to import PCA/UMAP for visualizationtry:    from sklearn.decomposition import PCA    from sklearn.preprocessing import StandardScaler    HAS_PCA = Trueexcept ImportError:    HAS_PCA = False    print("Warning: sklearn not available for PCA visualization")try:    import umap    HAS_UMAP = Trueexcept ImportError:    HAS_UMAP = False    print("Warning: umap-learn not available for UMAP visualization")# import project modulesfrom src.data.load_data import load_datafrom src.features.temporal_features import (    extract_temporal_features,    save_features,    compute_rolling_statistics,    compute_ratios,    detect_bursts,    compute_autocorrelation,    compute_entropy,    compute_trend_features,)# import IPython display for showing saved imagestry:    from IPython.display import Image, display    HAS_IPYTHON = Trueexcept ImportError:    HAS_IPYTHON = False

: 

## 1. Load Preprocessed Data

Load the preprocessed time series data.


In [None]:
# Read the raw engagement table from the project's data directory.
data_path = project_root / "data" / "raw" / "engagement.parquet"
df = load_data(data_path)

# Harmonise the schema expected by the rest of the notebook:
#   - `id` mirrors `user_id` when only the latter is present
#   - `label` ('fake' / 'normal') is derived from the boolean `is_fake_series`
has_fake_flag = 'is_fake_series' in df.columns
if 'id' not in df.columns and 'user_id' in df.columns:
    df['id'] = df['user_id']
if has_fake_flag and 'label' not in df.columns:
    label_names = {True: 'fake', False: 'normal'}
    df['label'] = df['is_fake_series'].map(label_names)

print(f"Dataset shape: {df.shape}")
print(f"Number of unique users: {df['id'].nunique()}")

# Class balance, reported for whichever of the two label columns exist.
distribution_reports = (
    ('is_fake_series', "\nFake series distribution:"),
    ('label', "\nLabel distribution:"),
)
for column_name, heading in distribution_reports:
    if column_name in df.columns:
        print(heading)
        print(df[column_name].value_counts())

## 2. Extract Temporal Features

Extract all temporal features, aggregated into one row per series ID (the `id` column).


In [None]:
# Build the feature table: one aggregated row per value of the `id` column.
print("Extracting temporal features...")
extraction_options = dict(
    id_column="id",
    timestamp_column="timestamp",
    window_sizes=[6, 12, 24],
    autocorr_lags=[1, 6, 12, 24],
    aggregate_per_id=True,
)
features_df = extract_temporal_features(df, **extraction_options)

# Everything except the identifier and the class label counts as a feature.
feature_cols = [c for c in features_df.columns if c not in ('id', 'label')]

print(f"\nFeatures extracted: {features_df.shape}")
print(f"Number of feature columns: {len(feature_cols)}")
print(f"\nFeature columns (first 20):")
print(feature_cols[:20])

## 3. Rolling Features Visualization

Show the rolling mean, rolling standard deviation, and lag-1 autocorrelation of `views` for a single sample series.


In [None]:
# Use the first ID in the dataset as the example, ordered chronologically.
sample_user_id = df['id'].unique()[0]
sample_series = df[df['id'] == sample_user_id].sort_values('timestamp')
timestamps = sample_series['timestamp']
views = sample_series['views']

# Rolling mean / std of `views` over 6-row and 24-row windows.
# NOTE(review): the plot labels call these "6h" / "24h", which presumes one
# row per hour - confirm against the data.
short_window = views.rolling(window=6, min_periods=1)
long_window = views.rolling(window=24, min_periods=1)
rolling_mean_6 = short_window.mean()
rolling_std_6 = short_window.std()
rolling_mean_24 = long_window.mean()
rolling_std_24 = long_window.std()


def _lag1_corr(window):
    """Correlate a window with itself shifted by one step (0 if too short)."""
    if len(window.dropna()) > 1:
        return window.corr(window.shift(1))
    return 0


# Simplified local autocorrelation: lag-1 correlation inside a 12-row window.
autocorr_lag_1 = views.rolling(window=12, min_periods=2).apply(_lag1_corr, raw=False)

# Two stacked panels sharing the time axis.
fig, axes = plt.subplots(2, 1, figsize=(16, 10), sharex=True)
ax1, ax2 = axes[0], axes[1]

# Top panel: the raw series with its rolling means and a +/- 1 std band.
ax1.plot(timestamps, views,
         label='Views', linewidth=2, color='blue', alpha=0.7)
ax1.plot(timestamps, rolling_mean_6,
         label='Rolling Mean (6h)', linewidth=1.5, color='green', linestyle='--')
ax1.plot(timestamps, rolling_mean_24,
         label='Rolling Mean (24h)', linewidth=1.5, color='orange', linestyle='--')
band_lower = rolling_mean_6 - rolling_std_6
band_upper = rolling_mean_6 + rolling_std_6
ax1.fill_between(timestamps, band_lower, band_upper,
                 alpha=0.2, color='green', label='Rolling Std (6h)')
ax1.set_ylabel('Views', fontsize=12)
ax1.set_title(f'Time Series with Rolling Statistics - User: {sample_user_id}',
              fontsize=14, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Bottom panel: rolling std on the left axis, autocorrelation on a twin axis.
ax2.plot(timestamps, rolling_std_6,
         label='Rolling Std (6h)', linewidth=1.5, color='red')
ax2.plot(timestamps, rolling_std_24,
         label='Rolling Std (24h)', linewidth=1.5, color='purple')
ax2_twin = ax2.twinx()
ax2_twin.plot(timestamps, autocorr_lag_1,
              label='Autocorr (lag=1)', linewidth=1.5, color='brown', linestyle=':')
ax2_twin.set_ylabel('Autocorrelation', fontsize=12, color='brown')
ax2_twin.tick_params(axis='y', labelcolor='brown')
ax2.set_xlabel('Timestamp', fontsize=12)
ax2.set_ylabel('Rolling Std', fontsize=12)
ax2.set_title('Feature Scores: Rolling Variance and Autocorrelation',
              fontsize=14, fontweight='bold')
ax2.legend(loc='upper left')
ax2_twin.legend(loc='upper right')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
rolling_panel_path = output_dir / "02_rolling_features_panel.png"
plt.savefig(rolling_panel_path, dpi=150, bbox_inches='tight')
plt.show()

# Also show the file that was written to disk, when IPython is available.
if HAS_IPYTHON and rolling_panel_path.exists():
    display(Image(str(rolling_panel_path)))

## 4. PCA/UMAP Visualization - Class Separation

Project the standardized feature space to 2D with PCA and, when `umap-learn` is installed, UMAP, to inspect how well the two classes separate.


In [None]:
# Reuse the feature table if it already exists in the session; otherwise build it.
if 'features_df' not in locals():
    print("Extracting temporal features...")
    features_df = extract_temporal_features(
        df,
        id_column="id",
        timestamp_column="timestamp",
        window_sizes=[6, 12, 24],
        autocorr_lags=[1, 6, 12, 24],
        aggregate_per_id=True,
    )
    print(f"Features extracted: {features_df.shape}")

# Feature matrix (missing values filled with 0) and numeric class vector.
feature_cols = [c for c in features_df.columns if c not in ['id', 'label']]
X = features_df[feature_cols].fillna(0).values
y = features_df['label'].map({'normal': 0, 'fake': 1}).values

if HAS_PCA:
    # StandardScaler is imported together with PCA, so scaling has to live
    # inside this guard; otherwise a missing sklearn raises NameError before
    # the "not available" message can be printed.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X_scaled)

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    normal_mask = y == 0
    fake_mask = y == 1

    # Left panel: PCA projection.
    axes[0].scatter(X_pca[normal_mask, 0], X_pca[normal_mask, 1],
                    alpha=0.6, label='Normal', color='blue', s=30)
    axes[0].scatter(X_pca[fake_mask, 0], X_pca[fake_mask, 1],
                    alpha=0.6, label='Fake', color='red', s=30)
    axes[0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)', fontsize=12)
    axes[0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)', fontsize=12)
    axes[0].set_title('PCA Visualization - Feature Space', fontsize=14, fontweight='bold')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Right panel: UMAP projection, or a placeholder when umap-learn is missing.
    if HAS_UMAP:
        reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=15, min_dist=0.1)
        X_umap = reducer.fit_transform(X_scaled)

        axes[1].scatter(X_umap[normal_mask, 0], X_umap[normal_mask, 1],
                        alpha=0.6, label='Normal', color='blue', s=30)
        axes[1].scatter(X_umap[fake_mask, 0], X_umap[fake_mask, 1],
                        alpha=0.6, label='Fake', color='red', s=30)
        axes[1].set_xlabel('UMAP 1', fontsize=12)
        axes[1].set_ylabel('UMAP 2', fontsize=12)
        axes[1].set_title('UMAP Visualization - Feature Space', fontsize=14, fontweight='bold')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)
    else:
        axes[1].text(0.5, 0.5, 'UMAP not available',
                     ha='center', va='center', fontsize=14)
        axes[1].axis('off')

    plt.tight_layout()
    pca_umap_path = output_dir / "02_pca_umap_visualization.png"
    plt.savefig(pca_umap_path, dpi=150, bbox_inches='tight')
    plt.show()

    if HAS_IPYTHON and pca_umap_path.exists():
        display(Image(str(pca_umap_path)))

    # The variance summary belongs to the PCA branch, not to the image check.
    print(f"\nPCA explained variance ratio: {pca.explained_variance_ratio_}")
    print(f"Total explained variance: {pca.explained_variance_ratio_.sum():.2%}")
else:
    print("PCA not available. Install sklearn to enable this visualization.")

In [None]:
# Reuse the feature table built by the extraction cell earlier in the notebook.
# The (expensive) extraction is only repeated when `features_df` is missing,
# e.g. when this cell is run on its own in a fresh kernel.
if 'features_df' not in locals():
    print("Extracting temporal features...")
    features_df = extract_temporal_features(
        df,
        id_column="id",
        timestamp_column="timestamp",
        window_sizes=[6, 12, 24],
        autocorr_lags=[1, 6, 12, 24],
        aggregate_per_id=True,
    )

# Everything except the identifier and the class label counts as a feature.
feature_cols = [c for c in features_df.columns if c not in ['id', 'label']]

print(f"\nFeatures extracted: {features_df.shape}")
print(f"Number of feature columns: {len(feature_cols)}")
print(f"\nFeature columns (first 20):")
print(feature_cols[:20])

## 5. Feature Importance Heatmap

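Visualize feature importance as a heatmap for easy identification of discriminative features. This cell reads `importance_df`, which is computed by the Mann-Whitney significance cell further down the notebook; run that cell first, otherwise this cell only prints a reminder.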


In [None]:
# Heatmap and bar chart of the 30 features with the largest effect size.
# `importance_df` is produced by the statistical-significance cell; when it is
# not in the session yet, this cell only prints a reminder.
if 'importance_df' in locals() and len(importance_df) > 0:
    top_30_features = importance_df.head(30)

    # Metrics shown in the heatmap; p-values are displayed as -log10(p) so that
    # smaller p-values map to larger values (1e-10 avoids log10(0)).
    heatmap_data = top_30_features[['normal_mean', 'fake_mean', 'effect_size', 'p_value']].copy()
    heatmap_data['-log10(p_value)'] = -np.log10(heatmap_data['p_value'] + 1e-10)
    heatmap_data = heatmap_data[['normal_mean', 'fake_mean', 'effect_size', '-log10(p_value)']]
    heatmap_data.index = top_30_features['feature']

    # --- figure 1: heatmap (metrics as rows, features as columns) ---
    fig, ax = plt.subplots(1, 1, figsize=(10, max(10, len(top_30_features) * 0.4)))
    sns.heatmap(heatmap_data.T, annot=False, fmt='.2f', cmap='YlOrRd',
                cbar_kws={'label': 'Value'}, ax=ax, linewidths=0.5)
    ax.set_title('Top 30 Feature Importance Heatmap', fontsize=14, fontweight='bold')
    ax.set_xlabel('Feature', fontsize=12)
    ax.set_ylabel('Metric', fontsize=12)
    # Tilt the long feature names so neighbouring labels do not overlap.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    plt.tight_layout()
    heatmap_path = output_dir / "02_feature_engineering_01_plot.png"
    plt.savefig(heatmap_path, dpi=150, bbox_inches='tight')
    plt.show()
    if HAS_IPYTHON and heatmap_path.exists():
        display(Image(str(heatmap_path)))

    # --- figure 2: effect size per feature, significant ones in red ---
    fig, ax = plt.subplots(1, 1, figsize=(12, max(8, len(top_30_features) * 0.3)))
    colors = ['red' if p < 0.05 else 'gray' for p in top_30_features['p_value']]
    ax.barh(range(len(top_30_features)), top_30_features['effect_size'], color=colors, alpha=0.7)
    ax.set_yticks(range(len(top_30_features)))
    ax.set_yticklabels(top_30_features['feature'], fontsize=9)
    ax.set_xlabel('Effect Size', fontsize=12)
    ax.set_title('Top 30 Features by Effect Size (Red = Significant, p<0.05)', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    effect_size_path = output_dir / "02_feature_engineering_02_plot.png"
    plt.savefig(effect_size_path, dpi=150, bbox_inches='tight')
    plt.show()
    if HAS_IPYTHON and effect_size_path.exists():
        display(Image(str(effect_size_path)))
else:
    print("Feature importance not computed yet. Run the statistical analysis cell first.")

## 3. Feature Statistics

Analyze the distribution and statistics of extracted features.


In [None]:
# Per-label descriptive statistics for a hand-picked family of features.
print("Feature statistics by label:\n")
feature_cols = [c for c in features_df.columns if c not in ('id', 'label')]

# A feature is "key" when its name contains any of these substrings.
key_name_fragments = (
    'max_mean_ratio',
    'n_peaks',
    'entropy',
    'regularity',
    'autocorr_lag_1',
    'ratio_likes_views',
)
key_features = []
for col in feature_cols:
    if any(fragment in col for fragment in key_name_fragments):
        key_features.append(col)

print(f"Key features statistics:")
label_groups = features_df.groupby('label')
print(label_groups[key_features].describe())

## 4. Feature Distributions - Normal vs Fake

Compare feature distributions between normal and fake engagement patterns.


In [None]:
# Overlaid Normal-vs-Fake histograms for a fixed list of key features.
key_features_viz = [
    'views_max_mean_ratio',
    'views_n_peaks',
    'views_entropy',
    'views_regularity',
    'views_autocorr_lag_1',
    'ratio_likes_views',
    'likes_max_mean_ratio',
    'likes_n_peaks',
]

# Keep only the features that were actually extracted.
key_features_viz = [f for f in key_features_viz if f in features_df.columns]

n_features = len(key_features_viz)
n_cols = 3

if n_features == 0:
    # plt.subplots(0, 3) would raise, so report and skip the figure instead.
    print("None of the selected key features are present in features_df; nothing to plot.")
else:
    n_rows = (n_features + n_cols - 1) // n_cols  # ceil(n_features / n_cols)

    # squeeze=False always yields a 2-D array of Axes, so flattening works for
    # any feature count (the old `[axes]` wrapper broke the 1-feature case,
    # where subplots still returns an array of three Axes).
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    # Split once by label instead of re-filtering inside the loop.
    normal_rows = features_df[features_df['label'] == 'normal']
    fake_rows = features_df[features_df['label'] == 'fake']

    for idx, feature in enumerate(key_features_viz):
        ax = axes[idx]

        normal_data = normal_rows[feature].dropna()
        fake_data = fake_rows[feature].dropna()

        ax.hist(normal_data, bins=30, alpha=0.6, label='Normal', color='blue', density=True)
        ax.hist(fake_data, bins=30, alpha=0.6, label='Fake', color='red', density=True)

        ax.set_xlabel(feature.replace('_', ' ').title(), fontsize=10)
        ax.set_ylabel('Density', fontsize=10)
        ax.set_title(f'Distribution: {feature}', fontsize=12, fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Hide the grid cells that have no feature assigned.
    for idx in range(n_features, len(axes)):
        axes[idx].axis('off')

    plt.tight_layout()
    distributions_path = output_dir / "02_feature_engineering_03_plot.png"
    plt.savefig(distributions_path, dpi=150, bbox_inches='tight')
    plt.show()
    if HAS_IPYTHON and distributions_path.exists():
        display(Image(str(distributions_path)))

In [None]:
from scipy.stats import mannwhitneyu

# Rank features by how strongly they differ between the two classes:
#   - p-value from a two-sided Mann-Whitney U test
#   - effect size = |difference in means| / pooled standard deviation
importance_columns = [
    'feature', 'normal_mean', 'fake_mean', 'mean_diff',
    'effect_size', 'p_value', 'significant',
]
feature_importance = []
skipped_features = []  # (feature, reason) pairs for features that could not be tested

# Split once by label instead of re-filtering the frame for every feature.
normal_rows = features_df[features_df['label'] == 'normal']
fake_rows = features_df[features_df['label'] == 'fake']

for feature in feature_cols:
    normal_values = normal_rows[feature].dropna()
    fake_values = fake_rows[feature].dropna()

    # A feature with no observations in one class cannot be compared.
    if len(normal_values) == 0 or len(fake_values) == 0:
        continue

    try:
        stat, p_value = mannwhitneyu(normal_values, fake_values, alternative='two-sided')

        mean_diff = fake_values.mean() - normal_values.mean()
        pooled_std = np.sqrt((normal_values.std()**2 + fake_values.std()**2) / 2)
        effect_size = mean_diff / (pooled_std + 1e-6)  # epsilon guards against a zero std
    except (ValueError, TypeError) as exc:
        # Stay best-effort (one bad column must not abort the ranking), but
        # record what was skipped instead of swallowing every exception.
        skipped_features.append((feature, str(exc)))
        continue

    feature_importance.append({
        'feature': feature,
        'normal_mean': normal_values.mean(),
        'fake_mean': fake_values.mean(),
        'mean_diff': mean_diff,
        'effect_size': abs(effect_size),
        'p_value': p_value,
        'significant': p_value < 0.05,
    })

# Passing `columns` keeps the frame sortable even when no feature was testable.
importance_df = (
    pd.DataFrame(feature_importance, columns=importance_columns)
    .sort_values('effect_size', ascending=False)
)

if skipped_features:
    print(f"Skipped {len(skipped_features)} feature(s) that could not be tested:")
    for feature, reason in skipped_features:
        print(f"  {feature}: {reason}")

print("Top 20 most discriminative features:")
print(importance_df.head(20)[['feature', 'normal_mean', 'fake_mean', 'effect_size', 'p_value', 'significant']])

## 6. Feature Distributions Analysis

Visualize distributions of key features: variance, entropy, and burst features.


In [None]:
# Up to four features per family, selected by substring of the column name.
variance_features = [f for f in feature_cols if 'rolling_std' in f or 'variance' in f][:4]
entropy_features = [f for f in feature_cols if 'entropy' in f][:4]
burst_features = [f for f in feature_cols if 'peaks' in f or 'max_mean' in f][:4]


def _plot_family_row(row_axes, features, title_prefix, name_part):
    """Draw Normal-vs-Fake density histograms for one feature family.

    row_axes     : the Axes of one grid row, one per potential feature
    features     : feature column names to plot (at most len(row_axes))
    title_prefix : text placed before the short feature name in each title
    name_part    : index into feature.split("_") used as the short name
    """
    normal_rows = features_df[features_df['label'] == 'normal']
    fake_rows = features_df[features_df['label'] == 'fake']

    for ax, feature in zip(row_axes, features):
        normal_data = normal_rows[feature].dropna()
        fake_data = fake_rows[feature].dropna()

        ax.hist(normal_data, bins=30, alpha=0.6, label='Normal', color='blue', density=True)
        ax.hist(fake_data, bins=30, alpha=0.6, label='Fake', color='red', density=True)
        ax.set_xlabel(feature.replace('_', ' ').title(), fontsize=10)
        ax.set_ylabel('Density', fontsize=10)
        short_name = feature.split("_")[name_part]
        ax.set_title(f'{title_prefix}: {short_name}', fontsize=11, fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Hide panels of this row that have no feature, so that families with
    # fewer than four matches do not leave empty axes in the figure.
    for ax in row_axes[len(features):]:
        ax.axis('off')


# 3 x 4 grid: one row per family (variance, entropy, burst).
fig, axes = plt.subplots(3, 4, figsize=(18, 12))
_plot_family_row(axes[0], variance_features, 'Variance Feature', -1)
_plot_family_row(axes[1], entropy_features, 'Entropy Feature', 0)
_plot_family_row(axes[2], burst_features, 'Burst Feature', -1)

plt.suptitle('Feature Distributions: Variance, Entropy, and Burst Features', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
family_grid_path = output_dir / "02_feature_engineering_04_plot.png"
plt.savefig(family_grid_path, dpi=150, bbox_inches='tight')
plt.show()
if HAS_IPYTHON and family_grid_path.exists():
    display(Image(str(family_grid_path)))

## 7. Before/After Normalization Visualization

Compare feature distributions before and after normalization.


In [None]:
from src.data.preprocess import normalize_engagement_metrics

# Features computed from the raw engagement metrics.
features_before = extract_temporal_features(
    df,
    id_column="id",
    timestamp_column="timestamp",
    aggregate_per_id=True,
    normalize=False
)

# Features computed from a copy of the data whose engagement metrics were
# standardized first; the extractor's own `normalize` flag stays off.
metric_cols = ['views', 'likes', 'comments', 'shares']
df_normalized = df.copy()
df_normalized[metric_cols] = normalize_engagement_metrics(
    df_normalized[metric_cols],
    method='standardize'
)
features_after = extract_temporal_features(
    df_normalized,
    id_column="id",
    timestamp_column="timestamp",
    aggregate_per_id=True,
    normalize=False
)

# Compare only features that exist in both tables.
key_features = ['views_rolling_mean_6', 'views_rolling_std_6', 'views_entropy', 'views_n_peaks']
key_features = [f for f in key_features if f in features_before.columns and f in features_after.columns]


def _plot_label_histograms(ax, frame, feature, title):
    """Overlay Normal and Fake density histograms of `feature` from `frame`."""
    normal_values = frame[frame['label'] == 'normal'][feature].dropna()
    fake_values = frame[frame['label'] == 'fake'][feature].dropna()

    ax.hist(normal_values, bins=30, alpha=0.6, label='Normal', color='blue', density=True)
    ax.hist(fake_values, bins=30, alpha=0.6, label='Fake', color='red', density=True)
    ax.set_xlabel('Value', fontsize=10)
    ax.set_ylabel('Density', fontsize=10)
    ax.set_title(title, fontsize=11, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)


if not key_features:
    # plt.subplots(2, 0) would raise, so report and skip the figure instead.
    print("None of the selected key features are available; skipping the before/after plot.")
else:
    # squeeze=False keeps `axes` 2-D even when only one feature is available,
    # so axes[row, col] indexing is always valid.
    fig, axes = plt.subplots(2, len(key_features), figsize=(5 * len(key_features), 10), squeeze=False)

    for idx, feature in enumerate(key_features):
        # "\n" (not "\\n") so the feature name lands on its own title line.
        _plot_label_histograms(axes[0, idx], features_before, feature, f'Before Normalization\n{feature}')
        _plot_label_histograms(axes[1, idx], features_after, feature, f'After Normalization\n{feature}')

    plt.suptitle('Feature Distributions: Before vs After Normalization', fontsize=16, fontweight='bold', y=0.995)
    plt.tight_layout()
    normalization_path = output_dir / "02_feature_engineering_05_plot.png"
    plt.savefig(normalization_path, dpi=150, bbox_inches='tight')
    plt.show()
    if HAS_IPYTHON and normalization_path.exists():
        display(Image(str(normalization_path)))

# Per-feature mean / std of each class, before and after normalization.
print("=" * 60)
print("NORMALIZATION IMPACT")
print("=" * 60)
for feature in key_features:
    print(f"\n{feature}:")
    for stage, frame in (('Before', features_before), ('After', features_after)):
        for class_title, class_label in (('Normal', 'normal'), ('Fake', 'fake')):
            values = frame[frame['label'] == class_label][feature]
            print(f"  {stage} - {class_title} mean: {values.mean():.4f}, std: {values.std():.4f}")
print("=" * 60)

In [None]:
# Box plots (Normal vs Fake) for the nine features ranked highest in importance_df.
top_features = importance_df.head(9)['feature'].tolist()

fig, axes = plt.subplots(3, 3, figsize=(18, 15))
axes = axes.flatten()

is_normal = features_df['label'] == 'normal'
is_fake = features_df['label'] == 'fake'

for idx, (ax, feature) in enumerate(zip(axes, top_features)):
    normal_data = features_df[is_normal][feature].dropna()
    fake_data = features_df[is_fake][feature].dropna()

    # One box per class, filled with the class colour used across the notebook.
    bp = ax.boxplot([normal_data, fake_data], labels=['Normal', 'Fake'], patch_artist=True)
    for box, face_colour in zip(bp['boxes'], ('blue', 'red')):
        box.set_facecolor(face_colour)
        box.set_alpha(0.6)

    ax.set_ylabel(feature.replace('_', ' ').title(), fontsize=10)
    ax.set_title(f'Top Feature #{idx+1}', fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
boxplot_path = output_dir / "02_feature_engineering_06_plot.png"
plt.savefig(boxplot_path, dpi=150, bbox_inches='tight')
plt.show()
if HAS_IPYTHON and boxplot_path.exists():
    display(Image(str(boxplot_path)))

## 7. Example: Feature Extraction for Single Video

Demonstrate feature extraction for a single video to understand the process.


In [None]:
# Take the first ID of each class and pull its chronologically ordered rows.
normal_id = df.loc[df['label'] == 'normal', 'id'].iloc[0]
fake_id = df.loc[df['label'] == 'fake', 'id'].iloc[0]

normal_video = df[df['id'] == normal_id].sort_values('timestamp')
fake_video = df[df['id'] == fake_id].sort_values('timestamp')
normal_views = normal_video['views']
fake_views = fake_video['views']

print(f"Normal video: {normal_id}")
print(f"Fake video: {fake_id}\n")

# Each section below calls one of the project's feature helpers on `views`.
print("=== Rolling Statistics (views) ===")
normal_rolling = compute_rolling_statistics(normal_views, window_sizes=[6, 12, 24])
print("Normal video - sample rolling features:")
print(normal_rolling.head(10))

print("\n=== Burst Detection (views) ===")
normal_bursts = detect_bursts(normal_views)
fake_bursts = detect_bursts(fake_views)
print(f"Normal: {normal_bursts}")
print(f"Fake: {fake_bursts}")

print("\n=== Autocorrelation (views) ===")
example_lags = [1, 6, 12, 24]
normal_autocorr = compute_autocorrelation(normal_views, lags=example_lags)
fake_autocorr = compute_autocorrelation(fake_views, lags=example_lags)
print(f"Normal: {normal_autocorr}")
print(f"Fake: {fake_autocorr}")

print("\n=== Entropy (views) ===")
normal_entropy = compute_entropy(normal_views)
fake_entropy = compute_entropy(fake_views)
print(f"Normal: {normal_entropy}")
print(f"Fake: {fake_entropy}")

print("\n=== Ratios ===")
normal_ratios = compute_ratios(
    normal_video,
    numerator_cols=['likes', 'comments'],
    denominator_col='views',
)
print("Normal video - sample ratios:")
print(normal_ratios.head(10))

## 8. Save Features

Save the extracted features to disk for use in modeling.


In [None]:
# Write the aggregated feature table to the processed-data directory as parquet.
output_path = project_root / "data" / "processed" / "temporal_features.parquet"
save_features(features_df, output_path=str(output_path), output_format="parquet")

# Short report: number of features, number of rows, and rows per class.
n_normal = len(features_df[features_df['label'] == 'normal'])
n_fake = len(features_df[features_df['label'] == 'fake'])

print(f"\nFeatures saved successfully!")
print(f"Total features: {len(feature_cols)}")
print(f"Total videos: {len(features_df)}")
print(f"Normal videos: {n_normal}")
print(f"Fake videos: {n_fake}")

## 9. Feature Correlation Analysis

Analyze correlations between features to identify redundancy.


In [None]:
# Pairwise correlations among the 20 highest-ranked features that exist in features_df.
top_20_features = [
    f for f in importance_df.head(20)['feature'].tolist()
    if f in features_df.columns
]
corr_matrix = features_df[top_20_features].corr()

# Heatmap on a symmetric [-1, 1] colour scale centred at zero.
heatmap_options = dict(
    annot=False,
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
    square=True,
    fmt='.2f',
)
plt.figure(figsize=(14, 12))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix (Top 20 Features)', fontsize=14, fontweight='bold')
plt.tight_layout()
correlation_path = output_dir / "02_feature_engineering_07_plot.png"
plt.savefig(correlation_path, dpi=150, bbox_inches='tight')
plt.show()
if HAS_IPYTHON and correlation_path.exists():
    display(Image(str(correlation_path)))

# Walk the upper triangle (i < j) so that every pair is inspected exactly once.
print("\nHighly correlated feature pairs (|correlation| > 0.8):")
corr_columns = corr_matrix.columns
n_corr_columns = len(corr_columns)
high_corr_pairs = [
    (corr_columns[i], corr_columns[j], corr_matrix.iloc[i, j])
    for i in range(n_corr_columns)
    for j in range(i + 1, n_corr_columns)
    if abs(corr_matrix.iloc[i, j]) > 0.8
]

if not high_corr_pairs:
    print("No highly correlated pairs found (threshold: 0.8)")
else:
    # Only the first ten pairs are listed.
    for feat1, feat2, corr in high_corr_pairs[:10]:
        print(f"{feat1} <-> {feat2}: {corr:.3f}")

## 10. Summary

Summary of feature engineering results.


In [None]:
# Text summary: feature counts per family, significance results, output location.
banner = "=" * 60


def _count_features(*fragments):
    """Number of feature columns whose name contains any of `fragments`."""
    return len([f for f in feature_cols if any(fragment in f for fragment in fragments)])


# (label shown in the report, substrings that identify the family)
feature_families = (
    ("Rolling statistics", ('rolling',)),
    ("Burst features", ('peaks', 'max_mean')),
    ("Autocorrelation", ('autocorr',)),
    ("Entropy/Regularity", ('entropy', 'regularity')),
    ("Ratio features", ('ratio',)),
    ("Trend features", ('trend',)),
)

print(banner)
print("FEATURE ENGINEERING SUMMARY")
print(banner)

print(f"\n1. Total features extracted: {len(feature_cols)}")
for family_name, fragments in feature_families:
    print(f"   - {family_name}: {_count_features(*fragments)}")

print(f"\n2. Statistical significance:")
significant_features = importance_df[importance_df['significant'] == True]
print(f"   - Significant features (p < 0.05): {len(significant_features)}")
print(f"   - Top 5 most discriminative:")
for idx, row in importance_df.head(5).iterrows():
    print(f"     {row['feature']}: effect_size={row['effect_size']:.3f}, p={row['p_value']:.4f}")

print(f"\n3. Feature quality:")
print(f"   - Features saved to: {output_path}")
print(f"   - Ready for baseline models (tree-based, anomaly detection)")
print(f"   - Ready for deep learning models (as additional features)")
print(banner)