# 01 - Exploratory Data Analysis

Stage 0 of the critique detection pipeline: load, validate, profile, and
visualize the YouTube comment dataset.

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from src.data_ingest import ingest, profile_data, save_profile
from src.preprocess import preprocess_dataframe

## Load and Validate Data

In [None]:
# Path to the raw comment export; update this to point at your input file.
DATA_PATH = '../data/raw/comments.jsonl'  # or .csv / .json

df = ingest(DATA_PATH)

# Reuse the profile stored in df.attrs when one is present and only call
# profile_data(df) when it is missing. Passing profile_data(df) as the default
# argument of .get() would run it on every execution, even when its result is
# immediately discarded in favour of the stored profile.
if 'profile' in df.attrs:
    profile = df.attrs['profile']
else:
    profile = profile_data(df)

print(f"Loaded {len(df)} comments")
print("\nProfile summary:")
# Only non-dict entries are printed; nested dict entries are skipped so the
# summary stays one line per key.
for k, v in profile.items():
    if not isinstance(v, dict):
        print(f"  {k}: {v}")

## Preprocess

In [None]:
# Run the preprocessing step, then list the resulting columns and preview
# the first rows of the frame.
df = preprocess_dataframe(df)
column_names = list(df.columns)
print(f"Columns: {column_names}")
df.head(5)

## Text Length Distribution

In [None]:
# Side-by-side histograms of comment length in characters and in words.
# Each spec: (column, title, x-axis label, x-axis upper limit, extra hist kwargs).
histogram_specs = [
    ('text_length', 'Comment Length Distribution', 'Characters', 1000, {}),
    ('word_count', 'Word Count Distribution', 'Words', 200, {'color': 'orange'}),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, (column, title, x_label, x_max, extra) in zip(axes, histogram_specs):
    ax.hist(df[column], bins=100, edgecolor='black', alpha=0.7, **extra)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Count')
    # The view is clipped to [0, x_max]; anything beyond it is not displayed.
    ax.set_xlim(0, x_max)

plt.tight_layout()
plt.show()

## Language Distribution

In [None]:
# Horizontal bar chart of the 15 most frequent values in the 'language' column.
lang_counts = df['language'].value_counts().head(15)
ax = lang_counts.plot.barh(figsize=(10, 6), color='steelblue')
ax.set_title('Top 15 Languages')
ax.set_xlabel('Count')
plt.tight_layout()
plt.show()

## Feature Summary Statistics

In [None]:
# Descriptive statistics (via DataFrame.describe) for the engineered text
# feature columns, rounded to three decimals for display.
feature_cols = [
    'text_length',
    'word_count',
    'punctuation_ratio',
    'caps_ratio',
    'emoji_count',
    'exclamation_count',
    'question_mark_count',
]
feature_summary = df[feature_cols].describe()
feature_summary.round(3)

## Trivial/Empty Comment Stats

In [None]:
# Report the count and percentage of comments flagged by each optional flag
# column; a column that is absent from the frame is silently skipped.
if 'is_trivial' in df.columns:
    trivial_flags = df['is_trivial']
    trivial_pct = 100 * trivial_flags.mean()
    print(f"Trivial comments: {trivial_flags.sum()} ({trivial_pct:.1f}%)")
if '_emoji_only' in df.columns:
    emoji_flags = df['_emoji_only']
    emoji_pct = 100 * emoji_flags.mean()
    print(f"Emoji-only comments: {emoji_flags.sum()} ({emoji_pct:.1f}%)")