In [2]:
import sys
from pathlib import Path
from datetime import datetime, timedelta
import json
from collections import Counter
from urllib.parse import urlparse
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# The current working directory is treated as the workspace root
# (assumes the notebook is launched from there).
workspace_root = Path.cwd()

# Put the local `src/` tree first on the import path so the project package
# imported below resolves from the workspace checkout.
src_dir = workspace_root / 'src'
sys.path.insert(0, str(src_dir))

from thesis_pipeline.io.config import load_all_configs

# Shared plotting defaults for every figure produced in this notebook.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6), 'font.size': 10})

started_at = datetime.now().isoformat()
print(f"News QC report generation started: {started_at}")
print(f"Workspace: {workspace_root}")

News QC report generation started: 2025-12-19T09:22:28.704475
Workspace: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment


## 1. Load Configuration and Data

In [3]:
# Load every pipeline config once and keep handles to the two sections
# referenced by this notebook.
configs = load_all_configs(workspace_root / 'configs')
news_cfg, global_cfg = configs['news'], configs['global']

# Input (silver layer) and output (report) locations, all under the workspace.
silver_dir = workspace_root / 'data/01_silver/news'
report_dir = workspace_root / 'reports/data_validation/2016-09_2016-10/news'
figures_dir, tables_dir = report_dir / 'figures', report_dir / 'tables'

# Create the report tree; only the report root may need missing parents.
report_dir.mkdir(parents=True, exist_ok=True)
for output_dir in (figures_dir, tables_dir):
    output_dir.mkdir(exist_ok=True)

for label, location in (("Silver layer", silver_dir), ("Report output", report_dir)):
    print(f"{label}: {location.relative_to(workspace_root)}")

Silver layer: data/01_silver/news
Report output: reports/data_validation/2016-09_2016-10/news


In [4]:
# Discover the daily silver-layer parquet files (one file per day, named by date).
silver_files = sorted(silver_dir.glob('2016-*.parquet'))
print(f"Found {len(silver_files)} daily files")

# Fail early with an actionable message: pd.concat on an empty list would
# otherwise raise an opaque "No objects to concatenate" ValueError.
if not silver_files:
    raise FileNotFoundError(f"No '2016-*.parquet' files found in {silver_dir}")

print("Loading all data...")
# Read each daily file into its own frame. A comprehension keeps the per-file
# frame from sharing the name `df` with the combined frame built below.
all_articles = [
    pd.read_parquet(file) for file in tqdm(silver_files, desc="Reading files")
]

# Single concat at the end (not in the loop) to avoid quadratic copying.
df = pd.concat(all_articles, ignore_index=True)

print(f"\nâœ“ Loaded {len(df):,} articles")
print(f"  Date range: {df['date'].min()} to {df['date'].max()}")
print(f"  Columns: {df.columns.tolist()}")

Found 61 daily files
Loading all data...


Reading files: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 61/61 [00:01<00:00, 55.55it/s]


âœ“ Loaded 93,669 articles
  Date range: 2016-09-01 to 2016-10-31
  Columns: ['date', 'requested_url', 'plain_text', 'published_date', 'title', 'tags', 'categories', 'author', 'sitename', 'image_url', 'language', 'language_score', 'responded_url', 'publisher', 'warc_path', 'crawl_date']





## 2. Temporal Coverage Analysis

In [5]:
# Expected date range (inclusive), rendered as 'YYYY-MM-DD' strings so it can
# be compared with the string dates stored in df['date'].
start_date = '2016-09-01'
end_date = '2016-10-31'
expected_dates = pd.date_range(start_date, end_date, freq='D')
expected_dates_str = [d.strftime('%Y-%m-%d') for d in expected_dates]

# Actual dates present in the data
actual_dates = sorted(df['date'].unique())

# Coverage is measured against the expected window only. Counting every
# distinct date (as len(actual_dates) would) lets dates outside the window
# inflate the percentage and hide genuinely missing days.
expected_set = set(expected_dates_str)
actual_set = set(actual_dates)
covered_dates = expected_set & actual_set
missing_dates = expected_set - actual_set
unexpected_dates = actual_set - expected_set
coverage_pct = len(covered_dates) / len(expected_dates_str) * 100

print(f"Temporal Coverage:")
print(f"  Expected: {len(expected_dates_str)} days ({start_date} to {end_date})")
print(f"  Actual: {len(actual_dates)} days")
print(f"  Coverage: {coverage_pct:.1f}%")

if missing_dates:
    print(f"\nâš  Missing dates ({len(missing_dates)}): {sorted(missing_dates)}")
else:
    print(f"\nâœ“ Complete coverage (no missing dates)")

# Surface data that falls outside the reporting window instead of silently
# folding it into the statistics.
if unexpected_dates:
    print(f"\nâš  Dates outside expected range ({len(unexpected_dates)}): {sorted(unexpected_dates)}")

Temporal Coverage:
  Expected: 61 days (2016-09-01 to 2016-10-31)
  Actual: 61 days
  Coverage: 100.0%

âœ“ Complete coverage (no missing dates)


## 3. Daily Article Counts

In [6]:
# One row per date with the number of articles on that date, sorted by date.
daily_counts = (
    df.groupby('date')
    .size()
    .reset_index(name='count')
    .sort_values('date')
)
per_day = daily_counts['count']

print("Daily article statistics:")
print(f"  Total articles: {len(df):,}")
print(f"  Mean per day: {per_day.mean():.1f}")
print(f"  Median per day: {per_day.median():.0f}")
print(f"  Min per day: {per_day.min()}")
print(f"  Max per day: {per_day.max()}")
print(f"  Std dev: {per_day.std():.1f}")

# Persist the per-day table next to the report.
daily_counts_csv = tables_dir / 'daily_article_counts.csv'
daily_counts.to_csv(daily_counts_csv, index=False)
print(f"\nâœ“ Saved: {daily_counts_csv.relative_to(workspace_root)}")

Daily article statistics:
  Total articles: 93,669
  Mean per day: 1535.6
  Median per day: 150
  Min per day: 50
  Max per day: 7481
  Std dev: 2593.0

âœ“ Saved: reports/data_validation/2016-09_2016-10/news/tables/daily_article_counts.csv


In [7]:
# Line chart of articles per day, with the overall daily mean as a dashed
# reference line.
fig, ax = plt.subplots(figsize=(14, 6))

mean_count = daily_counts['count'].mean()
ax.plot(daily_counts['date'], daily_counts['count'], marker='o', linewidth=1.5, markersize=3)
ax.axhline(y=mean_count, color='r', linestyle='--', label=f'Mean: {mean_count:.0f}')
ax.set(
    xlabel='Date',
    ylabel='Article Count',
    title='Daily News Article Counts: Sep-Oct 2016',
)
ax.legend()
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()

# Write the figure to disk and release it; nothing is shown inline.
daily_counts_png = figures_dir / 'daily_article_counts.png'
fig.savefig(daily_counts_png, dpi=300, bbox_inches='tight')
plt.close()

print(f"âœ“ Saved: {daily_counts_png.relative_to(workspace_root)}")

âœ“ Saved: reports/data_validation/2016-09_2016-10/news/figures/daily_article_counts.png


## 4. Text Length Distribution

In [8]:
# Character counts of the article body and the title, stored as new columns.
for source_col, length_col in (('plain_text', 'text_length'), ('title', 'title_length')):
    df[length_col] = df[source_col].str.len()

print("Text length statistics:")
for heading, length_col in (("\nArticle text:", 'text_length'), ("\nArticle title:", 'title_length')):
    print(heading)
    print(df[length_col].describe())

Text length statistics:

Article text:
count    93669.000000
mean      2698.933382
std       1806.848335
min        300.000000
25%       1275.000000
50%       2343.000000
75%       3698.000000
max       9997.000000
Name: text_length, dtype: float64

Article title:
count    93669.000000
mean        57.756995
std         21.970231
min          3.000000
25%         44.000000
50%         57.000000
75%         69.000000
max        264.000000
Name: title_length, dtype: float64


In [9]:
# Side-by-side histograms of body length and title length, each with its
# median marked by a dashed red line.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = [
    # (axis, length column, x label, panel title, extra hist kwargs)
    (axes[0], 'text_length', 'Text Length (characters)', 'Article Text Length Distribution', {}),
    (axes[1], 'title_length', 'Title Length (characters)', 'Article Title Length Distribution', {'color': 'orange'}),
]
for panel_ax, length_col, x_label, panel_title, hist_kwargs in panels:
    lengths = df[length_col]
    median_length = lengths.median()
    panel_ax.hist(lengths, bins=50, edgecolor='black', alpha=0.7, **hist_kwargs)
    panel_ax.axvline(median_length, color='r', linestyle='--', label=f'Median: {median_length:.0f}')
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Frequency')
    panel_ax.set_title(panel_title)
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
text_length_png = figures_dir / 'text_length_distribution.png'
fig.savefig(text_length_png, dpi=300, bbox_inches='tight')
plt.close()

print(f"âœ“ Saved: {text_length_png.relative_to(workspace_root)}")

âœ“ Saved: reports/data_validation/2016-09_2016-10/news/figures/text_length_distribution.png


## 5. Domain/Source Analysis

In [10]:
# Extract domains from URLs
def extract_domain(url):
    """Return the normalised host name of ``url``, or ``None`` if it has none.

    The host is lower-cased, stripped of any ``user:password@`` prefix and
    ``:port`` suffix, and a leading ``www.`` is removed, so that the same
    site is always counted under one key.

    Args:
        url: The URL to inspect. Non-string values (``None``, NaN, ...) and
            empty strings are treated as "no domain".

    Returns:
        The host name as a ``str``, or ``None`` when the input is not a
        string, cannot be parsed, or contains no network location (for
        example a scheme-less ``"example.com/path"``).
    """
    if not isinstance(url, str) or not url:
        return None
    try:
        # `.hostname` (unlike `.netloc`) drops credentials and port and is
        # already lower-cased.
        host = urlparse(url.strip()).hostname
    except ValueError:
        # Raised by urllib for malformed network locations such as an
        # unbalanced IPv6 bracket.
        return None
    if not host:
        return None
    # Remove www. prefix
    if host.startswith('www.'):
        host = host[4:]
    return host or None

# Derive the source domain of every article from the URL that was requested.
df['domain'] = df['requested_url'].apply(extract_domain)

# The 30 most frequent domains, most frequent first.
domain_counts = df['domain'].value_counts()
top_domains = domain_counts.head(30)

print(f"Unique domains: {df['domain'].nunique():,}")
print(f"\nTop 30 domains:")
print(top_domains)

# Persist the table; the single value column is written under the header 'count'.
top_domains_csv = tables_dir / 'top_30_domains.csv'
top_domains.to_csv(top_domains_csv, header=['count'])
print(f"\nâœ“ Saved: {top_domains_csv.relative_to(workspace_root)}")

Unique domains: 1,957

Top 30 domains:
domain
thehindu.com                   3735
theguardian.com                2825
modernghana.com                2269
irishtimes.com                 2263
seattletimes.com               2067
dailyrecord.co.uk              2005
firmenpresse.de                1710
theglobeandmail.com            1610
metronews.ca                   1566
finanznachrichten.de           1269
manchestereveningnews.co.uk    1251
cnn.com                        1159
reuters.com                    1143
scotsman.com                   1061
nation.co.ke                    985
arkansasonline.com              941
reviewjournal.com               927
taipeitimes.com                 875
yorkshirepost.co.uk             832
timesofmalta.com                828
mercedsunstar.com               814
miamiherald.com                 805
sunherald.com                   794
ledger-enquirer.com             681
grandforksherald.com            678
charlotteobserver.com           657
bnd.com           

In [11]:
# Horizontal bar chart of the 20 most frequent domains, largest at the top.
fig, ax = plt.subplots(figsize=(12, 8))

top_20 = df['domain'].value_counts().head(20)
bar_positions = range(len(top_20))
ax.barh(bar_positions, top_20.values, color='steelblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_20.index)
# barh draws the first entry at the bottom; flip so rank 1 is on top.
ax.invert_yaxis()
ax.set(xlabel='Article Count', title='Top 20 News Domains')
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
top_domains_png = figures_dir / 'top_domains.png'
fig.savefig(top_domains_png, dpi=300, bbox_inches='tight')
plt.close()

print(f"âœ“ Saved: {top_domains_png.relative_to(workspace_root)}")

âœ“ Saved: reports/data_validation/2016-09_2016-10/news/figures/top_domains.png


## 6. Language Verification

In [12]:
# Language distribution (value_counts ignores rows without a language tag)
language_counts = df['language'].value_counts()

print("Language distribution:")
print(language_counts)

# Case-insensitive match on the language code. Rows with a missing tag
# compare unequal to 'en' and are therefore treated as non-English.
is_english = df['language'].str.lower() == 'en'
english_count = int(is_english.sum())
english_pct = english_count / len(df) * 100 if len(df) else 0.0
print(f"\nEnglish articles: {english_count:,} ({english_pct:.2f}%)")

# Decide on the actual number of non-English rows rather than a 99.9%
# threshold: with a threshold, a handful of foreign-language articles in a
# large corpus would still be reported as "All articles are in English".
non_english = df[~is_english]
if len(non_english) > 0:
    print(f"\nâš  Non-English articles detected:")
    print(non_english[['date', 'title', 'language']].head(10))
else:
    print(f"\nâœ“ All articles are in English")

Language distribution:
language
en    93669
Name: count, dtype: int64

English articles: 93,669 (100.00%)

âœ“ All articles are in English


## 7. Data Completeness Check

In [13]:
# Null counts and percentages for every column; only columns with at least
# one missing value are printed.
print("Data completeness:")
print("\nMissing values per column:")
missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(2)
completeness_df = pd.DataFrame({'missing_count': missing, 'missing_pct': missing_pct})
has_gaps = completeness_df['missing_count'] > 0
print(completeness_df.loc[has_gaps])

# Fields every article is required to carry.
required_fields = ['date', 'title', 'plain_text', 'requested_url']
print(f"\nRequired fields check:")
for field in required_fields:
    null_count = df[field].isnull().sum()
    status_line = (
        f"  âœ“ {field}: complete"
        if null_count == 0
        else f"  âœ— {field}: {null_count} missing ({null_count/len(df)*100:.2f}%)"
    )
    print(status_line)

Data completeness:

Missing values per column:
           missing_count  missing_pct
author             17422        18.60
sitename              42         0.04
image_url          13034        13.91

Required fields check:
  âœ“ date: complete
  âœ“ title: complete
  âœ“ plain_text: complete
  âœ“ requested_url: complete


## 8. Sample Articles

In [14]:
# Reproducible random sample of up to five articles for eyeballing.
sample = df.sample(n=min(5, len(df)), random_state=42)
separator = '=' * 80

print("Sample articles:\n")
for _, article in sample.iterrows():
    preview_lines = [
        separator,
        f"Date: {article['date']}",
        f"Title: {article['title']}",
        f"Domain: {article['domain']}",
        f"URL: {article['requested_url'][:100]}...",
        f"Text length: {article['text_length']} chars",
        f"Text preview: {article['plain_text'][:200]}...",
    ]
    print("\n".join(preview_lines))
    print()

Sample articles:

Date: 2016-10-18
Title: Mattapoisett Bike Path
Domain: wanderer.com
URL: http://www.wanderer.com/happenings/mattapoisett-bike-path-5/...
Text length: 845 chars
Text preview: The Mattapoisett Bike Path project has reached and passed an important milestone in the project process. The environmental permitting process began in early September with a submission to MEPA, the Ma...

Date: 2016-09-12
Title: Noahide Lecture: The purpose behind the test
Domain: israelnationalnews.com
URL: http://www.israelnationalnews.com/News/News.aspx/221500...
Text length: 1065 chars
Text preview: Noahide Lecture: The purpose behind the test What we can learn from Jacob's travel and his tests in Haran? Contact Editor Rod Bryant, 09/12/16 14:20 Rod BryantBy PR Discover the beauty of lifeâ€™s test:...

Date: 2016-10-28
Title: Air Canada flight bound for Montreal lands in London following medical emergency
Domain: theglobeandmail.com
URL: http://www.theglobeandmail.com/news/national/air-canada-

## 9. Generate Summary Statistics

In [15]:
def _describe_lengths(values, with_std=False):
    """Summarise a numeric Series as mean/median/min/max (optionally std).

    Mean and std are rounded to one decimal; median, min and max are cast to
    plain ints so the result can be dumped to JSON.
    """
    stats = {
        'mean': round(values.mean(), 1),
        'median': int(values.median()),
        'min': int(values.min()),
        'max': int(values.max()),
    }
    if with_std:
        stats['std'] = round(values.std(), 1)
    return stats


# Collect the figures computed in the sections above into one nested dict.
temporal_coverage_stats = {
    'start_date': start_date,
    'end_date': end_date,
    'expected_days': len(expected_dates_str),
    'actual_days': len(actual_dates),
    'coverage_pct': round(coverage_pct, 2),
    'missing_dates': sorted(list(missing_dates)),
}
dataset_size_stats = {
    'total_articles': int(len(df)),
    'articles_per_day': _describe_lengths(daily_counts['count'], with_std=True),
}
text_stats = {
    'article_text_length': _describe_lengths(df['text_length']),
    'title_length': _describe_lengths(df['title_length']),
}
source_stats = {
    'unique_domains': int(df['domain'].nunique()),
    'top_10_domains': top_domains.head(10).to_dict(),
}
language_stats = {
    'distribution': language_counts.to_dict(),
    'english_pct': round(english_pct, 2),
}

summary_stats = {
    'temporal_coverage': temporal_coverage_stats,
    'dataset_size': dataset_size_stats,
    'text_statistics': text_stats,
    'sources': source_stats,
    'language': language_stats,
    'data_quality': {'completeness': completeness_df.to_dict()},
}

# Write the machine-readable summary next to the report.
stats_file = report_dir / 'summary_statistics.json'
with stats_file.open('w') as handle:
    json.dump(summary_stats, handle, indent=2)

print(f"âœ“ Saved: {stats_file.relative_to(workspace_root)}")

âœ“ Saved: reports/data_validation/2016-09_2016-10/news/summary_statistics.json


## 10. Generate Markdown Report

In [16]:
# Generate comprehensive markdown report.
# `report_lines` is the list of Markdown lines; later cells append to it and
# the final cell joins it with newlines and writes the file.
report_lines = [
    "# News Dataset Quality Control Report",
    "",
    # Bulleted, because consecutive plain lines are merged into a single
    # paragraph by Markdown renderers.
    "- **Dataset Period:** September - October 2016",
    f"- **Report Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "- **Source:** CC-NEWS via HuggingFace",
    "",
    "---",
    "",
    "## 1. Executive Summary",
    "",
    f"- **Total Articles:** {len(df):,}",
    f"- **Date Range:** {df['date'].min()} to {df['date'].max()}",
    f"- **Temporal Coverage:** {len(actual_dates)}/{len(expected_dates_str)} days ({coverage_pct:.1f}%)",
    f"- **Unique Domains:** {df['domain'].nunique():,}",
    f"- **Language:** {english_pct:.2f}% English",
    "",
    "---",
    "",
    "## 2. Temporal Coverage",
    "",
    # Use the configured window rather than a hard-coded copy of it, so the
    # report cannot drift from the range the coverage was computed against.
    f"- **Expected Days:** {len(expected_dates_str)} ({start_date} to {end_date})",
    f"- **Actual Days:** {len(actual_dates)}",
    f"- **Coverage:** {coverage_pct:.1f}%",
]

# Either list the missing days or state that coverage is complete.
coverage_note = (
    f"- **Missing Dates ({len(missing_dates)}):** {', '.join(sorted(missing_dates))}"
    if missing_dates
    else "- **Status:** âœ“ Complete coverage (no missing dates)"
)
report_lines.append(coverage_note)

# Short aliases for the series quoted repeatedly in the sections below.
day_counts = daily_counts['count']
text_lengths = df['text_length']
title_lengths = df['title_length']

# Sections 2 (daily counts), 3 (text lengths) and the head of section 4 (domains).
report_lines += [
    "",
    "### Daily Article Counts",
    "",
    f"- **Mean:** {day_counts.mean():.1f} articles/day",
    f"- **Median:** {day_counts.median():.0f} articles/day",
    f"- **Range:** {day_counts.min()} - {day_counts.max()} articles/day",
    f"- **Std Dev:** {day_counts.std():.1f}",
    "",
    "![Daily Article Counts](figures/daily_article_counts.png)",
    "",
    "ðŸ“Š **See detailed table:** [daily_article_counts.csv](tables/daily_article_counts.csv)",
    "",
    "---",
    "",
    "## 3. Text Length Analysis",
    "",
    "### Article Text",
    "",
    f"- **Mean Length:** {text_lengths.mean():.0f} characters",
    f"- **Median Length:** {text_lengths.median():.0f} characters",
    f"- **Range:** {text_lengths.min()} - {text_lengths.max():,} characters",
    "",
    "### Article Titles",
    "",
    f"- **Mean Length:** {title_lengths.mean():.0f} characters",
    f"- **Median Length:** {title_lengths.median():.0f} characters",
    f"- **Range:** {title_lengths.min()} - {title_lengths.max()} characters",
    "",
    "![Text Length Distribution](figures/text_length_distribution.png)",
    "",
    "---",
    "",
    "## 4. Domain/Source Analysis",
    "",
    f"- **Unique Domains:** {df['domain'].nunique():,}",
    f"- **Top Domain:** {top_domains.index[0]} ({top_domains.values[0]:,} articles)",
    "",
    "### Top 10 Domains",
    "",
]

# Numbered list of the ten most frequent domains (rank starts at 1).
report_lines.extend(
    f"{rank}. **{domain_name}**: {article_count:,} articles"
    for rank, (domain_name, article_count) in enumerate(top_domains.head(10).items(), 1)
)

# Close section 4 and open section 5 with the English article count.
english_total = (df['language'].str.lower() == 'en').sum()
report_lines += [
    "",
    "![Top Domains](figures/top_domains.png)",
    "",
    "ðŸ“Š **See full list:** [top_30_domains.csv](tables/top_30_domains.csv)",
    "",
    "---",
    "",
    "## 5. Language Distribution",
    "",
    f"- **English:** {english_total:,} articles ({english_pct:.2f}%)",
]

# Report "all English" only when every row really is tagged 'en'. A 99.9%
# threshold would print that claim while non-English rows are still present.
english_rows = int((df['language'].str.lower() == 'en').sum())
if english_rows == len(df):
    report_lines.append("- **Status:** âœ“ All articles are in English")
else:
    for lang, count in language_counts.items():
        # str() guards against non-string language codes.
        if str(lang).lower() != 'en':
            report_lines.append(f"- **{lang}:** {count:,} articles ({count/len(df)*100:.2f}%)")
    # Rows with no language tag are absent from value_counts(); list them
    # explicitly so the per-language counts add up to the total.
    untagged = int(df['language'].isnull().sum())
    if untagged:
        report_lines.append(f"- **(no language tag):** {untagged:,} articles ({untagged/len(df)*100:.2f}%)")

# Heading for section 6; the per-field lines are appended next.
report_lines += [
    "",
    "---",
    "",
    "## 6. Data Completeness",
    "",
    "### Required Fields",
    "",
]

# One bullet per required field: a tick when nothing is missing, otherwise
# the number and share of missing values.
for field in required_fields:
    null_count = df[field].isnull().sum()
    field_line = (
        f"- âœ“ **{field}**: Complete (0 missing)"
        if null_count == 0
        else f"- âœ— **{field}**: {null_count:,} missing ({null_count/len(df)*100:.2f}%)"
    )
    report_lines.append(field_line)

# Derive the pass/fail marks from the data rather than always printing a
# tick: the summary must not claim complete coverage or complete required
# fields when the checks above found gaps.
coverage_mark = "âœ“" if not missing_dates else "âœ—"
required_nulls = int(df[required_fields].isnull().sum().sum())
if required_nulls == 0:
    completeness_line = "- âœ“ **Data Completeness:** All required fields present"
else:
    completeness_line = f"- âœ— **Data Completeness:** {required_nulls:,} missing values across required fields"

report_lines.extend([
    "",
    "---",
    "",
    "## 7. Summary",
    "",
    "### Dataset Quality Assessment",
    "",
    # Bulleted, because consecutive plain lines are merged into a single
    # paragraph by Markdown renderers.
    f"- {coverage_mark} **Temporal Coverage:** {coverage_pct:.1f}% ({len(actual_dates)}/{len(expected_dates_str)} days)",
    f"- âœ“ **Dataset Size:** {len(df):,} articles",
    f"- âœ“ **Language Quality:** {english_pct:.2f}% English",
    f"- âœ“ **Source Diversity:** {df['domain'].nunique():,} unique domains",
    completeness_line,
    "",
    "### Files Generated",
    "",
    "- `summary_statistics.json` - Complete statistics in JSON format",
    "- `figures/daily_article_counts.png` - Daily counts visualization",
    "- `figures/text_length_distribution.png` - Text length analysis",
    "- `figures/top_domains.png` - Top domains visualization",
    "- `tables/daily_article_counts.csv` - Daily counts data",
    "- `tables/top_30_domains.csv` - Top 30 domains data",
    "",
    "---",
    "",
    f"**Report completed:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
])

# Write report.
# The report contains non-ASCII symbols (status marks, emoji), so the
# encoding is pinned to UTF-8. Without it, open() uses the platform locale
# and the write can fail or produce garbled text on non-UTF-8 systems.
report_file = report_dir / 'qc_summary.md'
with open(report_file, 'w', encoding='utf-8') as f:
    f.write('\n'.join(report_lines))

print(f"\n{'='*80}")
print(f"âœ“ QC Report generated: {report_file.relative_to(workspace_root)}")
print(f"{'='*80}")


âœ“ QC Report generated: reports/data_validation/2016-09_2016-10/news/qc_summary.md
