In [3]:
import sys
from pathlib import Path
from datetime import datetime
import json
from collections import Counter
import pandas as pd
import xxhash
from tqdm import tqdm

# The kernel's current working directory is used as the workspace root; its
# src/ folder is pushed to the front of sys.path so the project package
# imported on the next statement resolves from this checkout.
workspace_root = Path.cwd()
sys.path.insert(0, str(workspace_root.joinpath('src')))

from thesis_pipeline.io.config import load_all_configs

# Log when and where this run started.
run_started_at = datetime.now().isoformat()
print(f"News filtering started: {run_started_at}")
print(f"Workspace: {workspace_root}")

News filtering started: 2025-12-18T23:13:21.015777
Workspace: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment


## 1. Load Configuration and Setup

In [4]:
# Load every config found under <workspace>/configs and keep handles to the
# 'news' and 'global' sections.
configs = load_all_configs(workspace_root / 'configs')
news_cfg = configs['news']
global_cfg = configs['global']

# Directory holding the daily news parquet files this notebook works on
silver_dir = workspace_root / 'data/01_silver/news'

# Human-readable list of the checks announced for this run
filter_banner = (
    "  - Min text length: 200 characters",
    "  - Max text length: 100,000 characters",
    "  - Must have title",
    "  - URL deduplication",
    "  - Content deduplication (exact)",
)

print(f"Silver layer: {silver_dir.relative_to(workspace_root)}")
print("\nQuality filters to apply:")
for banner_line in filter_banner:
    print(banner_line)

Silver layer: data/01_silver/news

Quality filters to apply:
  - Min text length: 200 characters
  - Max text length: 100,000 characters
  - Must have title
  - URL deduplication
  - Content deduplication (exact)


## 2. Load Existing Data

In [5]:
# Collect the daily files for 2016, sorted by file name.
silver_files = sorted(silver_dir.glob('2016-*.parquet'))

# Fail early with an actionable message. Without this guard an empty
# directory shows up as an IndexError on silver_files[0] (or a ValueError
# from pd.concat) that never mentions which directory was searched.
if not silver_files:
    raise FileNotFoundError(
        f"No files matching '2016-*.parquet' found in {silver_dir}"
    )

print(f"Found {len(silver_files)} daily files")
print(f"Date range: {silver_files[0].stem} to {silver_files[-1].stem}")

# Read every daily file, then stack them into one frame with a fresh index.
print("\nLoading all files...")
all_articles = [
    pd.read_parquet(file) for file in tqdm(silver_files, desc="Reading files")
]

df_all = pd.concat(all_articles, ignore_index=True)

print(f"\n✓ Loaded {len(df_all):,} articles")
print(f"  Columns: {df_all.columns.tolist()}")
print(f"  Date range: {df_all['date'].min()} to {df_all['date'].max()}")

Found 61 daily files
Date range: 2016-09-01 to 2016-10-31

Loading all files...


Reading files: 100%|██████████| 61/61 [00:02<00:00, 23.23it/s]


✓ Loaded 93,737 articles
  Columns: ['date', 'requested_url', 'plain_text', 'published_date', 'title', 'tags', 'categories', 'author', 'sitename', 'image_url', 'language', 'language_score', 'responded_url', 'publisher', 'warc_path', 'crawl_date']
  Date range: 2016-09-01 to 2016-10-31





## 3. Initial Data Inspection

In [6]:
print("Data quality snapshot:\n")

# Per-column null counts; only columns that actually have nulls are shown
print("Missing values:")
missing = df_all.isnull().sum()
has_missing = missing > 0
print(missing[has_missing])

# Character counts for body text and title (nulls count as length 0).
# These helper columns are added to df_all itself and are read by the
# text-length filter later in the notebook.
for source_col, length_col in (('plain_text', 'text_length'), ('title', 'title_length')):
    df_all[length_col] = df_all[source_col].fillna('').str.len()

print("\nText length stats:")
print(df_all['text_length'].describe())

# Share of rows with a non-null title
titles_present = df_all['title'].notna().sum()
total_rows = len(df_all)
print(f"\nTitle presence: {titles_present:,} / {total_rows:,} ({100*titles_present/total_rows:.1f}%)")

print("Language distribution:")
language_counts = df_all['language'].value_counts()
print(language_counts.head(10))

Data quality snapshot:

Missing values:
author       17436
sitename        42
image_url    13035
dtype: int64

Text length stats:
count    93737.000000
mean      2699.781623
std       1807.386186
min        300.000000
25%       1276.000000
50%       2344.000000
75%       3698.000000
max       9997.000000
Name: text_length, dtype: float64

Title presence: 93,737 / 93,737 (100.0%)
Language distribution:
language
en    93737
Name: count, dtype: int64


## 4. Apply Quality Filters

In [7]:
print("Applying quality filters...\n")

initial_count = len(df_all)


def _report_stage(stage_name, frame):
    """Print how many rows remain after a stage, absolute and as a share of the initial count."""
    remaining = len(frame)
    print(f"After {stage_name}: {remaining:,} ({remaining/initial_count*100:.1f}%)")


# Filter 1: title must be present and not the empty string.
# .copy() detaches the working frame from df_all.
has_title = df_all['title'].notna() & (df_all['title'].str.len() > 0)
df_filtered = df_all[has_title].copy()
_report_stage("title filter", df_filtered)

# Filter 2: text length between 200 and 100,000 characters, both inclusive
lengths = df_filtered['text_length']
length_ok = (lengths >= 200) & (lengths <= 100000)
df_filtered = df_filtered[length_ok]
_report_stage("text length filter", df_filtered)

# Filter 3: the requested URL must not be null
has_url = df_filtered['requested_url'].notna()
df_filtered = df_filtered[has_url]
_report_stage("URL filter", df_filtered)

# Filter 4: language code must be 'en' (compared case-insensitively)
is_english = df_filtered['language'].str.lower() == 'en'
df_filtered = df_filtered[is_english]
_report_stage("language filter", df_filtered)

# Overall summary
remaining_count = len(df_filtered)
removed_count = initial_count - remaining_count
rule = '=' * 80
print(f"\n{rule}")
print(f"Articles remaining: {remaining_count:,} / {initial_count:,}")
print(f"Removed: {removed_count:,} ({removed_count/initial_count*100:.1f}%)")
print(rule)

Applying quality filters...

After title filter: 93,737 (100.0%)
After text length filter: 93,737 (100.0%)
After URL filter: 93,737 (100.0%)
After language filter: 93,737 (100.0%)

Articles remaining: 93,737 / 93,737
Removed: 0 (0.0%)


## 5. URL Deduplication

In [8]:
print("Deduplicating by URL...\n")

# Row count going into this stage; the metadata cell reads it later
before_dedup = len(df_filtered)

# Number of rows whose requested_url already appeared in an earlier row
repeated_url_mask = df_filtered['requested_url'].duplicated()
duplicate_urls = repeated_url_mask.sum()
print(f"Duplicate URLs found: {duplicate_urls:,}")

# Keep the first row per URL.
# NOTE(review): the original comment says the first row "already has the
# smallest hash from sampling" - that ordering is produced upstream and is
# not visible in this notebook; confirm before relying on it.
df_filtered = df_filtered.drop_duplicates(subset='requested_url', keep='first')

rows_after_url_dedup = len(df_filtered)
print(f"After URL deduplication: {rows_after_url_dedup:,}")
print(f"Removed: {before_dedup - rows_after_url_dedup:,}")

Deduplicating by URL...

Duplicate URLs found: 0
After URL deduplication: 93,737
Removed: 0


## 6. Content Deduplication (Exact)

In [9]:
print("Deduplicating by content hash...\n")

# Row count going into this stage; the metadata cell reads it later
before_content_dedup = len(df_filtered)

# Hash the COMPLETE article text (nulls are hashed as the empty string).
# Hashing only a leading slice of the text would also drop articles that
# merely share an opening (e.g. identical boilerplate) but differ later,
# which is not the exact-match deduplication this step is labelled and
# recorded as. The hashes live in a standalone Series, so no temporary
# column has to be added to and later removed from df_filtered.
content_hash = df_filtered['plain_text'].fillna('').apply(
    lambda text: xxhash.xxh64(text.encode('utf-8')).hexdigest()
)

# True for every row whose full text already appeared in an earlier row
is_repeat = content_hash.duplicated(keep='first')
duplicate_content = is_repeat.sum()
print(f"Duplicate content found: {duplicate_content:,}")

# Keep first occurrence of each distinct text
df_filtered = df_filtered[~is_repeat]

print(f"After content deduplication: {len(df_filtered):,}")
print(f"Removed: {before_content_dedup - len(df_filtered):,}")

Deduplicating by content hash...

Duplicate content found: 68
After content deduplication: 93,669
Removed: 68


## 7. Final Statistics by Date

In [10]:
# One row per date with the number of surviving articles, ordered by date
daily_counts = (
    df_filtered
    .groupby('date')
    .size()
    .reset_index(name='count')
    .sort_values('date')
)

print("Articles per day after filtering and deduplication:\n")
print(daily_counts.to_string(index=False))

# Summary figures over the per-day counts
per_day = daily_counts['count']
rule = '=' * 80
print(f"\n{rule}")
print(f"Total articles: {len(df_filtered):,}")
print(f"Average per day: {per_day.mean():.0f}")
print(f"Min per day: {per_day.min()}")
print(f"Max per day: {per_day.max()}")
print(f"Days with data: {len(daily_counts)}")
print(rule)

Articles per day after filtering and deduplication:

      date  count
2016-09-01    124
2016-09-02    104
2016-09-03     53
2016-09-04     59
2016-09-05     58
2016-09-06    106
2016-09-07     99
2016-09-08    141
2016-09-09    111
2016-09-10     50
2016-09-11    354
2016-09-12    288
2016-09-13    161
2016-09-14    108
2016-09-15    151
2016-09-16    140
2016-09-17     67
2016-09-18     72
2016-09-19    125
2016-09-20    147
2016-09-21    175
2016-09-22    134
2016-09-23    118
2016-09-24     50
2016-09-25     66
2016-09-26    124
2016-09-27    141
2016-09-28    148
2016-09-29    132
2016-09-30    193
2016-10-01     71
2016-10-02     77
2016-10-03    171
2016-10-04    152
2016-10-05    190
2016-10-06    192
2016-10-07    160
2016-10-08     81
2016-10-09     85
2016-10-10    135
2016-10-11    342
2016-10-12    230
2016-10-13    243
2016-10-14    225
2016-10-15     75
2016-10-16    150
2016-10-17   3835
2016-10-18   7098
2016-10-19   7449
2016-10-20   7481
2016-10-21   6898
2016-10-22 

## 8. Write Cleaned Data Back to Silver Layer

In [11]:
print("Writing cleaned data back to silver layer...\n")

# Drop the helper columns added during inspection. errors='ignore' skips
# any that are already gone, so re-running this cell is safe.
df_filtered = df_filtered.drop(columns=['text_length', 'title_length'], errors='ignore')

files_written = []

# One parquet file per date, written over the file of the same name in the
# silver directory.
for date, group in tqdm(df_filtered.groupby('date'), desc="Writing files"):
    output_file = silver_dir / f"{date}.parquet"
    group.to_parquet(output_file, compression='snappy', index=False)
    files_written.append(output_file)

print(f"\n✓ Files written: {len(files_written)}")
print(f"✓ Output directory: {silver_dir.relative_to(workspace_root)}")

# Because the inputs are overwritten in place, an input file that received
# no rows in the loop above still holds its original, unfiltered content.
# Report such files instead of leaving the mismatch silent; nothing is
# deleted here, the decision is left to the person running the notebook.
stale_files = sorted(set(silver_files) - set(files_written))
if stale_files:
    print(f"\n⚠ {len(stale_files)} input file(s) were not rewritten and still contain unfiltered data:")
    for stale_file in stale_files:
        print(f"  - {stale_file.name}")

Writing cleaned data back to silver layer...



Writing files: 100%|██████████| 61/61 [00:01<00:00, 57.85it/s] 


✓ Files written: 61
✓ Output directory: data/01_silver/news





## 9. Verify Sample File

In [12]:
# Read back the file in the middle of the written list and show its shape,
# columns and first two rows.
if not files_written:
    print("⚠ No files written")
else:
    rule = "=" * 80
    middle = len(files_written) // 2
    sample_file = files_written[middle]

    print(f"Sample file: {sample_file.name}")
    print(rule)

    df_sample = pd.read_parquet(sample_file)

    print(f"Shape: {df_sample.shape}")
    print(f"Columns: {df_sample.columns.tolist()}")
    print("\nFirst 2 articles:")
    for row_label, row in df_sample.iloc[:2].iterrows():
        # Title and URL are cut to 80 characters for display
        title_preview = row['title'][:80]
        url_preview = row['requested_url'][:80]
        print(f"\n  Date: {row['date']}")
        print(f"  Title: {title_preview}...")
        print(f"  URL: {url_preview}...")
        print(f"  Text length: {len(row['plain_text'])} chars")

    print("\n" + rule)

Sample file: 2016-10-01.parquet
Shape: (71, 16)
Columns: ['date', 'requested_url', 'plain_text', 'published_date', 'title', 'tags', 'categories', 'author', 'sitename', 'image_url', 'language', 'language_score', 'responded_url', 'publisher', 'warc_path', 'crawl_date']

First 2 articles:

  Date: 2016-10-01
  Title: 10 Propositions for Research-Creation...
  URL: http://quod.lib.umich.edu/j/jep/3336451.0019.206?view=text;rgn=main...
  Text length: 696 chars

  Date: 2016-10-01
  Title: Judge blocks pro-abortion law in Illinois...
  URL: https://world.wng.org/2016/12/judge_blocks_pro_abortion_law_in_illinois...
  Text length: 3672 chars



## 10. Save Processing Metadata

In [13]:
# Figures recorded in the metadata file.
articles_final = len(df_filtered)
articles_removed = initial_count - articles_final

# Rows removed by URL deduplication alone: the row count before URL dedup
# minus the row count going into content dedup. Subtracting the current
# len(df_filtered) instead would also count the content duplicates, because
# df_filtered has been reduced again since the URL step.
url_duplicates_removed = before_dedup - before_content_dedup

# Summary of this filtering run, stored next to the data files
metadata = {
    'timestamp': datetime.now().isoformat(),
    'processing_stage': 'filtering_and_deduplication',
    'input': {
        'articles_initial': int(initial_count)
    },
    'filters_applied': {
        'must_have_title': True,
        'min_text_length': 200,
        'max_text_length': 100000,
        'must_have_url': True,
        'language': 'en'
    },
    'deduplication': {
        'url_duplicates_removed': int(url_duplicates_removed),
        'content_duplicates_removed': int(duplicate_content),
        'method': 'exact_match'
    },
    'output': {
        'articles_final': int(articles_final),
        'articles_removed': int(articles_removed),
        'removal_rate': round(articles_removed / initial_count * 100, 2),
        'files_written': len(files_written),
        'avg_per_day': round(daily_counts['count'].mean(), 1)
    },
    'daily_counts': daily_counts.to_dict('records')
}

metadata_file = silver_dir / 'filtering_metadata.json'
# Explicit encoding so the file does not depend on the platform default
with open(metadata_file, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print(f"✓ Metadata saved: {metadata_file.relative_to(workspace_root)}")

✓ Metadata saved: data/01_silver/news/filtering_metadata.json
