In [None]:
"""Initial data exploration and quality assessment"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import logging

# Constants
# Location of the CSV export that this script loads with pandas.
FILE_PATH = "data/processed/articles.csv"

# Free-text columns whose character lengths are summarised in the
# text-statistics section of this script.
TEXT_COLUMNS = [
    'original_title',
    'original_intro',
    'summary',
    'body',
]

# Date/time column names. Not referenced in this part of the script;
# presumably used for date parsing elsewhere -- TODO confirm.
DATE_COLUMNS = [
    'first_seen_date',
    'original_time',
    'scraped_date',
    'download_timestamp',
]

# Load Data
print("=== DATA LOADING AND INITIAL INSPECTION ===")
# The export is semicolon-delimited; quoting=1 is the numeric value of
# csv.QUOTE_ALL from the standard library.
df = pd.read_csv(FILE_PATH, sep=';', quoting=1)
print(f"Loaded {len(df)} articles")
print("\nDataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

# Data Quality Analysis
# For every column, report its dtype, number of distinct values, number of
# missing values and (for text columns only) number of empty strings.
print("\n=== DATA QUALITY ANALYSIS ===")
for col in df.columns:
    column = df[col]  # look the column up once instead of on every use
    unique_count = column.nunique()
    missing = column.isnull().sum()
    # Empty strings can only occur in text columns. Text may be stored either
    # as generic ``object`` dtype or as one of pandas' dedicated string
    # dtypes; a plain ``dtype == 'object'`` comparison misses the latter and
    # would silently report 0 for them.
    is_text = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
    )
    empty_str = (column == '').sum() if is_text else 0

    print(f"\nColumn: {col}")
    print(f"Data type: {column.dtype}")
    print(f"Unique values: {unique_count}")
    print(f"NULL values: {missing}")
    print(f"Empty strings: {empty_str}")

# Basic Text Statistics
# Print descriptive statistics of the character length of every configured
# text column that is actually present in the loaded data.
print("\n=== TEXT STATISTICS ===")
for col in TEXT_COLUMNS:
    if col not in df.columns:
        continue
    # The ``.str`` accessor raises AttributeError on columns that were not
    # inferred as strings (e.g. an entirely missing column is read as
    # float64, an all-digit column as int64). Measuring the non-null values
    # cast to ``str`` keeps the report running on exactly the low-quality
    # data it is meant to surface; missing values stay excluded from the
    # statistics, as they were before.
    lengths = df[col].dropna().astype(str).str.len()
    print(f"\n{col.upper()}:")
    print(lengths.describe())