In [5]:
"""Initial data exploration and quality assessment"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import logging

# Constants
# Columns treated as free text by the text-statistics loop further down; that
# loop skips any name that is not present in df.columns.
# NOTE(review): the df.info() output recorded for this cell lists `title` and
# `subtitle` but no `original_title` / `original_intro`, so those two entries
# would be skipped silently -- confirm the intended column names.
TEXT_COLUMNS = ['original_title', 'original_intro', 'summary', 'body']
# Columns presumably holding dates/timestamps; not referenced by the code
# visible in this cell.
# NOTE(review): `original_time` and `scraped_date` do not appear in the
# recorded df.info() output (which shows `published_date` and
# `modified_date`) -- confirm against the current schema.
DATE_COLUMNS = ['first_seen_date', 'original_time', 'scraped_date', 'download_timestamp']
# Relative path of the CSV that the loading step reads.
FILE_PATH = "../data/processed/articles.csv"

# Load Data
# Read the semicolon-separated articles file into `df` and print a summary.
print("=== DATA LOADING AND INITIAL INSPECTION ===")
# quoting=1 is the numeric value of csv.QUOTE_ALL.
df = pd.read_csv(FILE_PATH, sep=';', quoting=1)
print(f"Loaded {len(df)} articles")
print("\nDataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
df.info()

# Data Quality Analysis
# Per-column report: dtype, number of distinct values, NULL count and
# empty-string count.
print("\n=== DATA QUALITY ANALYSIS ===")
for col in df.columns:
    series = df[col]

    # Empty strings are only counted for object-dtype columns; every other
    # dtype reports zero.
    if series.dtype == 'object':
        empty_str = (series == '').sum()
    else:
        empty_str = 0
    missing = series.isna().sum()
    unique_count = series.nunique()

    print(f"\nColumn: {col}")
    print(f"Data type: {series.dtype}")
    print(f"Unique values: {unique_count}")
    print(f"NULL values: {missing}")
    print(f"Empty strings: {empty_str}")

# Basic Text Statistics
# Print describe() of the per-row string length for each configured text
# column that exists in the frame.
print("\n=== TEXT STATISTICS ===")
# Names listed in TEXT_COLUMNS but absent from df are dropped up front.
available_text_columns = [name for name in TEXT_COLUMNS if name in df.columns]
for col in available_text_columns:
    lengths = df[col].str.len()
    print(f"\n{col.upper()}:")
    print(lengths.describe())
        

=== DATA LOADING AND INITIAL INSPECTION ===
Loaded 334221 articles

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334221 entries, 0 to 334220
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   url                 334221 non-null  object 
 1   first_seen_date     334221 non-null  object 
 2   title               334076 non-null  object 
 3   subtitle            182548 non-null  object 
 4   summary             182970 non-null  object 
 5   body                324105 non-null  object 
 6   category            334221 non-null  object 
 7   published_date      334221 non-null  object 
 8   modified_date       334221 non-null  object 
 9   tags                79109 non-null   object 
 10  image_url           334221 non-null  object 
 11  author              0 non-null       float64
 12  download_timestamp  334221 non-null  object 
 13  source              334221 non-null  object 
dtype