In [6]:
import importlib.util
from pathlib import Path

# Get the path to the module
module_path = Path().absolute().parent / "03_uttils" / "02_logging_config.py"
module_name = "logging_config"

# Load the module
spec = importlib.util.spec_from_file_location(module_name, module_path)
logging_config = importlib.util.module_from_spec(spec)
spec.loader.exec_module(logging_config)

# Now you can use it
logger = logging_config.configure_logging()

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\Morobang\\Documents\\GitHub\\English Premier League & Championship Analysis\\03_uttils\\02_logging_config.py'

In [None]:

logger = get_logger("data_validation")

# -----------------------------
# Path to cleaned data
# -----------------------------
base_dir = "03_utils/cleaned_data"

# Find latest year/month/day folder
latest_year = sorted(os.listdir(base_dir))[-1]
latest_month = sorted(os.listdir(os.path.join(base_dir, latest_year)))[-1]
latest_day = sorted(os.listdir(os.path.join(base_dir, latest_year, latest_month)))[-1]

latest_folder = os.path.join(base_dir, latest_year, latest_month, latest_day)
logger.info(f"Looking in folder: {latest_folder}")

# Find all CSV files in that folder
csv_files = [f for f in os.listdir(latest_folder) if f.endswith(".csv")]

if not csv_files:
    logger.error("No CSV files found in the latest folder!")
else:
    # Pick the CSV with the latest modified time
    latest_csv = max(
        [os.path.join(latest_folder, f) for f in csv_files],
        key=os.path.getmtime
    )
    df = pd.read_csv(latest_csv, parse_dates=['date'])
    logger.info(f"Loaded latest cleaned data: {latest_csv} with {len(df)} rows")
# 3. Basic Data Info
logger.info(f"Columns: {df.columns.tolist()}")
logger.info(f"Data types:\n{df.dtypes}")
logger.info(f"Missing values:\n{df.isnull().sum()}")

# 4. Validation Checks

# 4.1 Score consistency: home goals > away goals if ft_result == H
score_inconsistency = df[(df['ft_result'] == 'H') & (df['fth_goals'] <= df['fta_goals'])]
if len(score_inconsistency) > 0:
    logger.warning(f"Found {len(score_inconsistency)} matches with inconsistent scores (H result)")
else:
    logger.info("No inconsistent home-win scores found")

# 4.2 Non-negative stats
numeric_cols = ['fth_goals', 'fta_goals', 'h_shots', 'a_shots', 'h_sot', 'a_sot', 
                'h_corners', 'a_corners', 'h_yellow', 'a_yellow', 'h_red', 'a_red']
negative_values = df[numeric_cols][(df[numeric_cols] < 0).any(axis=1)]
if len(negative_values) > 0:
    logger.warning(f"Found {len(negative_values)} rows with negative values in numeric columns")
else:
    logger.info("No negative values in numeric columns")

# 4.3 Date range check
if not df['date'].between('2000-01-01', '2030-12-31').all():
    logger.warning("Some dates are outside the expected range 2000-2030")
else:
    logger.info("All dates are within the expected range")

# 4.4 Result distribution
result_counts = df['ft_result'].value_counts(normalize=True)
logger.info(f"FT result distribution: {result_counts.to_dict()}")

# 4.5 Team representation
low_freq_teams = df['hometeam'].value_counts()[df['hometeam'].value_counts() <= 5]
if len(low_freq_teams) > 0:
    logger.warning(f"Teams with <= 5 matches: {low_freq_teams.to_dict()}")
else:
    logger.info("All teams have sufficient match representation")

# 5. Summary Report
report = {
    "total_rows": len(df),
    "total_columns": len(df.columns),
    "missing_values": df.isnull().sum().sum(),
    "numeric_cols_check": "PASS" if (df[numeric_cols] >= 0).all().all() else "FAIL",
    "date_range_check": "PASS" if df['date'].between('2000-01-01', '2030-12-31').all() else "FAIL",
    "score_consistency_check": "PASS" if len(score_inconsistency) == 0 else "FAIL"
}
logger.info(f"Validation Summary: {report}")

# 6. Save validation report
report_path = "logs/data_validation_report.csv"
pd.DataFrame([report]).to_csv(report_path, index=False)
logger.info(f"Validation report saved: {report_path}")



SyntaxError: invalid decimal literal (1804932208.py, line 6)