**DATA INGESTION**

1. Go through the url workbook
2. Ascertain that the number of rows and columns are as expected
3. Implement a customer logger
4. Load the pd.concat dataset back into the environment


In [None]:
import pandas as pd
from pathlib import Path

# ===== Defining the data directories ====#
PROJECT_ROOT = Path().absolute().parent.parent
DATA_DIR = PROJECT_ROOT / "datasets" 
COMMON_DATA_DIR = DATA_DIR / "common_data"

#print(f"üìÅ Project root: {PROJECT_ROOT}")
#print(f"üìÅ Data directory: {DATA_DIR}")
#print(f"üìÅ Common data directory: {COMMON_DATA_DIR}")





In [None]:
# ===== Loading the url datasets ==== #

csv_file = COMMON_DATA_DIR / "english_league_data_url.csv"
try:
    if csv_file.exists():
        print(f"‚úÖ Found file: {csv_file}")
        #proceed to load it
        urls = pd.read_csv(csv_file)
        print(f'üìäFile has been loaded successfully. The url dataset has {urls.shape[0]} rows and {urls.shape[1]} columns')
    else:
        print(f"‚ùå File not found: {csv_file}")
except Exception as e:
    print(f"Operation unsuccessful. Reason : {e}")

In [None]:
# ===== Adding season_id and competition_name to datasets ==== #

if urls is not None:
    print(f"\nüöÄ Starting enhanced data download...")
    print("="*60)

    dataframes = []

        #iterate through each row in the urls datasets
    for index,row in urls.iterrows():
        season_id = row["Season_ID"]
        seasons_url = row["Seasons_url"]
        competition_name = row["Competition_name"]
        print(f"\nüì• Processing: {season_id} - {competition_name}")
        print(f"üîó URL: {seasons_url}")

        try:
            epl_data = pd.read_csv(seasons_url)

            #Add metadata columns before adding
            epl_data["season_id"] = season_id
            epl_data["competition_name"] = competition_name
            
            dataframes.append(epl_data)
            print(f"   ‚úÖ Success: {epl_data.shape[0]} matches loaded")

            #Possible tests -- checking if 380 matches are loaded

            print(f"   üìä Columns: {epl_data.shape[1]} (including metadata)")
        except Exception as e:
            print(f" ‚ùå Error reading data from {seasons_url}: {e}")


üöÄ Starting enhanced data download...

üì• Processing: 2024-2025 - EPL
üîó URL: https://www.football-data.co.uk/mmz4281/2425/E0.csv
   ‚úÖ Success: 380 matches loaded
   üìä Columns: 122 (including metadata)

üì• Processing: 2023-2024 - EPL
üîó URL: https://www.football-data.co.uk/mmz4281/2324/E0.csv
   ‚úÖ Success: 380 matches loaded
   üìä Columns: 108 (including metadata)

üì• Processing: 2022-2023 - EPL
üîó URL: https://www.football-data.co.uk/mmz4281/2223/E0.csv
   ‚úÖ Success: 380 matches loaded
   üìä Columns: 108 (including metadata)

üì• Processing: 2021-2022 - EPL
üîó URL: https://www.football-data.co.uk/mmz4281/2122/E0.csv
   ‚úÖ Success: 380 matches loaded
   üìä Columns: 108 (including metadata)

üì• Processing: 2020-2021 - EPL
üîó URL: https://www.football-data.co.uk/mmz4281/2021/E0.csv
   ‚úÖ Success: 380 matches loaded
   üìä Columns: 108 (including metadata)

üì• Processing: 2019-2020 - EPL
üîó URL: https://www.football-data.co.uk/mmz4281/1920/E0.cs

In [None]:
#Combine with the block above

if dataframes:
    print(f"\nüîÑ Combining all datasets...")
    final_df = pd.concat(dataframes, ignore_index=True)
    print(f"\nüéâ Data successfully ingested and concatenated!")
    print(f"üìä Final dataset shape: {final_df.shape}")

    #Show summary by season and competition
    print(f"\nüìà Summary by Season and Competition:")
    summary = final_df.groupby(['season_id', 'competition_name']).size().reset_index(name='match_count')
    print(summary.to_string(index=False))

    #Deleting the Division Column
    final_df = final_df.drop('Div',axis=1)
    
    # Show sample of the final dataset with metadata
    print(f"\nüìã Sample of final dataset (with metadata):")
    display_columns = ['season_id', 'competition_name', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR']
    available_columns = [col for col in display_columns if col in final_df.columns]
    print(final_df[available_columns].head(10).to_string(index=False))


    #Quick Validation
    print(f"\nüîç Data Validation:")
    print(f"   üìä Total matches: {len(final_df):,}")
    print(f"   üèÜ Unique seasons: {final_df['season_id'].nunique()}")
    print(f"   üèüÔ∏è Unique competitions: {final_df['competition_name'].nunique()}")


    #Save the final dataset
    output_file = DATA_DIR / "ingested" / "combined_football_data.csv"
    output_file.parent.mkdir(exist_ok=True)
    final_df.to_csv(output_file,index=False)
    print(f"\nüíæ Enhanced dataset saved to: {output_file}")

else:
    print(f"\n‚ùå No dataframes were successfully loaded.")






In [None]:
#==========REFERENCE==========#



# %%
from pathlib import Path
import os
import pandas as pd
import numpy as np

print("‚úÖ Imports completed!")


PROJECT_ROOT = Path().absolute().parent.parent
DATA_DIR = PROJECT_ROOT / "datasets" 
COMMON_DATA_DIR = DATA_DIR / "common_data"

print(f"üìÅ Project root: {PROJECT_ROOT}")
print(f"üìÅ Data directory: {DATA_DIR}")
print(f"üìÅ Common data directory: {COMMON_DATA_DIR}")


csv_file = COMMON_DATA_DIR / "english_league_data_url.csv"

# Check if file exists
if csv_file.exists():
    print(f"‚úÖ Found file: {csv_file}")
    
    # Load the CSV file
    urls = pd.read_csv(csv_file)
    print(f"üìä File loaded successfully! Shape: {urls.shape}")
else:
    print(f"‚ùå File not found: {csv_file}")
    # Let's check what files are actually in that directory
    print(f"üìÅ Files in {COMMON_DATA_DIR}:")
    if COMMON_DATA_DIR.exists():
        for file in COMMON_DATA_DIR.iterdir():
            print(f"   ‚Ä¢ {file.name}")
    else:
        print("   Directory doesn't exist!")


print("üìä BASIC DATA OVERVIEW")
print("=" * 50)

if 'urls' in locals():
    # Show the shape
    print(f"üìê Dataset shape: {urls.shape[0]} rows √ó {urls.shape[1]} columns")
    
    # Show the first few rows
    print(f"\nüìã First 5 rows:")
    print(urls.head())
    
    # Show column names
    print(f"\nüè∑Ô∏è Column names:")
    for i, col in enumerate(urls.columns, 1):
        print(f"   {i}. {col}")


print("üîç COLUMN INFORMATION")
print("=" * 50)

if 'urls' in locals():
    # Data types
    print("üìä Data types:")
    for col, dtype in urls.dtypes.items():
        print(f"   ‚Ä¢ {col}: {dtype}")
    
    # Basic info
    print(f"\nüìà Basic statistics:")
    print(f"   ‚Ä¢ Total columns: {len(urls.columns)}")
    print(f"   ‚Ä¢ Total rows: {len(urls)}")
    print(f"   ‚Ä¢ Memory usage: {urls.memory_usage(deep=True).sum() / 1024:.2f} KB")

print("üìã COLUMN CONTENT EXPLORATION") 
print("=" * 50)

if 'urls' in locals():
    for col in urls.columns:
        print(f"\nüè∑Ô∏è Column: '{col}'")
        print(f"   üìä Data type: {urls[col].dtype}")
        print(f"   üî¢ Non-null count: {urls[col].count()}/{len(urls)}")
        print(f"   üîó Unique values: {urls[col].nunique()}")
        
        # Show sample values
        non_null_values = urls[col].dropna()
        if len(non_null_values) > 0:
            print(f"   üìÑ Sample values:")
            # Show up to 5 unique sample values
            sample_values = non_null_values.unique()[:5]
            for i, value in enumerate(sample_values, 1):
                # Truncate long values
                value_str = str(value)
                if len(value_str) > 60:
                    value_str = value_str[:60] + "..."
                print(f"      {i}. {value_str}")
            
            if len(non_null_values.unique()) > 5:
                print(f"      ... and {len(non_null_values.unique()) - 5} more unique values")
        
        print("-" * 30)

print("‚ùì MISSING VALUES CHECK")
print("=" * 50)

if 'urls' in locals():
    missing_values = urls.isnull().sum()
    total_rows = len(urls)
    
    print(f"üìä Missing values per column:")
    for col in urls.columns:
        missing_count = missing_values[col]
        missing_pct = (missing_count / total_rows) * 100
        
        if missing_count > 0:
            print(f"   ‚ùå {col}: {missing_count} ({missing_pct:.1f}%)")
        else:
            print(f"   ‚úÖ {col}: 0 (0.0%)")
print("üîó URL-LIKE DATA DETECTION")
print("=" * 50)

if 'urls' in locals():
    url_columns = []
    
    for col in urls.columns:
        # Check if column contains URL-like strings
        sample_values = urls[col].dropna().astype(str)
        
        if len(sample_values) > 0:
            # Look for http/https or www patterns
            url_like_count = sample_values.str.contains(r'http|www|\.com|\.co\.uk|\.csv', case=False, na=False).sum()
            url_like_pct = (url_like_count / len(sample_values)) * 100
            
            print(f"üè∑Ô∏è {col}:")
            print(f"   üîó URL-like entries: {url_like_count}/{len(sample_values)} ({url_like_pct:.1f}%)")
            
            if url_like_pct > 50:  # If more than 50% look like URLs
                url_columns.append(col)
                print(f"   ‚úÖ This looks like a URL column!")
                
                # Show some examples
                url_examples = sample_values[sample_values.str.contains(r'http|www|\.com|\.co\.uk|\.csv', case=False, na=False)].head(3)
                print(f"   üìÑ Examples:")
                for i, url in enumerate(url_examples, 1):
                    print(f"      {i}. {url}")
            
            print()
    
    if url_columns:
        print(f"üéØ Found {len(url_columns)} URL column(s): {url_columns}")
    else:
        print("ü§î No obvious URL columns found")

print("üíæ SAVING EXPLORATION RESULTS")
print("=" * 50)

if 'urls' in locals():
    # Create a simple summary
    exploration_summary = {
        'file_name': csv_file.name,
        'shape': urls.shape,
        'columns': list(urls.columns),
        'data_types': urls.dtypes.to_dict(),
        'missing_values': urls.isnull().sum().to_dict()
    }
    
    print("üìã Exploration Summary:")
    print(f"   üìÑ File: {exploration_summary['file_name']}")
    print(f"   üìä Shape: {exploration_summary['shape']}")
    print(f"   üè∑Ô∏è Columns: {len(exploration_summary['columns'])}")
    
    # Save the original data info for next steps
    print(f"\n‚úÖ Data loaded and ready for next steps!")
    print(f"   Variable 'urls' contains your data")
    print(f"   Shape: {urls.shape}")
    print(f"   Ready to proceed to next notebook/analysis")


if 'urls' in locals():
    print("üéØ QUICK DATA PEEK FOR NEXT STEPS")
    print("=" * 40)
    
    print("üìä Dataset sample (first 3 rows, all columns):")
    print(urls.head(3).to_string())
    




**Implementing a logger** (Future works)


In [None]:
from loguru import logger
logger.info("Hello World")
logger.trace("Hello World")
logger.debug("Hello World")
logger.success("Hello World")
logger.warning("Hello World")
logger.error("Hello World")



[32m2025-08-18 04:21:25.187[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mHello World[0m
[32m2025-08-18 04:21:25.188[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [34m[1mHello World[0m
[32m2025-08-18 04:21:25.189[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [32m[1mHello World[0m
[32m2025-08-18 04:21:25.192[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [31m[1mHello World[0m
[32m2025-08-18 04:21:25.194[0m | [41m[1mCRITICAL[0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [41m[1mHello World[0m


In [17]:
import sys

logger.add(sys.stderr, format ="<green>{time}</green> | {level} | {message}")
logger.error("Hello World")

[32m2025-08-18T04:26:04.044740+0300[0m | ERROR | Hello World
[32m2025-08-18T04:26:04.044740+0300[0m | ERROR | Hello World
[32m2025-08-18T04:26:04.044740+0300[0m | ERROR | Hello World
