# 01 Data Collection and Cleaning

This notebook loads the raw datasets, applies the initial cleaning steps, and stores the cleaned DataFrames. Along the way it prints shapes, row previews, and column lists to show how the data is structured.

In [7]:
# Imports and path setup
import sys
from pathlib import Path

# Add project root to path so the `src` package imports below resolve.
def find_project_root(start=None, marker="src", fallback=Path('/home/sayda/update')):
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    Parameters
    ----------
    start : path-like, optional
        Directory the upward search begins from. Defaults to the current
        working directory.
    marker : str
        Name of a sub-directory whose presence identifies the project root.
    fallback : Path
        Returned unchanged when neither ``start`` nor any of its ancestors
        contains ``marker``. The default is the location this notebook
        originally hard-coded, so the previous behaviour is kept there.

    Returns
    -------
    Path
        The matching directory, or ``fallback``.
    """
    origin = Path.cwd() if start is None else Path(start).resolve()
    # Check the starting directory first, then walk up towards the filesystem root.
    for candidate in (origin, *origin.parents):
        if (candidate / marker).is_dir():
            return candidate
    return fallback


project_root = find_project_root()
# Guarded insert: re-running this cell must not stack duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
from src.data import loaders, cleaning
from src.config import RAW_DATA_DIR

In [8]:
# Read the three raw source tables from RAW_DATA_DIR.
# Loaders run in the order world -> renewables -> Saudi crude.
world_raw, renew_raw, saudi_raw = (
    loaders.load_world_primary_energy(RAW_DATA_DIR),
    loaders.load_renewables(RAW_DATA_DIR),
    loaders.load_saudi_crude(RAW_DATA_DIR),
)

In [9]:
# Report (rows, columns) for each raw table, one per line.
print(
    f"World raw shape: {world_raw.shape}",
    f"Renewables raw shape: {renew_raw.shape}",
    f"Saudi raw shape: {saudi_raw.shape}",
    sep="\n",
)

World raw shape: (6652400, 7)
Renewables raw shape: (133, 9)
Saudi raw shape: (183, 5)


In [10]:
# Preview the first rows of each raw table.
# display() uses the notebook's rich table rendering; print() falls back to
# plain text, which wraps wide frames across several "\"-continued chunks.
display(world_raw.head(), renew_raw.head(), saudi_raw.head())

  REF_AREA TIME_PERIOD ENERGY_PRODUCT FLOW_BREAKDOWN UNIT_MEASURE  OBS_VALUE  \
0       AE     2002-01       CRUDEOIL        CLOSTLV      CONVBBL  7596.0000   
1       AE     2002-01       CRUDEOIL        CLOSTLV         KBBL          -   
2       AE     2002-01       CRUDEOIL        CLOSTLV          KBD          x   
3       AE     2002-01       CRUDEOIL        CLOSTLV           KL          -   
4       AE     2002-01       CRUDEOIL        CLOSTLV        KTONS          -   

   ASSESSMENT_CODE  
0                3  
1                3  
2                3  
3                3  
4                3  
   TIME_PERIOD REF_AREA    FREQ UNIT_MEASURE  \
0         2014      UAE  Annual           MW   
1         2017      UAE  Annual           MW   
2         2019      UAE  Annual           MW   
3         2011      UAE  Annual           MW   
4         2014      UAE  Annual           MW   

                           SOURCE_DETAIL  \
0  Ministry of Energy and Infrastructure   
1  Ministry of E

In [11]:
# Run each raw table through its matching cleaner from src.data.cleaning.
# The raw frames keep their own names, so this cell can be re-run safely.
world, renew, saudi = (
    cleaning.clean_world_primary_energy(world_raw),
    cleaning.clean_renewables(renew_raw),
    cleaning.clean_saudi_crude(saudi_raw),
)

In [12]:
# List the column names of each cleaned table, one table per line.
print(
    f"Cleaned world columns: {world.columns.to_list()}",
    f"Cleaned renewables columns: {renew.columns.to_list()}",
    f"Cleaned Saudi columns: {saudi.columns.to_list()}",
    sep="\n",
)

Cleaned world columns: ['ref_area', 'time_period', 'energy_product', 'flow_breakdown', 'unit_measure', 'obs_value', 'assessment_code', 'date']
Cleaned renewables columns: ['time_period', 'ref_area', 'freq', 'unit_measure', 'source_detail', 're_measure', 'plant_type', 'value', 'decimals', 'date']
Cleaned Saudi columns: ['year', 'production_indicator', 'value', 'periodicity', 'date_object', 'date']
