In [1]:
import sys, os
import pandas as pd

project_root = os.path.abspath(".")
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from connectors.database import DatabaseConnector
from config.settings import DB_URL

connector = DatabaseConnector(DB_URL)
engine = connector.connect()

In [3]:
from load.to_db import DataBaseLoader
from load.to_file import FileLoader
from extract.from_db import DatabaseExtractor
from extract.from_file import FileExtractor
from extract.from_API import ExchangeRateFetcher
from transform.add_quality import ADDQuality
from validate.quality_checks import DataValidator

In [4]:
csv_extractor = FileExtractor("/Users/mohamedmoaaz/Desktop/usecase")
source_data_frames = csv_extractor.extract_from_csv("source")
exchange_rate_extractor = ExchangeRateFetcher("https://openexchangerates.org/api/latest.json?app_id=a92f8bad8e044bc79949a676886da2c8")
exchange_rate_df = exchange_rate_extractor.fetch_exchange_rates()

2025-04-16 15:07:03 => ✅ Extracted 1445 rows from customers.csv in source
2025-04-16 15:07:03 => ✅ Extracted 7 rows from categories.csv in source
2025-04-16 15:07:03 => ✅ Extracted 334 rows from products.csv in source
2025-04-16 15:07:03 => ✅ Extracted 1615 rows from orders.csv in source
2025-04-16 15:07:03 => ✅ Extracted 10 rows from staffs.csv in source
2025-04-16 15:07:03 => ✅ Extracted 4764 rows from order_items.csv in source
2025-04-16 15:07:03 => ✅ Extracted 9 rows from brands.csv in source
2025-04-16 15:07:03 => ✅ Extracted 3 rows from stores.csv in source
2025-04-16 15:07:03 => ✅ Extracted 939 rows from stocks.csv in source
2025-04-16 15:07:04 =>   Fetched 169 exchange rates from openexchangerates


In [5]:
postgres_loader = DataBaseLoader(engine)
csv_loader = FileLoader("/Users/mohamedmoaaz/Desktop/usecase")
for file_name in source_data_frames.keys():
    if file_name == "orders.csv" or file_name == "order_items.csv":
        table_name = file_name.replace(".csv", "")
        postgres_loader.load_to_postgres(source_data_frames[file_name], table_name)
    else:
        csv_loader.load_to_csv(source_data_frames[file_name], "datalake", file_name)
csv_loader.load_to_csv(exchange_rate_df, "datalake", "exchange_rates.csv")

2025-04-16 15:07:04 => ✅ Loaded 1445 rows into customers.csv in datalake
2025-04-16 15:07:04 => ✅ Loaded 7 rows into categories.csv in datalake
2025-04-16 15:07:04 => ✅ Loaded 334 rows into products.csv in datalake
2025-04-16 15:07:04 => ✅ Loaded 1615 rows into orders in PostgreSQL
2025-04-16 15:07:04 => ✅ Loaded 10 rows into staffs.csv in datalake
2025-04-16 15:07:04 => ✅ Loaded 4764 rows into order_items in PostgreSQL
2025-04-16 15:07:04 => ✅ Loaded 9 rows into brands.csv in datalake
2025-04-16 15:07:04 => ✅ Loaded 3 rows into stores.csv in datalake
2025-04-16 15:07:04 => ✅ Loaded 939 rows into stocks.csv in datalake
2025-04-16 15:07:04 => ✅ Loaded 169 rows into exchange_rates.csv in datalake


In [6]:
csv_extractor = FileExtractor("/Users/mohamedmoaaz/Desktop/usecase")
data_lake_data = csv_extractor.extract_from_csv("datalake")

postgres_extractor = DatabaseExtractor(DB_URL)
query1 = "SELECT * FROM order_items"
order_data = postgres_extractor.extract_from_postgres(query1, engine)

query2 = "SELECT * FROM orders"
order_items_data = postgres_extractor.extract_from_postgres(query2, engine)

2025-04-16 15:07:04 => ✅ Extracted 1445 rows from customers.csv in datalake
2025-04-16 15:07:04 => ✅ Extracted 7 rows from categories.csv in datalake
2025-04-16 15:07:04 => ✅ Extracted 334 rows from products.csv in datalake
2025-04-16 15:07:04 => ✅ Extracted 169 rows from exchange_rates.csv in datalake
2025-04-16 15:07:04 => ✅ Extracted 10 rows from staffs.csv in datalake
2025-04-16 15:07:04 => ✅ Extracted 9 rows from brands.csv in datalake
2025-04-16 15:07:04 => ✅ Extracted 3 rows from stores.csv in datalake
2025-04-16 15:07:04 => ✅ Extracted 939 rows from stocks.csv in datalake
2025-04-16 15:07:04 => ✅ Extracted 4764 rows from PostgreSQL
2025-04-16 15:07:04 => ✅ Extracted 1615 rows from PostgreSQL


In [7]:
data_quality = ADDQuality()
csv_loader = FileLoader("/Users/mohamedmoaaz/Desktop/usecase")
for file_name in data_lake_data.keys():
    qualified_data = data_quality.add_source_and_date(data_lake_data[file_name], "data lake")
    csv_loader.load_to_csv(qualified_data, "landing", file_name)
qualified_data = data_quality.add_source_and_date(order_data, "postgres db")
csv_loader.load_to_csv(qualified_data, "landing", file_name)
qualified_data = data_quality.add_source_and_date(order_items_data, "postgres db")
csv_loader.load_to_csv(qualified_data, "landing", file_name)

2025-04-16 15:07:05 =>    Added source and extraction date to data
2025-04-16 15:07:05 => ✅ Loaded 1445 rows into customers.csv in landing
2025-04-16 15:07:05 =>    Added source and extraction date to data
2025-04-16 15:07:05 => ✅ Loaded 7 rows into categories.csv in landing
2025-04-16 15:07:05 =>    Added source and extraction date to data
2025-04-16 15:07:05 => ✅ Loaded 334 rows into products.csv in landing
2025-04-16 15:07:05 =>    Added source and extraction date to data
2025-04-16 15:07:05 => ✅ Loaded 169 rows into exchange_rates.csv in landing
2025-04-16 15:07:05 =>    Added source and extraction date to data
2025-04-16 15:07:05 => ✅ Loaded 10 rows into staffs.csv in landing
2025-04-16 15:07:05 =>    Added source and extraction date to data
2025-04-16 15:07:05 => ✅ Loaded 9 rows into brands.csv in landing
2025-04-16 15:07:05 =>    Added source and extraction date to data
2025-04-16 15:07:05 => ✅ Loaded 3 rows into stores.csv in landing
2025-04-16 15:07:05 =>    Added source and e

In [8]:
csv_extractor = FileExtractor("/Users/mohamedmoaaz/Desktop/usecase")
landing_data = csv_extractor.extract_from_csv("landing")
for file_name, df in landing_data.items():
    data_validator = DataValidator(df)
    data_validator.handle_nulls()
    data_validator.remove_duplicates()
    data_validator.validate_data()
    csv_loader.load_to_csv(df, "staging", file_name)

2025-04-16 15:07:06 => ✅ Extracted 1445 rows from customers.csv in landing
2025-04-16 15:07:06 => ✅ Extracted 7 rows from categories.csv in landing
2025-04-16 15:07:06 => ✅ Extracted 334 rows from products.csv in landing
2025-04-16 15:07:06 => ✅ Extracted 169 rows from exchange_rates.csv in landing
2025-04-16 15:07:06 => ✅ Extracted 10 rows from staffs.csv in landing
2025-04-16 15:07:06 => ✅ Extracted 9 rows from brands.csv in landing
2025-04-16 15:07:06 => ✅ Extracted 3 rows from stores.csv in landing
2025-04-16 15:07:06 => ✅ Extracted 1615 rows from stocks.csv in landing
2025-04-16 15:07:06 => ✅ Handled null values in columns: None with fill value: 0
2025-04-16 15:07:06 => ✅ Removed 0 duplicate rows
2025-04-16 15:07:06 => ✅ Loaded 1445 rows into customers.csv in staging
2025-04-16 15:07:06 => ✅ Handled null values in columns: None with fill value: 0
2025-04-16 15:07:06 => ✅ Removed 0 duplicate rows
2025-04-16 15:07:06 => ✅ Loaded 7 rows into categories.csv in staging
2025-04-16 15:07