In [2]:
import logging
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.engine.url import URL

# ───────────────────────────────────────────────────────────────────────────────
# 0) USER CONFIGURATION: point these three values at your own environment
# ───────────────────────────────────────────────────────────────────────────────
SQL_SERVER_DATABASE = "RLS162Demo"   # database name
SQL_SERVER_HOST     = "WENLINHNBB"   # server or instance, e.g. "localhost\\SQLEXPRESS"
SQL_SERVER_PORT     = 1433           # 1433 is the usual value
# ───────────────────────────────────────────────────────────────────────────────

# ───────────────────────────────────────────────────────────────────────────────
# Logging: INFO and above, formatted as "timestamp - level - message"
# ───────────────────────────────────────────────────────────────────────────────
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Logger shared by every function and cell in this notebook.
logger = logging.getLogger(__name__)

def get_sqlalchemy_engine():
    """Create a SQLAlchemy engine for the configured database and verify it connects.

    The connection URL is built from the module-level SQL_SERVER_HOST,
    SQL_SERVER_PORT and SQL_SERVER_DATABASE settings, using the
    ``mssql+pyodbc`` dialect with ``Trusted_Connection=yes`` (no username or
    password). A ``SELECT @@VERSION`` round-trip is run as a connectivity
    check before the engine is handed back.

    Returns:
        The verified engine. The caller owns it and should call ``dispose()``.

    Raises:
        ValueError: if the host or database setting is empty or still a placeholder.
        ConnectionError: if the engine cannot be created or the connectivity
            check fails; the original exception is attached as ``__cause__``.
    """
    if SQL_SERVER_HOST in ("", "YOUR_SQL_SERVER_HOST_HERE"):
        raise ValueError("Set SQL_SERVER_HOST to your actual server/instance.")
    if SQL_SERVER_DATABASE in ("", "YOUR_DATABASE_NAME_HERE"):
        raise ValueError("Set SQL_SERVER_DATABASE to your actual database name.")

    driver_params = {
        "driver": "ODBC Driver 17 for SQL Server",
        "Trusted_Connection": "yes"
    }
    logger.info("Using Windows Integrated Authentication (Trusted_Connection=yes).")
    connection_url = URL.create(
        "mssql+pyodbc",
        username=None,
        password=None,
        host=SQL_SERVER_HOST,
        port=int(SQL_SERVER_PORT),
        database=SQL_SERVER_DATABASE,
        query=driver_params,
    )

    engine = None
    try:
        engine = create_engine(connection_url, fast_executemany=True, echo=False)
        with engine.connect() as conn:
            version = conn.execute(text("SELECT @@VERSION")).scalar()
        # Only the first line of the banner is logged; tolerate an empty result.
        banner = str(version).splitlines()[0] if version else "unknown"
        logger.info(f"Connected to '{SQL_SERVER_DATABASE}'. Server version: {banner}")
        return engine
    except Exception as e:
        # The engine is not returned on failure, so release its pool here;
        # otherwise a connection opened by the failed check would stay open.
        if engine is not None:
            engine.dispose()
        raise ConnectionError(f"Could not create SQLAlchemy engine: {e}") from e

def extract_all_tables(engine):
    """Read each source table in full into its own DataFrame.

    Args:
        engine: connection object handed unchanged to ``pd.read_sql``.

    Returns:
        dict keyed by a lowercase table alias ("customer", "invoice",
        "invoiceitem", "product", "employee", "inventoryvoucher",
        "receivablenote"), each value holding ``SELECT *`` of that table.

    Raises:
        RuntimeError: as soon as one table cannot be read; the message names
            the schema-qualified table and the underlying error.
    """
    # (result key, schema, table) — read in this order.
    sources = (
        ("customer",         "SLS3", "Customer"),
        ("invoice",          "SLS3", "Invoice"),
        ("invoiceitem",      "SLS3", "InvoiceItem"),
        ("product",          "SLS3", "Product"),
        ("employee",         "HCM3", "Employee"),
        ("inventoryvoucher", "LGS3", "InventoryVoucher"),
        ("receivablenote",   "RPA3", "ReceivableNote"),
    )

    frames = {}
    for alias, schema_name, table_name in sources:
        qualified = f"{schema_name}.[{table_name}]"
        logger.info(f"Extracting '{qualified}' …")
        try:
            frame = pd.read_sql(f"SELECT * FROM {qualified}", engine)
        except Exception as e:
            raise RuntimeError(f"Failed to extract {qualified}: {e}")
        frames[alias] = frame

    logger.info("Extracted all seven tables.")
    return frames

def transform_sales_fact(dfs):
    """Join invoice items with their invoice, customer and product rows.

    Args:
        dfs: dict of DataFrames with at least the keys "invoiceitem",
            "invoice", "customer" and "product".

    Returns:
        A new DataFrame with one row per invoice item (left joins validated
        as many-to-one), carrying the item columns plus:
        InvoiceID, InvoiceDate, CustomerRef (from invoice);
        CustomerID, CustomerNumber, Type, CurrencyRef (from customer);
        ProductID, ProductName, ProductNumber, UnitRef, PriceBaseUnitRef
        (from product). "Quantity", "Price" and "NetPrice" are coerced to
        numeric when present; unparseable values become NaN.

    Raises:
        KeyError: if a required table or column is missing.
        pandas.errors.MergeError: if a lookup key is not unique on the
            right-hand side of a join (``validate="many_to_one"``).
    """
    # Give the lookup columns their final names BEFORE merging. Relying on the
    # automatic "_x"/"_y" suffixes mislabels columns as soon as the item table
    # itself has a "Number", "Name" or "Date" column.
    invoice_lookup = dfs["invoice"][["InvoiceID", "Date", "CustomerRef"]].rename(
        columns={"Date": "InvoiceDate"}
    )
    customer_lookup = dfs["customer"][["CustomerID", "Number", "Type", "CurrencyRef"]].rename(
        columns={"Number": "CustomerNumber"}
    )
    product_lookup = dfs["product"][
        ["ProductID", "Name", "Number", "UnitRef", "PriceBaseUnitRef"]
    ].rename(columns={"Name": "ProductName", "Number": "ProductNumber"})

    salesfact = (
        dfs["invoiceitem"]
        # InvoiceItem → Invoice on InvoiceRef → InvoiceID
        .merge(
            invoice_lookup,
            how="left",
            left_on="InvoiceRef",
            right_on="InvoiceID",
            validate="many_to_one",
        )
        # → Customer on CustomerRef → CustomerID
        .merge(
            customer_lookup,
            how="left",
            left_on="CustomerRef",
            right_on="CustomerID",
            validate="many_to_one",
        )
        # → Product on ProductRef → ProductID
        .merge(
            product_lookup,
            how="left",
            left_on="ProductRef",
            right_on="ProductID",
            validate="many_to_one",
        )
    )

    # Coerce numeric columns if present
    for col in ("Quantity", "Price", "NetPrice"):
        if col in salesfact.columns:
            salesfact[col] = pd.to_numeric(salesfact[col], errors="coerce")

    logger.info(f"SalesFact transformation complete (rows = {len(salesfact):,}).")
    return salesfact

def load_salesfact_to_sql(df, engine):
    """Write the SalesFact frame to the ``dbo.SalesFact`` table, replacing it.

    Args:
        df: DataFrame to load; its index is not written.
        engine: SQLAlchemy engine/connection passed to ``DataFrame.to_sql``.

    Raises:
        RuntimeError: if the write fails; the original exception is attached
            as ``__cause__``.
    """
    try:
        logger.info("Loading SalesFact into dbo.SalesFact (replace if exists)…")
        # No method="multi" here: it packs a whole chunk into one INSERT
        # statement, which hits SQL Server's 2100-parameter / 1000-row limits
        # for anything but tiny frames. The default insert method issues an
        # executemany per chunk, which is also what the engine's
        # fast_executemany=True setting accelerates.
        df.to_sql(
            name="SalesFact",
            schema="dbo",
            con=engine,
            if_exists="replace",
            index=False,
            chunksize=5000
        )
        logger.info(f"dbo.SalesFact loaded (rows = {len(df):,}).")
    except Exception as e:
        raise RuntimeError(f"Failed to load dbo.SalesFact: {e}") from e

def save_all_snapshots(dfs, salesfact_df):
    """Write CSV and Parquet snapshots of every frame to the working directory.

    Each entry of ``dfs`` is saved as ``<Key>.csv`` / ``<Key>.parquet``, where
    ``<Key>`` is the dict key with its first character upper-cased. The
    SalesFact frame is saved as ``SalesFact_snapshot.csv`` / ``.parquet``.
    Indexes are not written.

    Args:
        dfs: dict mapping a name to a DataFrame.
        salesfact_df: the SalesFact DataFrame.

    Raises:
        RuntimeError: if any file cannot be written.
    """
    def _dump(frame, stem):
        # CSV first, then Parquet, both without the index.
        frame.to_csv(f"{stem}.csv", index=False)
        frame.to_parquet(f"{stem}.parquet", index=False)

    try:
        for key, frame in dfs.items():
            label = key[0].upper() + key[1:]
            _dump(frame, label)
            logger.info(f"Saved snapshot for {label}.")
        _dump(salesfact_df, "SalesFact_snapshot")
        logger.info("Saved SalesFact snapshots.")
    except Exception as e:
        raise RuntimeError(f"Failed to save snapshots: {e}")

def run_full_etl():
    """Run the pipeline end to end: connect, extract, transform, load, snapshot.

    Failures in any step are logged (with traceback) and NOT re-raised, so
    this function always returns ``None``; check the log to see whether the
    run succeeded. The engine, if one was created, is always disposed.
    """
    engine = None
    try:
        engine = get_sqlalchemy_engine()
        dfs = extract_all_tables(engine)
        salesfact_df = transform_sales_fact(dfs)
        load_salesfact_to_sql(salesfact_df, engine)
        save_all_snapshots(dfs, salesfact_df)
        logger.info("=== ETL PIPELINE COMPLETED SUCCESSFULLY ===")
    except Exception as exc:
        # logger.exception keeps the original message and appends the
        # traceback, which the wrapped RuntimeErrors would otherwise hide.
        logger.exception(f"ETL aborted due to exception:\n{exc}")
    finally:
        if engine is not None:
            try:
                engine.dispose()
                logger.info("Disposed engine.")
            except Exception as dispose_exc:
                # Cleanup stays best-effort, but is no longer silent and no
                # longer swallows KeyboardInterrupt/SystemExit.
                logger.warning(f"Failed to dispose engine: {dispose_exc}")

# Execute the ETL pipeline. run_full_etl() logs failures instead of raising
# them, so read the log output below to confirm the run actually succeeded.
run_full_etl()


2025-06-06 07:51:43 - INFO - Using Windows Integrated Authentication (Trusted_Connection=yes).
2025-06-06 07:51:43 - INFO - Connected to 'RLS162Demo'. Server version: Microsoft SQL Server 2022 (RTM-GDR) (KB5046861) - 16.0.1135.2 (X64) 
2025-06-06 07:51:43 - INFO - Extracting 'SLS3.[Customer]' …
2025-06-06 07:51:43 - INFO - Extracting 'SLS3.[Invoice]' …
2025-06-06 07:51:43 - INFO - Extracting 'SLS3.[InvoiceItem]' …
2025-06-06 07:51:43 - INFO - Extracting 'SLS3.[Product]' …
2025-06-06 07:51:43 - INFO - Extracting 'HCM3.[Employee]' …
2025-06-06 07:51:43 - INFO - Extracting 'LGS3.[InventoryVoucher]' …
2025-06-06 07:51:43 - INFO - Extracting 'RPA3.[ReceivableNote]' …
2025-06-06 07:51:43 - INFO - Extracted all seven tables.
2025-06-06 07:51:43 - INFO - SalesFact transformation complete (rows = 1).
2025-06-06 07:51:43 - INFO - Loading SalesFact into dbo.SalesFact (replace if exists)…
2025-06-06 07:51:44 - INFO - dbo.SalesFact loaded (rows = 1).
2025-06-06 07:51:44 - INFO - Saved snapshot for 

In [None]:
!pip install prophet


In [None]:
from prophet import Prophet


In [None]:
# Build the `sales_fact` frame that the forecasting cells below consume.
# To run on real data, load it from a file instead of using the sample, e.g.:
#     sales_fact = pd.read_csv('your_sales_data.csv')
import pandas as pd

# Sample data for testing: ten consecutive days starting 2023-01-01.
sales_fact = pd.DataFrame(dict(
    invoice_date=pd.date_range(start='2023-01-01', periods=10),
    net_line_price=[100, 150, 200, 120, 180, 250, 300, 220, 190, 210],
))

# Total net_line_price per calendar day, in the two-column layout that the
# model cell below fits on: `ds` (the day) and `y` (the daily total).
daily = (
    sales_fact
    .assign(ds=sales_fact["invoice_date"].dt.floor("D"))
    .groupby("ds", as_index=False)["net_line_price"]
    .sum()
    .rename(columns={"net_line_price": "y"})
)

In [None]:
# Linear-growth, additive model with weekly and yearly seasonality, fitted on
# the day-level `daily` frame. Prophet.fit returns the fitted model, so the
# construction and the fit are chained.
m = Prophet(
    seasonality_mode="additive",
    growth="linear",
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=False,  # off: the input is already aggregated to one row per day
).fit(daily)

# Future frame covering the next 30 days (change `periods` to the range you need).
future = m.make_future_dataframe(periods=30, freq="D")

# Predict over that frame.
# forecast contains: ds, yhat, yhat_lower, yhat_upper, plus trend and seasonality
forecast = m.predict(future)


In [None]:
# ------------------------------
# 1.4 Compute residuals & save
# ------------------------------

import os

# Redefine OUTPUT_FOLDER here (exactly as in your config section):
OUTPUT_FOLDER = r"E:\decision_intelligent\task2\output_files\RLS162Demo"

# Create the output directory, including any missing parents. exist_ok=True
# makes this a no-op when it already exists, with no check-then-create race.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# 'daily' and 'forecast' must already exist from the previous steps.

# 1) Compute residuals: actual minus predicted, only for dates that exist in
#    'daily'. The inner join drops the future forecast rows, which have no
#    actual value and therefore no residual.
hist = forecast.merge(daily, on="ds", how="inner")
hist["residual"] = hist["y"] - hist["yhat"]

# 2) Save the residual file
residuals_path = os.path.join(OUTPUT_FOLDER, "sales_trend_residuals.csv")
hist[["ds", "y", "yhat", "residual"]].to_csv(residuals_path, index=False)

logger.info(f"Saved daily residuals CSV to: {residuals_path}")