In [1]:
import uuid
import numpy as np
import pandas as pd
from datetime import datetime
from google.cloud import bigquery
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import root_mean_squared_error

In [2]:
CORRECTION_DATE = datetime.today().strftime('%Y-%m-%d')
CORRECTION_ID = str(uuid.uuid4())

MAX_LAG = 2
APLHA_VALUES = np.linspace(0, 0.9, 10)

DATASET = "b2b_wf_prediction"
OUTPUT_MARKETING_TABLE = "bq_wf_marketing_corrections"
MARKETING_RGU_TABLE = 'bq_wf_marketing'

In [3]:
PROJECT_REGION = ''
PROJECT_ID =''

In [4]:
PROJECT_ID = "wb-ai-acltr-tbs-3-pr-a62583"
PROJECT_REGION = "northamerica-northeast1"

In [5]:
class SeriesOptimizer:

    def __init__(self, fwds_series, market_series):
        self.fwds_series = fwds_series
        self.market_series = market_series
        self.scale_columns = ['SWT', 'rgu_values']
        self.corrected_series_columns = ['Appointment_Month', 'SWT', 'SWT_Type']

        self.load_scaler()

    def load_scaler(self):
        self.scaler = MinMaxScaler()

    def optimize_series(self, lag, alpha):

        lag_investiments = self.market_series.copy()
        lag_investiments['Appointment_Month'] += pd.DateOffset(months=lag)

        merged_series = pd.merge(
            self.fwds_series, lag_investiments, on='Appointment_Month', how='left')

        error_series = self.get_series_error(merged_series)

        corrected_series = self.correct_series(error_series, alpha)

        return corrected_series[self.corrected_series_columns]

    def scale_series(self, series, columns):
        series[columns] = self.scaler.fit_transform(series[columns])
        return series

    def inverse_scale_series(self, series, columns) -> pd.DataFrame:
        series[columns] = self.scaler.inverse_transform(series[columns])
        return series

    def get_series_error(self, series):
        
        series = self.scale_series(
            series, self.scale_columns)

        mask = series['rgu_values'].isna()
        series.loc[mask, 'rgu_values'] = series.loc[mask, 'SWT']

        series['Error'] = series['SWT'] - series['rgu_values']

        return series

    def correct_series(self, series, alpha):

        predicted_mask = series['SWT_Type'] == 'Predicted'
        series.loc[predicted_mask, 'SWT'] = series.loc[predicted_mask, 'SWT'] - \
            alpha * series.loc[predicted_mask, 'Error']

        series = self.inverse_scale_series(
            series, self.scale_columns)

        return series

    @classmethod
    def calculate_rmse(cls, series):
        values = series.pivot(
            index='Appointment_Month', columns='SWT_Type', values='SWT').dropna(axis=0)
        
        return root_mean_squared_error(
            values['Actual'],
            values['Predicted']
        )


In [6]:
class SeriesProcessor:

    def __init__(self):
        pass

    def load_swt_data(self, dataframe):
        self.swt_data = dataframe.copy()

    def load_marketing_data(self, marketing_data):
        self.marketing_data = marketing_data.copy()

    def get_swt_data_for_product(self, product_name, forecast_date) -> None:

        fwd_product_data = self.swt_data[
            (self.swt_data['Forecast_Date'] == forecast_date) |
            (self.swt_data['SWT_Type'] == 'Actual')
        ]

        product_data = fwd_product_data[
            fwd_product_data['Product'] == product_name
        ].copy()

        product_data.drop(columns=['Product'], inplace=True)
        product_data.sort_values(by='Appointment_Month', inplace=True)

        self.product_fwds_data = product_data

    def get_marketing_data_for_product(self, product_name, rgu_column = 'Forecasted_RGU'):

        marketing_data = self.marketing_data.copy()

        marketing_data = marketing_data[marketing_data['Product']
                                  == product_name]

        marketing_data['rgu_values'] = marketing_data['Actual_RGU'] + marketing_data[rgu_column]

        marketing_data = marketing_data[['Appointment_Month', 'rgu_values']]

        self.product_marketing_data = marketing_data.groupby(
            ['Appointment_Month']).sum().reset_index()

    def is_marketing_product(self, product_name):
        return product_name in self.marketing_data['Product'].unique()


In [7]:
def grid_search_optimization(max_lag, alpha_values, series_optimizer):
    
    results = []
    
    for lag in range(1, max_lag):
        for alpha in alpha_values:
            optimized_series = series_optimizer.optimize_series(lag, alpha)
            rmse = series_optimizer.calculate_rmse(optimized_series)
            results.append({
                'lag': lag,
                'alpha': alpha,
                'rmse': rmse,
                'series': optimized_series
            })
            
    return min(results, key=lambda x: x['rmse'])

In [24]:
client = bigquery.Client(project=PROJECT_ID, location=PROJECT_REGION)

In [25]:
PRODUCT_QUERY = f"""WITH Historical_Data AS (
  SELECT
   TIMESTAMP_TRUNC(Appointment_Timestamp, MONTH) AS Appointment_Month,
    CAST(NULL AS STRING) AS Forecast_Date,
    Product,
    SUM(SWT) AS SWT,
    'Actual' AS SWT_Type,
  FROM
    `{PROJECT_ID}.{DATASET}.bq_wf_historical`
  WHERE Work_Order_Action = 'INSTALL'
  GROUP BY Appointment_Month, Product
), Forecast_Data AS (
  SELECT
    Appointment_Month,
    CAST(Forecast_Date AS STRING) AS Forecast_Date,
    Product,
    SUM(predicted_SWT.value) AS SWT,
    'Predicted' AS SWT_Type
  FROM
    `bi-stg-aaaie-pr-750ff7.{DATASET}.bq_wf_forecast`
  WHERE Work_Order_Action = 'INSTALL'
  GROUP BY Appointment_Month, Product, Forecast_Date
)
SELECT
  CAST(Appointment_Month AS DATE) AS Appointment_Month,
  Forecast_Date,
  Product,
  SWT,
  SWT_Type,
FROM
  Historical_Data
UNION ALL
SELECT
  CAST(Appointment_Month AS DATE) AS Appointment_Month,
  Forecast_Date,
  Product,
  SWT,
  SWT_Type,
FROM
  Forecast_Data"""

fwds_data = client.query_and_wait(PRODUCT_QUERY).to_dataframe()
fwds_data['Appointment_Month'] = pd.to_datetime(fwds_data['Appointment_Month'])

KeyboardInterrupt: 

In [8]:
fwds_data = pd.read_csv('mkt_pred.csv')
fwds_data['Appointment_Month'] = pd.to_datetime(fwds_data['Appointment_Month'])
fwds_data.head()

Unnamed: 0,Appointment_Month,Forecast_Date,Product,SWT,SWT_Type
0,2023-09-01,,PRIVATE LINE,529.0,Actual
1,2022-11-01,,SECURITY,5313.0,Actual
2,2022-05-01,,SECURITY,5340.75,Actual
3,2023-06-01,,SECURITY,4005.25,Actual
4,2022-05-01,,IPTV,244.29,Actual


In [9]:
TUNING_FORECAST = '2024-11-12 07:43:15.121038'
FORECAST_DATE = '2025-03-13 17:57:05.454801'

In [11]:
MODELS_DATE_QUERY = f"""SELECT DISTINCT Forecast_Date as Forecast_Date
                        FROM `bi-stg-aaaie-pr-750ff7.{DATASET}.bq_wf_forecast`
                        ORDER BY Forecast_Date DESC"""

forecast_dates = client.query_and_wait(MODELS_DATE_QUERY).to_dataframe()
TUNING_FORECAST = forecast_dates['Forecast_Date'].iloc[5]
FORECAST_DATE = forecast_dates['Forecast_Date'].iloc[0]

KeyboardInterrupt: 

In [10]:
rgu_data = pd.read_csv('mkt_data.csv')
rgu_data['Appointment_Month'] = pd.to_datetime(rgu_data['Appointment_Month'])
rgu_data.head()

Unnamed: 0,Appointment_Month,Product,Actual_RGU,Forecasted_RGU,Adjusted_RGU
0,2025-03-01,BUSINESS INTERNET,102,100,100
1,2025-03-01,NAAS,7,16,16
2,2025-03-01,HSIA,2798,3281,3281
3,2025-03-01,BUSINESS CONNECT,179,4726,213
4,2025-03-01,PRIVATE LINE,411,494,494


In [11]:
MARKETING_QUERY = f"""SELECT * FROM `{PROJECT_ID}.{DATASET}.{MARKETING_RGU_TABLE}`"""

rgu_data = client.query_and_wait(MARKETING_QUERY).to_dataframe()
rgu_data['Appointment_Month'] = pd.to_datetime(rgu_data['Appointment_Month'])

In [11]:
PRODUCT_LIST = fwds_data.Product.unique()

In [12]:
series_processor = SeriesProcessor()
series_processor.load_swt_data(fwds_data)
series_processor.load_marketing_data(rgu_data)

In [13]:
product_corrections = []
for product in PRODUCT_LIST:

    if not series_processor.is_marketing_product(product):
        continue
    
    series_processor.get_swt_data_for_product(product, forecast_date=TUNING_FORECAST)
    series_processor.get_marketing_data_for_product(product, 'Forecasted_RGU')
    
    series_optimizer = SeriesOptimizer(
        fwds_series=series_processor.product_fwds_data,
        market_series=series_processor.product_marketing_data
    )
    
    best_correction_forecasted = grid_search_optimization(MAX_LAG, APLHA_VALUES, series_optimizer)
    
    series_processor.get_marketing_data_for_product(product, 'Adjusted_RGU')
    
    series_optimizer = SeriesOptimizer(
        fwds_series=series_processor.product_fwds_data,
        market_series=series_processor.product_marketing_data
    )

    best_correction_adjusted = grid_search_optimization(MAX_LAG, APLHA_VALUES, series_optimizer)    

    product_corrections.append(
        {
            'Product': product,
            'lag_forecasted': best_correction_forecasted['lag'],
            'alpha_forecasted': best_correction_forecasted['alpha'],
            'lag_adjusted': best_correction_adjusted['lag'],
            'alpha_adjusted': best_correction_adjusted['alpha']
        }
    )

In [32]:
job_config = bigquery.LoadJobConfig(
    autodetect=True,
    write_disposition="WRITE_APPEND",
)

In [15]:
len(product_corrections)

13

In [17]:
for correction in product_corrections:

    product = correction['Product']
    
    series_processor.get_swt_data_for_product(product, forecast_date=FORECAST_DATE)
    series_processor.get_marketing_data_for_product(product, 'Forecasted_RGU')
    
    series_optimizer = SeriesOptimizer(
        fwds_series=series_processor.product_fwds_data,
        market_series=series_processor.product_marketing_data
    )
    
    best_correction_forecasted = series_optimizer.optimize_series(correction['lag_forecasted'], correction['alpha_forecasted'])
    
    lag_mkt_series_forecasted = series_processor.product_marketing_data.copy()
    lag_mkt_series_forecasted['Appointment_Month'] += pd.DateOffset(months=correction['lag_forecasted'])


    series_processor.get_marketing_data_for_product(product, 'Adjusted_RGU')
    
    series_optimizer = SeriesOptimizer(
        fwds_series=series_processor.product_fwds_data,
        market_series=series_processor.product_marketing_data
    )
    
    best_correction_adjusted = series_optimizer.optimize_series(correction['lag_adjusted'], correction['alpha_adjusted'])

    lag_mkt_series_adjusted = series_processor.product_marketing_data.copy()
    lag_mkt_series_adjusted['Appointment_Month'] += pd.DateOffset(months=correction['lag_adjusted'])


    forecasted_correction_output = best_correction_forecasted[best_correction_forecasted['SWT_Type'] == 'Predicted'].merge(
        lag_mkt_series_forecasted[['Appointment_Month', 'rgu_values']],
        on='Appointment_Month',
        how='inner'
    )
    
    adjusted_correction_output = best_correction_adjusted[best_correction_adjusted['SWT_Type'] == 'Predicted'].merge(
        lag_mkt_series_adjusted[['Appointment_Month', 'rgu_values']],
        on='Appointment_Month',
        how='inner'
    )

    product_output_data = forecasted_correction_output.merge(
        adjusted_correction_output,
        on='Appointment_Month',
        how='left',
        suffixes=('_forecasted', '_adjusted')
    )

    product_output_data['Forecast_Date'] = FORECAST_DATE
    product_output_data['Appointment_Month'] =  pd.to_datetime(product_output_data['Appointment_Month'])
    product_output_data['Product'] = product
    product_output_data['SWT_Forecasted_RGU'] = product_output_data['SWT_forecasted']
    product_output_data['SWT_Adjusted_RGU'] = product_output_data['SWT_adjusted']
    product_output_data['Alpha_Correction_Forecasted'] = correction['alpha_forecasted']
    product_output_data['Lag_Correction_Forecasted'] = correction['lag_forecasted']
    product_output_data['Alpha_Correction_Adjusted'] = correction['alpha_adjusted']
    product_output_data['Lag_Correction_Adjusted'] = correction['lag_adjusted']
   
    product_output_data = product_output_data[['Forecast_Date', 'Appointment_Month', 'Product',
    'SWT_Forecasted_RGU', 'SWT_Adjusted_RGU', 'Alpha_Correction_Forecasted', 'Lag_Correction_Forecasted', 
    'Alpha_Correction_Adjusted','Lag_Correction_Adjusted']]

    print(correction)
    # job = client.load_table_from_dataframe(
    #     product_output_data, 
    #     f"{PROJECT_ID}.{DATASET}.{OUTPUT_MARKETING_TABLE}", 
    #     job_config=job_config
    # )
    
    # job.result()


{'Product': 'PRIVATE LINE', 'lag_forecasted': 1, 'alpha_forecasted': np.float64(0.30000000000000004), 'lag_adjusted': 1, 'alpha_adjusted': np.float64(0.30000000000000004)}
{'Product': 'SECURITY', 'lag_forecasted': 1, 'alpha_forecasted': np.float64(0.9), 'lag_adjusted': 1, 'alpha_adjusted': np.float64(0.9)}
{'Product': 'IPTV', 'lag_forecasted': 1, 'alpha_forecasted': np.float64(0.9), 'lag_adjusted': 1, 'alpha_adjusted': np.float64(0.9)}
{'Product': 'HSIA', 'lag_forecasted': 1, 'alpha_forecasted': np.float64(0.9), 'lag_adjusted': 1, 'alpha_adjusted': np.float64(0.9)}
{'Product': 'BUSINESS INTERNET', 'lag_forecasted': 1, 'alpha_forecasted': np.float64(0.9), 'lag_adjusted': 1, 'alpha_adjusted': np.float64(0.9)}
{'Product': 'BUSINESS CONNECT', 'lag_forecasted': 1, 'alpha_forecasted': np.float64(0.9), 'lag_adjusted': 1, 'alpha_adjusted': np.float64(0.4)}
{'Product': 'WAN L2_L3', 'lag_forecasted': 1, 'alpha_forecasted': np.float64(0.4), 'lag_adjusted': 1, 'alpha_adjusted': np.float64(0.4)}
{'