# In [None]:
# Step 1. Import necessary modules

# pandas is used for reading, grouping, and aggregating data
import pandas as pd

# numpy is used for numerical operations
import numpy as np

# gradient boosting model for forecasting
from sklearn.ensemble import GradientBoostingRegressor

# RMSE calculation (used for validation)
from sklearn.metrics import root_mean_squared_error


# Step 2. Load input file and output template

# transaction-level history produced by the Case Study #1 parsing step,
# and the instructor-supplied output template
INPUT_FILE = "1CS2_HistoryData.csv"
OUTPUT_TEMPLATE = "1CS2-ExampleOutput.csv"

# load the transactions and turn the Date text into real datetimes so that
# date arithmetic and calendar attributes are available later
df = pd.read_csv(INPUT_FILE).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Step 3. Aggregate transaction-level data to daily totals

# single pipeline: total Amount per Date -> explicit 'Transactions' column name
# -> chronological order -> Date as the index used for forecasting
daily_df = (
    df.groupby('Date')['Amount']
    .sum()
    .reset_index()
    .rename(columns={'Amount': 'Transactions'})
    .sort_values('Date')
    .set_index('Date')
)


# Step 4. Train / test split (time-based)

# most recent date present in the daily series
last_date = daily_df.index.max()

# the date located 80% of the way through the series is the boundary:
# everything before it is training data, it and everything after is test data
split_date = daily_df.index[int(0.8 * len(daily_df))]

# first 80% of the rows
train = daily_df.loc[lambda frame: frame.index < split_date]

# final 20% of the rows
test = daily_df.loc[lambda frame: frame.index >= split_date]

print("Training rows:", train.shape[0])
print("Testing rows:", test.shape[0])


# Step 5. Baseline (naive) model

# naive forecast: each day is predicted by the previous row's value.
# The shift is taken over the full daily series rather than the test slice
# alone, so the first test day is predicted from the last training day
# instead of being thrown away for lack of a predecessor.
naive_predictions = daily_df['Transactions'].shift(1).loc[test.index]

# mask of test rows that actually received a prediction
valid_idx = naive_predictions.notna()

# calculate RMSE only if at least one prediction exists
if valid_idx.any():
    naive_rmse = root_mean_squared_error(
        test.loc[valid_idx, 'Transactions'],
        naive_predictions.loc[valid_idx]
    )
    print("Naive RMSE:", naive_rmse)
else:
    print("Naive RMSE could not be calculated (insufficient test data)")



# Step 6. Feature engineering function

def create_features(data):
    """
    Build lag, rolling-average, and calendar features for a daily series.

    Every value-based feature of a row is computed only from rows that come
    before it, so the row's own 'Transactions' value (the quantity being
    forecast) never leaks into its predictors. This also means the features
    of a row can be computed before its 'Transactions' value is known.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Transactions' column and a DatetimeIndex, ordered from
        oldest to newest. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the added columns 'lag_1', 'lag_7', 'lag_14',
        'rolling_7', 'rolling_14', 'day_of_week' and 'month' (in that order).
        Rows without enough preceding history hold NaN in the lag and
        rolling columns.
    """
    df_feat = data.copy()
    transactions = df_feat['Transactions']

    # lagged transaction values: the value 1, 7 and 14 rows earlier
    for lag in (1, 7, 14):
        df_feat[f'lag_{lag}'] = transactions.shift(lag)

    # rolling averages smooth noise. Shift by one row first so each window
    # ends at the previous row; rolling directly over 'Transactions' would
    # fold the current row's value (the target) into its own feature.
    previous = transactions.shift(1)
    for window in (7, 14):
        df_feat[f'rolling_{window}'] = previous.rolling(window).mean()

    # calendar-based features taken from the index
    df_feat['day_of_week'] = df_feat.index.dayofweek
    df_feat['month'] = df_feat.index.month

    return df_feat


# Step 7. Prepare data for machine learning

# engineer the features, then discard every row that still contains a
# missing value (rows whose lag / rolling history is incomplete)
feature_df = create_features(daily_df).dropna()

# dropping rows shortened the frame, so the 80/20 boundary date is recomputed
split_date_feat = feature_df.index[int(0.8 * len(feature_df))]

# rows before the boundary train the model; the rest evaluate it
train_feat = feature_df.loc[feature_df.index < split_date_feat]
test_feat = feature_df.loc[feature_df.index >= split_date_feat]

# 'Transactions' is the target; every other column is a predictor
X_train, y_train = train_feat.drop(columns='Transactions'), train_feat['Transactions']
X_test, y_test = test_feat.drop(columns='Transactions'), test_feat['Transactions']



# Step 8. Train Gradient Boosting model

# configure the regressor and fit it on the training split in one expression
# (fit() returns the fitted estimator itself)
model = GradientBoostingRegressor(
    random_state=27,
    max_depth=3,
    learning_rate=0.05,
    n_estimators=200,
).fit(X_train, y_train)

# score the held-out split: predict, then measure the error with RMSE
gb_predictions = model.predict(X_test)
gb_rmse = root_mean_squared_error(y_test, gb_predictions)

print("Gradient Boosting RMSE:", gb_rmse)


# Step 9. Train final model on ALL historical data

# predictors and target taken from every engineered row (no hold-out here)
y_full = feature_df['Transactions']
X_full = feature_df.drop(columns='Transactions')

# same hyper-parameters as the evaluated model, fitted on the full history
# (fit() returns the fitted estimator itself)
final_model = GradientBoostingRegressor(
    random_state=27,
    max_depth=3,
    learning_rate=0.05,
    n_estimators=200,
).fit(X_full, y_full)

# Step 10. Generate future forecast dates

# define forecast start date (day after last historical date)
forecast_start = daily_df.index.max() + pd.Timedelta(days=1)

# number of days to forecast (must match assignment requirements)
FORECAST_DAYS = 90   # change if instructor specifies different length

# generate a date range for the forecast horizon
forecast_dates = pd.date_range(
    start=forecast_start,
    periods=FORECAST_DAYS,
    freq='D'
)

# working copy of the history that grows by one row per forecast day.
# It is stored as float so predictions can be written into existing rows
# without changing the column dtype; daily_df itself is left untouched.
history = daily_df.astype({'Transactions': 'float64'})

# list to store predicted values
future_predictions = []

# recursive one-step-ahead forecast: each prediction becomes history for the
# next iteration
for date in forecast_dates:

    # Add the row for the day being forecast BEFORE building features, so the
    # lag and calendar features describe `date` itself. Taking the features
    # of the last existing row instead would describe the previous day and
    # shift every forecast by one day.
    # A numeric placeholder (the most recent value) is used rather than NaN
    # so that any feature derived from the current row stays finite; it is
    # overwritten with the real prediction below.
    history.loc[date] = history['Transactions'].iloc[-1]

    # feature row for `date`, without the (placeholder) target column
    latest_features = create_features(history).iloc[-1:]
    X_future = latest_features.drop('Transactions', axis=1)

    # predict the transactions for `date`
    prediction = final_model.predict(X_future)[0]
    future_predictions.append(prediction)

    # replace the placeholder with the prediction for the next iteration
    history.loc[date] = prediction


# Step 11. Write final output file

# group label, spelled exactly as required
GROUP_NAME = "Group 1"

# one row per forecast day, columns in the required order: the group label
# (repeated on every row), the date as YYYY-MM-DD text, the predicted amount
output_df = pd.DataFrame({
    "Group": [GROUP_NAME] * len(forecast_dates),
    "Date": forecast_dates.strftime("%Y-%m-%d"),
    "Total Predicted Amount": future_predictions,
})

# write the forecast to disk without the positional row index
output_df.to_csv("final_forecast_output.csv", index=False)

