In [12]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from openai import OpenAI
import os
from dotenv import load_dotenv

# Load API key
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Time-series models
from statsmodels.tsa.statespace.sarimax import SARIMAX
from prophet import Prophet  # optional
from xgboost import XGBRegressor  # optional


# Importing Data

In [3]:
import xml.etree.ElementTree as ET
import pandas as pd

#CPS historic data is only available as a part of an XML file

# Load and parse the XML file

def parse_cps_xml(path):
    xml_file_path = path
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    # Extract interval readings
    data_rows = []
    for reading in root.iter():
        if reading.tag.lower().endswith("intervalreading"):
            row_data = {}
            for elem in reading.iter():
                row_data[elem.tag.split('}')[-1]] = elem.text
            data_rows.append(row_data)

    # Convert to DataFrame
    df = pd.DataFrame(data_rows)

    # Convert Unix timestamp to datetime
    df['start'] = pd.to_datetime(df['start'].astype(int), unit='s')

    return df

cps1=parse_cps_xml("/Users/lukeofthehill/repos/silly-things/CPS_Electric_15_Minute_03-31-2025_07-26-2025_20250726154824606_6601803.xml")
cps2=parse_cps_xml("/Users/lukeofthehill/repos/silly-things/CPS_Electric_15_Minute_04-05-2024_10-03-2024_20250726154734293_6601803.xml")
cps3=parse_cps_xml("/Users/lukeofthehill/repos/silly-things/CPS_Electric_15_Minute_04-11-2023_10-09-2023_20250726154546331_6601803.xml")
cps4=parse_cps_xml("/Users/lukeofthehill/repos/silly-things/CPS_Electric_15_Minute_10-02-2024_04-01-2025_20250726154807176_6601803.xml")
cps5=parse_cps_xml("/Users/lukeofthehill/repos/silly-things/CPS_Electric_15_Minute_10-08-2023_04-06-2024_20250726154700270_6601803.xml")
cps6=parse_cps_xml("/Users/lukeofthehill/repos/silly-things/CPS_Electric_15_Minute_10-13-2022_04-12-2023_20250726154008166_6601803.xml")

cps=pd.concat([cps1,cps2,cps3,cps4,cps5,cps6])
cps=cps[['start','value']]
cps=cps.rename(columns={'start':'date','value':'amount'})
cps['amount']=cps['amount'].astype(float)
cps['date']=pd.to_datetime(cps['date']).dt.date
cps=cps.groupby('date')['amount'].sum().reset_index()
cps['kwh'] = cps['amount'].astype(float) / 1000
cps.head()

Unnamed: 0,date,amount,kwh
0,2022-10-14,30755.0,30.755
1,2022-10-15,43172.0,43.172
2,2022-10-16,17759.0,17.759
3,2022-10-17,7545.0,7.545
4,2022-10-18,6611.0,6.611


In [4]:
# Importing Data
temp=pd.read_csv("/Users/lukeofthehill/repos/silly-things/US Weather Data.csv")
temp=temp[temp['zipcode']==78232] # Keeping only my home county
temp['date'] = pd.to_datetime(temp['date'], format='%Y%m%d').dt.date # Converting to an actual
temp.head()

Unnamed: 0,st_abb,st_code,county_name,fips,zipcode,date,stability,tmin,tmax,tavg
40331,TX,48,Bexar,48029,78232,2022-01-01,stable,18.881,27.319,23.1
40332,TX,48,Bexar,48029,78232,2022-01-02,stable,-0.528,25.946,12.709
40333,TX,48,Bexar,48029,78232,2022-01-03,stable,-3.637,8.6,2.481
40334,TX,48,Bexar,48029,78232,2022-01-04,stable,-3.66,12.669,4.504
40335,TX,48,Bexar,48029,78232,2022-01-05,stable,2.056,18.852,10.454


In [5]:
# Home maintenance tasks
hw=pd.DataFrame({'task':["Electrical Panel Replacement",
                            'Dryer Outlet Replacement',
                            "Battery Panel Installation",
                            "AC Maintenance: Replace Temperature Sensor",
                            "Reinsulation",
                            "AC Maintenance: New Fan",
                            "Fence Replacement / Mulch",
                            "Replaced Windows",
                            "AC Maintenance: Hard Start Kit with/without Potential Relay",
                            "New Water Heater",
                            "AC Condensation Line Clog Work"],
                'date':['2022-11-04',
                    '2022-11-15',
                    '2023-09-22',
                    '2023-12-04',
                    '2024-02-28',
                    '2024-03-28',
                    '2025-04-01',
                    '2025-04-25',
                    '2025-04-29',
                    '2025-05-20',
                    '2025-07-11']})
hw['date']=pd.to_datetime(hw['date'],format='%Y-%m-%d').dt.date
hw.head()


Unnamed: 0,task,date
0,Electrical Panel Replacement,2022-11-04
1,Dryer Outlet Replacement,2022-11-15
2,Battery Panel Installation,2023-09-22
3,AC Maintenance: Replace Temperature Sensor,2023-12-04
4,Reinsulation,2024-02-28


In [6]:
# Combinind the data
df=pd.merge(cps,temp,'left', on='date')
df=pd.merge(df,hw,'left',on='date')
df['task'].fillna('No Work',inplace=True)
dummies=pd.get_dummies(df['task'],prefix='task')
dummies.columns = dummies.columns.str.replace(' ', '')
dummies.columns = dummies.columns.str.replace(':', '')
df = pd.concat([df, dummies], axis=1)
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['task'].fillna('No Work',inplace=True)


Unnamed: 0,date,amount,kwh,st_abb,st_code,county_name,fips,zipcode,stability,tmin,...,task_ACMaintenanceNewFan,task_ACMaintenanceReplaceTemperatureSensor,task_BatteryPanelInstallation,task_DryerOutletReplacement,task_ElectricalPanelReplacement,task_FenceReplacement/Mulch,task_NewWaterHeater,task_NoWork,task_Reinsulation,task_ReplacedWindows
0,2022-10-14,30755.0,30.755,TX,48.0,Bexar,48029.0,78232.0,stable,20.021,...,False,False,False,False,False,False,False,True,False,False
1,2022-10-15,43172.0,43.172,TX,48.0,Bexar,48029.0,78232.0,stable,20.702,...,False,False,False,False,False,False,False,True,False,False
2,2022-10-16,17759.0,17.759,TX,48.0,Bexar,48029.0,78232.0,stable,22.647,...,False,False,False,False,False,False,False,True,False,False
3,2022-10-17,7545.0,7.545,TX,48.0,Bexar,48029.0,78232.0,stable,18.304,...,False,False,False,False,False,False,False,True,False,False
4,2022-10-18,6611.0,6.611,TX,48.0,Bexar,48029.0,78232.0,stable,14.17,...,False,False,False,False,False,False,False,True,False,False


In [7]:
from datetime import date
import numpy as np
# df.fillna(0,inplace=True)
df=df[df['date']<date(2025, 7, 1)]

after_dt=None
def recode_task_sw(var):
    after_dt=df[df[var]==True]['date'].iloc[0]
    df[var]=np.where(df['date']>=after_dt,1,0)
recode_task_sw('task_ACMaintenanceHardStartKitwith/withoutPotentialRelay')
recode_task_sw('task_ACMaintenanceNewFan')
recode_task_sw('task_ACMaintenanceReplaceTemperatureSensor')
recode_task_sw('task_BatteryPanelInstallation')
recode_task_sw('task_Reinsulation')
recode_task_sw('task_ReplacedWindows')

df=df[['date','kwh','tmin', 'tmax', 'tavg',
       'task_ACMaintenanceHardStartKitwith/withoutPotentialRelay',
       'task_ACMaintenanceNewFan',
       'task_ACMaintenanceReplaceTemperatureSensor',
       'task_BatteryPanelInstallation',
       'task_Reinsulation',
       'task_ReplacedWindows']]

pre = df[df['date'] < date(2025, 1, 1)]
post = df[df['date'] >= date(2025, 1, 1)]



In [8]:
use_case = {
    "target": "kwh",
    "frequency": "daily",
    "forecast_horizon": 30,  # 30 days ahead
    "exog_features": ["tmax", "tmin"],
    "constraints": {"interpretability": True}
}

y = df[use_case["target"]]
exog = df[use_case["exog_features"]] if use_case["exog_features"] else None

In [21]:
def get_model_suggestions(use_case, sample_data):
    prompt = f"""
    Given the following use case: {use_case}
    and a sample of the data: {sample_data.head(10).to_dict()},
    suggest 2-3 forecasting model approaches (SARIMAX, Prophet, or ML-based).
    Provide Python code for each.
    """
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a time-series modeling assistant."},
            {"role": "user", "content": prompt}
        ]
    )

    return response.choices[0].message.content

model_suggestions = get_model_suggestions(use_case, df)
print(model_suggestions)

Given the use case with a target variable of "kwh", daily frequency, a 30-day forecast horizon, exogenous features ('tmax', 'tmin'), and an emphasis on interpretability, I suggest the following forecasting approaches:

1. **SARIMAX (Seasonal Autoregressive Integrated Moving Average with Exogenous Variables)**: This model is straightforward to interpret, handles seasonality, and can include exogenous variables.

2. **Facebook Prophet**: Designed for daily data with holidays or seasonality effects, easy to use, and offers components decomposition for interpretability.

3. **Random Forest Regressor**: A tree-based machine learning approach which, despite being less interpretable than the above, offers variable importance metrics to gauge the influence of predictors.

Here is Python code for each of these approaches:

### 1. SARIMAX
```python
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Assuming `data` is your DataFrame and has been preprocessed

# Define m

In [None]:
results = []

def evaluate_model(y_true, y_pred, model_name):
    return {
        "model": model_name,
        "MAE": mean_absolute_error(y_true, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred))
    }
 
# Example: Run SARIMAX (initial baseline)
train = y[:-use_case["forecast_horizon"]]
test = y[-use_case["forecast_horizon"]:]

sarimax = SARIMAX(train, order=(1,1,1), seasonal_order=(1,1,1,52), exog=exog[:-use_case["forecast_horizon"]])
sarimax_fit = sarimax.fit()
y_pred = sarimax_fit.forecast(steps=use_case["forecast_horizon"], exog=exog[-use_case["forecast_horizon"]:])
results.append(evaluate_model(test, y_pred, "SARIMAX"))

In [23]:

def refine_with_llm(results):
    prompt = f"""
    Model results: {results}.
    Suggest refinements or new models (include Python code).
    """

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a time-series modeling assistant."},
            {"role": "user", "content": prompt}
        ]
    )

    return response.choices[0].message.content

refinements = refine_with_llm(results)
print(refinements)

To improve your forecasting results, you can consider several strategies, including refining the current SARIMAX model, trying different configurations, or exploring alternative models. Below are some suggestions along with Python code snippets to guide your exploration:

### 1. Refining the SARIMAX Model

- **Hyperparameter Tuning**: Tweak the order of the SARIMAX model (p, d, q) and seasonal order (P, D, Q, s) to find a better fit.
- **Model Diagnostics**: Check residual diagnostics to ensure there is no autocorrelation and that residuals are white noise.

```python
import itertools
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np


# Assume you have a time series data `y` and a seasonal order `s`
# Define the d and D parameters based on prior knowledge of the data or testing
d = 1
D = 1
s = 12  # Example for monthly data

# Define p, q, P, and Q ranges
p = q = P = Q = range(0, 3)

# Generate

In [24]:
import pandas as pd
pd.DataFrame(results).sort_values("RMSE")

Unnamed: 0,model,MAE,RMSE
0,SARIMAX,5.566508,6.561775
