In [None]:
from io import StringIO

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Step 1: Data Cleaning
def clean_data(df_json):
    """Drop incomplete and duplicated rows from a JSON-encoded table.

    Parameters
    ----------
    df_json : str or path or file-like
        Records-oriented JSON text (as written by
        ``DataFrame.to_json(orient='records')``), or any other input that
        ``pandas.read_json`` accepts.

    Returns
    -------
    str
        Records-oriented JSON of the cleaned frame.
    """
    # Literal JSON text has to be wrapped in a buffer: pandas >= 2.1
    # deprecates passing raw JSON strings to read_json and newer releases
    # reject them. Paths and file-like objects are passed through unchanged.
    is_literal = isinstance(df_json, str) and df_json.lstrip()[:1] in ("[", "{")
    df = pd.read_json(StringIO(df_json) if is_literal else df_json)

    # Remove rows containing any missing value, then exact duplicate rows, so
    # the behaviour matches the status message printed below.
    df = df.dropna().drop_duplicates()

    print("Data cleaning complete: missing values and duplicates removed.")

    return df.to_json(orient='records', lines=False)

# Step 2: Feature Engineering
def engineer_features(df_json, window_size=6, lags=(2, 4, 6), group_col=None):
    """Add rolling-window and lag features for temperature, wind and humidity.

    For each of ``tempF``, ``windspeedMiles`` and ``humidity`` this adds
    ``<col>_rolling_mean`` and ``<col>_rolling_std`` over ``window_size`` rows
    and ``<col>_lag_<k>`` for every ``k`` in ``lags``. Rows that contain any
    missing value afterwards (including the warm-up rows of the windows and
    lags) are dropped.

    Parameters
    ----------
    df_json : str or path or file-like
        Records-oriented JSON text, or any other input ``pandas.read_json``
        accepts. Must contain the three base columns listed above.
    window_size : int, default 6
        Number of rows in each rolling window.
    lags : iterable of int, default (2, 4, 6)
        Row offsets used for the lag features.
    group_col : str or None, default None
        When None, windows and lags follow the row order of the whole frame,
        so rows belonging to different groups feed each other's features.
        When given, features are computed separately within each value of
        this column.

    Returns
    -------
    str
        Records-oriented JSON of the frame with the new feature columns.
    """
    # Literal JSON text has to be wrapped in a buffer: pandas >= 2.1
    # deprecates passing raw JSON strings to read_json and newer releases
    # reject them. Paths and file-like objects are passed through unchanged.
    is_literal = isinstance(df_json, str) and df_json.lstrip()[:1] in ("[", "{")
    df = pd.read_json(StringIO(df_json) if is_literal else df_json)

    base_cols = ('tempF', 'windspeedMiles', 'humidity')

    # Rolling statistics, column by column (mean first, then std) so the
    # output column order is the same as before.
    for col in base_cols:
        if group_col is None:
            window = df[col].rolling(window=window_size)
            df[f'{col}_rolling_mean'] = window.mean()
            df[f'{col}_rolling_std'] = window.std()
        else:
            by_group = df.groupby(group_col)[col]
            df[f'{col}_rolling_mean'] = by_group.transform(
                lambda s: s.rolling(window=window_size).mean()
            )
            df[f'{col}_rolling_std'] = by_group.transform(
                lambda s: s.rolling(window=window_size).std()
            )

    # Lag features: one set of three columns per requested offset.
    for lag in lags:
        for col in base_cols:
            source = df[col] if group_col is None else df.groupby(group_col)[col]
            df[f'{col}_lag_{lag}'] = source.shift(lag)

    # Windows and lags leave NaN in their warm-up rows; drop every row that
    # has a missing value in any column.
    df = df.dropna()
    print("Feature engineering complete: rolling and lag features added.")
    return df.to_json(orient='records', lines=False)

# Step 3: Add Cyclic Features
def add_cyclic_features(df_json):
    """Encode the calendar month of ``datetime`` as sine/cosine features.

    Adds three columns: ``datetime_new`` (the parsed ``datetime`` column),
    ``month_sin`` and ``month_cos`` (the month mapped onto a 12-step circle,
    rounded to six decimals). No ``month`` column is left in the output.

    Parameters
    ----------
    df_json : str or path or file-like
        Records-oriented JSON text, or any other input ``pandas.read_json``
        accepts. Must contain a ``datetime`` column.

    Returns
    -------
    str
        Records-oriented JSON of the frame with the added columns.
    """
    # Literal JSON text has to be wrapped in a buffer: pandas >= 2.1
    # deprecates passing raw JSON strings to read_json and newer releases
    # reject them. Paths and file-like objects are passed through unchanged.
    is_literal = isinstance(df_json, str) and df_json.lstrip()[:1] in ("[", "{")
    df = pd.read_json(StringIO(df_json) if is_literal else df_json)

    df['datetime_new'] = pd.to_datetime(df['datetime'])
    month = df['datetime_new'].dt.month

    # Map month 1..12 onto the unit circle so December and January are
    # neighbours; rounding keeps the serialised values short and stable.
    angle = 2 * np.pi * month / 12
    df['month_sin'] = np.round(np.sin(angle), decimals=6)
    df['month_cos'] = np.round(np.cos(angle), decimals=6)

    # The output never carries a raw 'month' column, even if the input had one.
    df = df.drop(columns=['month'], errors='ignore')
    print("Cyclic features added for month seasonality.")
    return df.to_json(orient='records', lines=False)

# Step 4: Normalize and Encode Data
def normalize_and_encode(df_json):
    """Min-max scale numeric columns and stringify categorical columns.

    Every numeric column except ``month_sin``/``month_cos`` is rescaled to
    [0, 1] using its own minimum and maximum. A column whose values are all
    equal is mapped to 0.0 rather than dividing by a zero range. The two
    cyclic columns are shifted from [-1, 1] to [0, 1]. Object-typed columns
    other than ``datetime`` are cast to ``str``; no label encoding is applied.

    Parameters
    ----------
    df_json : str or path or file-like
        Records-oriented JSON text, or any other input ``pandas.read_json``
        accepts. Must contain ``month_sin`` and ``month_cos`` columns.

    Returns
    -------
    str
        Records-oriented JSON of the transformed frame.
    """
    # Literal JSON text has to be wrapped in a buffer: pandas >= 2.1
    # deprecates passing raw JSON strings to read_json and newer releases
    # reject them. Paths and file-like objects are passed through unchanged.
    is_literal = isinstance(df_json, str) and df_json.lstrip()[:1] in ("[", "{")
    df = pd.read_json(StringIO(df_json) if is_literal else df_json)

    def _min_max(col):
        # Scale one column to [0, 1]; a constant column would give 0/0 = NaN,
        # so it is returned as zeros instead.
        lo = col.min()
        span = col.max() - lo
        if span == 0:
            return (col - lo).astype(float)
        return (col - lo) / span

    columns_to_normalize = df.select_dtypes(include=[np.number]).columns.difference(['month_sin', 'month_cos'])
    if len(columns_to_normalize):
        df[columns_to_normalize] = df[columns_to_normalize].apply(_min_max)

    # The cyclic features live in [-1, 1]; shift them into [0, 1].
    df['month_cos'] = (df['month_cos'] + 1) / 2
    df['month_sin'] = (df['month_sin'] + 1) / 2

    # Categorical columns are only coerced to strings here.
    for col in df.select_dtypes(include=['object']).columns:
        if col != 'datetime':
            df[col] = df[col].astype(str)

    print("Data normalization and encoding complete.")
    return df.to_json(orient='records', lines=False)

# Step 5: Feature Selection
def select_final_features(df_json):
    """Keep only the fixed list of modelling columns, in a fixed order.

    Parameters
    ----------
    df_json : str or path or file-like
        Records-oriented JSON text, or any other input ``pandas.read_json``
        accepts. Must contain every column named in ``selected_features``.

    Returns
    -------
    str
        Records-oriented JSON containing only the selected columns.

    Raises
    ------
    KeyError
        If any selected column is missing from the input.
    """
    # Literal JSON text has to be wrapped in a buffer: pandas >= 2.1
    # deprecates passing raw JSON strings to read_json and newer releases
    # reject them. Paths and file-like objects are passed through unchanged.
    is_literal = isinstance(df_json, str) and df_json.lstrip()[:1] in ("[", "{")
    df = pd.read_json(StringIO(df_json) if is_literal else df_json)

    selected_features = [
        'datetime', 'precipMM', 'weatherCode', 'visibility', 'HeatIndexF', 'WindChillF',
        'windspeedMiles', 'FeelsLikeF', 'tempF_rolling_mean', 'windspeedMiles_rolling_mean',
        'humidity_rolling_mean', 'value', 'pressure', 'pressureInches', 'cloudcover', 'uvIndex',
        'tempF_rolling_std', 'windspeedMiles_rolling_std', 'humidity_rolling_std',
        'tempF_lag_2', 'windspeedMiles_lag_2', 'humidity_lag_2',
        'tempF_lag_4', 'windspeedMiles_lag_4', 'humidity_lag_4',
        'tempF_lag_6', 'windspeedMiles_lag_6', 'humidity_lag_6',
        'month_sin', 'month_cos', 'subba-name', 'zone'
    ]
    df_selected = df[selected_features]
    print("Feature selection complete: selected features retained.")

    return df_selected.to_json(orient='records', lines=False)

In [3]:
# Load the merged zone/weather/demand table, force the identifier-like columns
# to plain strings, and serialise the frame as records-oriented JSON text for
# the JSON-in/JSON-out pipeline functions.
df = pd.read_csv("all_merged_zones_weather_demand_data.csv")
df = df.astype({'datetime': str, 'zone': str, 'subba-name': str})
df_json = df.to_json(orient='records', lines=False)

  df = pd.read_csv("all_merged_zones_weather_demand_data.csv")


In [4]:
# Run the preprocessing stages in order; every stage takes and returns
# records-oriented JSON text.
# NOTE: df_json is overwritten by each stage, so this cell is not idempotent;
# re-run the loading cell before running it again.
df_json = clean_data(df_json)
df_json = engineer_features(df_json)
df_json = add_cyclic_features(df_json)
df_json = normalize_and_encode(df_json)
df_json = select_final_features(df_json)

# Wrap the JSON text in a buffer: pandas >= 2.1 deprecates passing literal
# JSON strings to read_json and newer releases reject them.
df = pd.read_json(StringIO(df_json))
df.head()



Data cleaning complete: missing values and duplicates removed.
Feature engineering complete: rolling and lag features added.
Cyclic features added for month seasonality.


Unnamed: 0,datetime,tempF,windspeedMiles,weatherCode,precipMM,precipInches,humidity,visibility,visibilityMiles,pressure,...,humidity_lag_2,tempF_lag_4,windspeedMiles_lag_4,humidity_lag_4,tempF_lag_6,windspeedMiles_lag_6,humidity_lag_6,datetime_new,month_sin,month_cos
0,2019-01-01 00:00:00,37,8,356,2.8,0.1,96,10,6,1012,...,96,42,10,92,36,11,89,1546300800000,0.5,0.866025
1,2019-01-01 00:00:00,40,12,356,3.1,0.1,95,2,1,1013,...,84,45,10,94,40,9,97,1546300800000,0.5,0.866025
2,2019-01-01 01:00:00,45,9,356,2.7,0.1,96,6,3,1008,...,96,44,8,96,42,10,92,1546304400000,0.5,0.866025
3,2019-01-01 01:00:00,46,11,356,3.4,0.1,94,6,3,1009,...,95,34,11,84,45,10,94,1546304400000,0.5,0.866025
4,2019-01-01 01:00:00,37,8,356,2.9,0.1,97,9,5,1009,...,96,37,8,96,44,8,96,1546304400000,0.5,0.866025


In [8]:
# Persist the preprocessed frame; the positional index is not written.
df.to_csv(path_or_buf="data_preprocess.csv", index=False)