In [5]:
import json
from io import StringIO

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Step 1: Data Cleaning
def clean_data(df_json):
    """Drop incomplete and duplicated rows from a JSON-encoded table.

    Args:
        df_json: JSON string holding the table, e.g. the output of
            ``DataFrame.to_json(orient='records')``.

    Returns:
        JSON string (``orient='records'``) containing only rows without
        missing values, with exact duplicate rows reduced to their first
        occurrence.
    """
    # Passing a literal JSON string to read_json is deprecated since
    # pandas 2.1; a file-like wrapper works on every version.
    df = pd.read_json(StringIO(df_json))

    # The completion message promises both steps; previously only the
    # missing-value filter was applied and duplicates were kept.
    df = df.dropna().drop_duplicates()

    print("Data cleaning complete: missing values and duplicates removed.")

    return df.to_json(orient='records', lines=False)

# Step 2: Feature Engineering
def engineer_features(df_json, window_size=6, lags=(2, 4, 6)):
    """Add rolling-window statistics and lagged copies of the weather columns.

    For each of ``tempF``, ``windspeedMiles`` and ``humidity`` this adds
    ``<col>_rolling_mean`` and ``<col>_rolling_std`` over ``window_size`` rows,
    and ``<col>_lag_<k>`` for every ``k`` in ``lags``. Rows that contain any
    missing value afterwards (including the warm-up rows at the top of the
    table) are dropped.

    NOTE(review): windows and lags are taken over consecutive rows of the
    table as given; nothing here groups by zone or sorts by time. Confirm the
    input ordering is what the model expects.

    Args:
        df_json: JSON string holding the table; must contain the three
            columns named above.
        window_size: Number of rows in each rolling window.
        lags: Row offsets for which lagged columns are created.

    Returns:
        JSON string (``orient='records'``) of the augmented table.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Passing a literal JSON string to read_json is deprecated since
    # pandas 2.1; a file-like wrapper works on every version.
    df = pd.read_json(StringIO(df_json))
    base_columns = ('tempF', 'windspeedMiles', 'humidity')

    # Mean and std are emitted per column, in this order, so the resulting
    # column layout stays stable for downstream consumers.
    for col in base_columns:
        window = df[col].rolling(window=window_size)
        df[f'{col}_rolling_mean'] = window.mean()
        df[f'{col}_rolling_std'] = window.std()

    for lag in lags:
        for col in base_columns:
            df[f'{col}_lag_{lag}'] = df[col].shift(lag)

    df = df.dropna()
    print("Feature engineering complete: rolling and lag features added.")
    return df.to_json(orient='records', lines=False)

# Step 3: Add Cyclic Features
def add_cyclic_features(df_json):
    """Encode the calendar month of each row as sine/cosine features.

    Adds three columns to the table:

    * ``datetime_new`` - the ``datetime`` column parsed with
      ``pd.to_datetime``;
    * ``month_sin`` / ``month_cos`` - ``sin`` and ``cos`` of
      ``2 * pi * month / 12``, rounded to six decimals, so that December and
      January end up adjacent on the unit circle.

    Args:
        df_json: JSON string holding the table; must contain a ``datetime``
            column.

    Returns:
        JSON string (``orient='records'``) of the augmented table.

    Raises:
        KeyError: If the ``datetime`` column is missing.
    """
    # Passing a literal JSON string to read_json is deprecated since
    # pandas 2.1; a file-like wrapper works on every version.
    df = pd.read_json(StringIO(df_json))

    df['datetime_new'] = pd.to_datetime(df['datetime'])
    month = df['datetime_new'].dt.month
    df['month_sin'] = np.round(np.sin(2 * np.pi * month / 12), decimals=6)
    df['month_cos'] = np.round(np.cos(2 * np.pi * month / 12), decimals=6)

    # Earlier versions staged the month in a temporary 'month' column and
    # removed it afterwards; keep dropping any column of that name so the
    # output schema is unchanged.
    df = df.drop(columns=['month'], errors='ignore')

    print("Cyclic features added for month seasonality.")
    return df.to_json(orient='records', lines=False)

# Step 4: Normalize and Encode Data
def normalize_and_encode(df_json, min_max_path='min_max_map.json'):
    """Min-max scale the numeric columns and rescale the cyclic features.

    Every numeric column except ``month_sin`` / ``month_cos`` is scaled to
    ``[0, 1]`` using its own minimum and maximum. The per-column bounds are
    written to ``min_max_path`` as JSON (``{column: {"min": ..., "max": ...}}``)
    so the scaling can be inverted or re-applied later. ``month_sin`` and
    ``month_cos`` are mapped from ``[-1, 1]`` to ``[0, 1]``. Object-typed
    columns other than ``datetime`` are cast to ``str``; no label encoding is
    applied.

    Args:
        df_json: JSON string holding the table; must contain ``month_sin``
            and ``month_cos``.
        min_max_path: File the min/max map is written to. Defaults to
            ``'min_max_map.json'`` in the current working directory.

    Returns:
        JSON string (``orient='records'``) of the scaled table.

    Raises:
        KeyError: If ``month_sin`` or ``month_cos`` is missing.
    """
    # Passing a literal JSON string to read_json is deprecated since
    # pandas 2.1; a file-like wrapper works on every version.
    df = pd.read_json(StringIO(df_json))

    # The cyclic features already live in a fixed range and are handled
    # separately below.
    columns_to_normalize = (
        df.select_dtypes(include=[np.number])
        .columns.difference(['month_sin', 'month_cos'])
    )

    min_max_map = {}
    for col in columns_to_normalize:
        min_value = float(df[col].min())
        max_value = float(df[col].max())
        min_max_map[col] = {"min": min_value, "max": max_value}

        shifted = df[col] - min_value
        value_range = max_value - min_value
        # A constant column has zero range; dividing would turn every value
        # into NaN (0 / 0), so such a column is mapped to 0.0 instead.
        df[col] = shifted / value_range if value_range != 0 else shifted

    df['month_cos'] = (df['month_cos'] + 1) / 2
    df['month_sin'] = (df['month_sin'] + 1) / 2

    with open(min_max_path, 'w') as f:
        json.dump(min_max_map, f)

    for col in df.select_dtypes(include=['object']).columns:
        if col != 'datetime':
            df[col] = df[col].astype(str)

    print("Data normalization and encoding complete.")
    return df.to_json(orient='records', lines=False)

# Step 5: Feature Selection
def select_final_features(df_json, selected_features=None):
    """Keep only the columns that form the final feature set.

    Args:
        df_json: JSON string holding the table.
        selected_features: Optional list of column names to keep, in output
            order. When omitted, the default 32-column feature set defined
            below is used.

    Returns:
        JSON string (``orient='records'``) containing just the selected
        columns, in the order listed.

    Raises:
        KeyError: If any selected column is absent from the table.
    """
    if selected_features is None:
        selected_features = [
            'datetime', 'precipMM', 'weatherCode', 'visibility', 'HeatIndexF', 'WindChillF',
            'windspeedMiles', 'FeelsLikeF', 'tempF_rolling_mean', 'windspeedMiles_rolling_mean',
            'humidity_rolling_mean', 'value', 'pressure', 'pressureInches', 'cloudcover', 'uvIndex',
            'tempF_rolling_std', 'windspeedMiles_rolling_std', 'humidity_rolling_std',
            'tempF_lag_2', 'windspeedMiles_lag_2', 'humidity_lag_2',
            'tempF_lag_4', 'windspeedMiles_lag_4', 'humidity_lag_4',
            'tempF_lag_6', 'windspeedMiles_lag_6', 'humidity_lag_6',
            'month_sin', 'month_cos', 'subba-name', 'zone'
        ]

    # Passing a literal JSON string to read_json is deprecated since
    # pandas 2.1; a file-like wrapper works on every version.
    df = pd.read_json(StringIO(df_json))

    df_selected = df[list(selected_features)]
    print("Feature selection complete: selected features retained.")

    return df_selected.to_json(orient='records', lines=False)

In [6]:
# Load the raw table and cast the timestamp and the two region identifier
# columns to strings before serialising the frame for the pipeline.
df = pd.read_csv("data_raw.csv").astype(
    {'datetime': str, 'zone': str, 'subba-name': str}
)
df_json = df.to_json(orient='records', lines=False)

  df = pd.read_csv("data_raw.csv")


In [7]:
# Run the preprocessing stages in order; each one takes and returns a JSON
# string. df_json is overwritten here, so re-run the loading cell before
# re-running this one.
for step in (
    clean_data,
    engineer_features,
    add_cyclic_features,
    normalize_and_encode,
    select_final_features,
):
    df_json = step(df_json)

# Passing a literal JSON string to read_json is deprecated since pandas 2.1;
# a file-like wrapper works on every version.
df = pd.read_json(StringIO(df_json))
df.head()



Data cleaning complete: missing values and duplicates removed.
Feature engineering complete: rolling and lag features added.
Cyclic features added for month seasonality.
Data normalization and encoding complete.
Feature selection complete: selected features retained.


Unnamed: 0,datetime,precipMM,weatherCode,visibility,HeatIndexF,WindChillF,windspeedMiles,FeelsLikeF,tempF_rolling_mean,windspeedMiles_rolling_mean,...,tempF_lag_4,windspeedMiles_lag_4,humidity_lag_4,tempF_lag_6,windspeedMiles_lag_6,humidity_lag_6,month_sin,month_cos,subba-name,zone
0,2019-01-01 00:00:00,0.052731,0.861702,1.0,0.402878,0.457516,0.2,0.4375,0.422594,0.327044,...,0.462121,0.25,0.916667,0.416667,0.275,0.885417,0.75,0.933012,ISNE - New Hampshire,4002
1,2019-01-01 00:00:00,0.05838,0.861702,0.2,0.42446,0.477124,0.3,0.45625,0.422594,0.345912,...,0.484848,0.25,0.9375,0.44697,0.225,0.96875,0.75,0.933012,ISNE - Northeast Mass.,4008
2,2019-01-01 01:00:00,0.050847,0.861702,0.6,0.460432,0.522876,0.225,0.5,0.426778,0.339623,...,0.477273,0.2,0.958333,0.462121,0.25,0.916667,0.75,0.933012,ISNE - Connecticut,4004
3,2019-01-01 01:00:00,0.06403,0.861702,0.6,0.467626,0.529412,0.275,0.50625,0.428173,0.345912,...,0.401515,0.275,0.833333,0.484848,0.25,0.9375,0.75,0.933012,ISNE - Rhode Island,4005
4,2019-01-01 01:00:00,0.054614,0.861702,0.9,0.402878,0.464052,0.2,0.44375,0.41841,0.345912,...,0.424242,0.2,0.958333,0.477273,0.2,0.958333,0.75,0.933012,ISNE - New Hampshire,4002


In [4]:
# Summarise column dtypes and non-null counts of the preprocessed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1116561 entries, 0 to 1116560
Data columns (total 32 columns):
 #   Column                       Non-Null Count    Dtype         
---  ------                       --------------    -----         
 0   datetime                     1116561 non-null  datetime64[ns]
 1   precipMM                     1116561 non-null  float64       
 2   weatherCode                  1116561 non-null  float64       
 3   visibility                   1116561 non-null  float64       
 4   HeatIndexF                   1116561 non-null  float64       
 5   WindChillF                   1116561 non-null  float64       
 6   windspeedMiles               1116561 non-null  float64       
 7   FeelsLikeF                   1116561 non-null  float64       
 8   tempF_rolling_mean           1116561 non-null  float64       
 9   windspeedMiles_rolling_mean  1116561 non-null  float64       
 10  humidity_rolling_mean        1116561 non-null  float64       
 11  value      

In [5]:
# Persist the preprocessed frame; index=False leaves the row index out of the file.
df.to_csv("data_preprocess.csv", index=False)