In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [39]:
# Load the dataset
path="/content/drive/MyDrive/Dataset/dataset_updated_preliminary.csv"
data = pd.read_csv(path)

# Display the first few rows of the dataset
print(data.head())


             timestamp    value  month  day  hour  minute day_of_week  \
0  2014-07-01 00:00:00  10844.0      7    1     0       0     Tuesday   
1  2014-07-01 00:30:00   8127.0      7    1     0      30     Tuesday   
2  2014-07-01 01:00:00   6210.0      7    1     1       0     Tuesday   
3  2014-07-01 01:30:00   4656.0      7    1     1      30     Tuesday   
4  2014-07-01 02:00:00   3820.0      7    1     2       0     Tuesday   

   isWeekend  isAnomaly Holiday  prep  snow  snow_depth  min_temp  max_temp  \
0      False      False     NaN   0.0   0.0         0.0      72.0      89.0   
1      False      False     NaN   NaN   NaN         NaN       NaN       NaN   
2      False      False     NaN   NaN   NaN         NaN       NaN       NaN   
3      False      False     NaN   NaN   NaN         NaN       NaN       NaN   
4      False      False     NaN   NaN   NaN         NaN       NaN       NaN   

   isHoliday  TotalMinutes  Season  
0      False             0  Summer  
1      False

In [40]:
#Remove the 'isAnomaly' column
data.drop(columns=['isAnomaly'], inplace=True)

#Filter out the data for only September and above (Summer not included)
#data = data[data['Season'] != 'Summer']

# Fill missing values using forward filling
data.fillna(method='ffill', inplace=True)

# Fill any remaining missing values using linear interpolation
data.interpolate(method='linear', inplace=True)

# Fill missing value in Holiday column
data['Holiday'].fillna("No Public Holiday", inplace=True)






  data.fillna(method='ffill', inplace=True)
  data.interpolate(method='linear', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Holiday'].fillna("No Public Holiday", inplace=True)


In [41]:

# Create cyclic features for hour and day of the week
data['hour_sin'] = np.sin(2 * np.pi * data['hour'] / 24)
data['hour_cos'] = np.cos(2 * np.pi * data['hour'] / 24)
data['day_of_week_sin'] = np.sin(2 * np.pi * data['day_of_week'].apply(lambda x: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'].index(x)) / 7)
data['day_of_week_cos'] = np.cos(2 * np.pi * data['day_of_week'].apply(lambda x: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'].index(x)) / 7)

# Create lag features (e.g., lag of value for the last 1, 2, and 3 periods)
for lag in range(1, 4):
    data[f'value_lag_{lag}'] = data['value'].shift(lag)

# Combine hour and minute into a single feature (Total Minutes)
data['TotalMinutes'] = data['hour'] * 60 + data['minute']

# Fill in the missing value in value_lag columns with 0
data['value_lag_1'].fillna(0, inplace=True)
data['value_lag_2'].fillna(0, inplace=True)
data['value_lag_3'].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['value_lag_1'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['value_lag_2'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

In [42]:
# Select features for modeling (excluding original timestamp and non-numeric columns)
features = ['value','month','day','hour','minute', 'snow', 'snow_depth', 'min_temp', 'max_temp',
            'hour_sin', 'hour_cos',
            'day_of_week_sin', 'day_of_week_cos',
            'TotalMinutes']

X = data[features]
y = data['value']  # Assuming we want to predict the value.


In [43]:
#from google.colab import drive
#drive.mount('/content/drive')

In [44]:
# Define a function for time-based splitting
def time_based_split(data, train_size=0.7, val_size=0.15):
    total_size = len(data)
    train_end = int(total_size * train_size)
    val_end = train_end + int(total_size * val_size)

    X_train = X.iloc[:train_end]
    y_train = y.iloc[:train_end]

    X_val = X.iloc[train_end:val_end]
    y_val = y.iloc[train_end:val_end]

    X_test = X.iloc[val_end:]
    y_test = y.iloc[val_end:]

    return X_train, X_val, X_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = time_based_split(data)


In [45]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit on training data only
X_train_scaled = scaler.fit_transform(X_train)

# Transform validation and test sets
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert scaled arrays back to DataFrames for better visualization
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=X_val.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)



In [46]:


#data = pd.concat([X_train_scaled_df], axis=1)



# Save the prepared dataset
data.to_csv("/content/drive/MyDrive/Dataset/Preprocessed_preliminary.csv")

data1 = pd.read_csv("/content/drive/MyDrive/Dataset/Preprocessed_preliminary.csv")

# Drop unnecessary columns
if 'Unnamed: 0' in data1.columns:
    data1 = data1.drop(columns=['Unnamed: 0'])
print(data1.head())

#print()
#print("Scaled Training Data:")
#print(X_train_scaled_df.head())



             timestamp    value  month  day  hour  minute day_of_week  \
0  2014-07-01 00:00:00  10844.0      7    1     0       0     Tuesday   
1  2014-07-01 00:30:00   8127.0      7    1     0      30     Tuesday   
2  2014-07-01 01:00:00   6210.0      7    1     1       0     Tuesday   
3  2014-07-01 01:30:00   4656.0      7    1     1      30     Tuesday   
4  2014-07-01 02:00:00   3820.0      7    1     2       0     Tuesday   

   isWeekend            Holiday  prep  ...  isHoliday  TotalMinutes  Season  \
0      False  No Public Holiday   0.0  ...      False             0  Summer   
1      False  No Public Holiday   0.0  ...      False            30  Summer   
2      False  No Public Holiday   0.0  ...      False            60  Summer   
3      False  No Public Holiday   0.0  ...      False            90  Summer   
4      False  No Public Holiday   0.0  ...      False           120  Summer   

   hour_sin  hour_cos  day_of_week_sin day_of_week_cos  value_lag_1  \
0  0.000000  1.