In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [7]:
data = pd.read_csv("Datasets Folder\dataset_updated.csv")

# Display the first few rows of the dataset
data.head()


Unnamed: 0,timestamp,value,month,day,hour,minute,day_of_week,isWeekend,isAnomaly,Holiday,prep,snow,snow_depth,min_temp,max_temp,isHoliday,TotalMinutes,season
0,2014-07-01 00:00:00,10844.0,7,1,0,0,Tuesday,False,False,,0.0,0.0,0.0,72.0,89.0,False,0,Summer
1,2014-07-01 00:30:00,8127.0,7,1,0,30,Tuesday,False,False,,,,,,,False,30,Summer
2,2014-07-01 01:00:00,6210.0,7,1,1,0,Tuesday,False,False,,,,,,,False,60,Summer
3,2014-07-01 01:30:00,4656.0,7,1,1,30,Tuesday,False,False,,,,,,,False,90,Summer
4,2014-07-01 02:00:00,3820.0,7,1,2,0,Tuesday,False,False,,,,,,,False,120,Summer


In [8]:
#Remove the 'isAnomaly' column
data.drop(columns=['isAnomaly'], inplace=True)

#Filter out the data for only September and above (Summer not included)
#data = data[data['Season'] != 'Summer']

# Fill missing values using forward filling
data.ffill(inplace=True)


# Fill any remaining missing values using linear interpolation
data = data.infer_objects()  # Convert object dtypes to appropriate types
data.interpolate(method='linear', inplace=True)


# Fill missing value in Holiday column
data['Holiday'] = data['Holiday'].fillna("No Public Holiday")

  data.interpolate(method='linear', inplace=True)


In [9]:

# Create cyclic features for hour and day of the week
data['hour_sin'] = np.sin(2 * np.pi * data['hour'] / 24)
data['hour_cos'] = np.cos(2 * np.pi * data['hour'] / 24)
data['day_of_week_sin'] = np.sin(2 * np.pi * data['day_of_week'].apply(lambda x: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'].index(x)) / 7)
data['day_of_week_cos'] = np.cos(2 * np.pi * data['day_of_week'].apply(lambda x: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'].index(x)) / 7)

# Create lag features (e.g., lag of value for the last 1, 2, and 3 periods)
for lag in range(1, 4):
    data[f'value_lag_{lag}'] = data['value'].shift(lag)

# Combine hour and minute into a single feature (Total Minutes)
data['TotalMinutes'] = data['hour'] * 60 + data['minute']

# Fill in the missing value in value_lag columns with 0
data['value_lag_1'] = data['value_lag_1'].fillna(0)
data['value_lag_2'] = data['value_lag_2'].fillna(0)
data['value_lag_3'] = data['value_lag_3'].fillna(0)


In [10]:
# Select features for modeling (excluding original timestamp and non-numeric columns)
features = ['value','month','day','hour','minute', 'snow', 'snow_depth', 'min_temp', 'max_temp',
            'hour_sin', 'hour_cos',
            'day_of_week_sin', 'day_of_week_cos',
            'TotalMinutes']

X = data[features]
y = data['value']  # Assuming we want to predict the value.


In [11]:
# from google.colab import drive
# drive.mount('/content/drive')

In [12]:
# Define a function for time-based splitting
def time_based_split(data, train_size=0.7, val_size=0.15):
    total_size = len(data)
    train_end = int(total_size * train_size)
    val_end = train_end + int(total_size * val_size)

    X_train = X.iloc[:train_end]
    y_train = y.iloc[:train_end]

    X_val = X.iloc[train_end:val_end]
    y_val = y.iloc[train_end:val_end]

    X_test = X.iloc[val_end:]
    y_test = y.iloc[val_end:]

    return X_train, X_val, X_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = time_based_split(data)


In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit on training data only
X_train_scaled = scaler.fit_transform(X_train)

# Transform validation and test sets
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert scaled arrays back to DataFrames for better visualization
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=X_val.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)



In [None]:

# copy and paste the scaled data
# data = pd.concat([X_train_scaled_df], axis=1)



# Save the prepared dataset
data.to_csv("Datasets Folder/data_preprocessed.csv")

data1 = pd.read_csv("Datasets Folder/data_preprocessed.csv")

# Drop unnecessary columns
if 'Unnamed: 0' in data1.columns:
    data1 = data1.drop(columns=['Unnamed: 0'])
data1

#print()
#print("Scaled Training Data:")
#print(X_train_scaled_df.head())



Unnamed: 0,timestamp,value,month,day,hour,minute,day_of_week,isWeekend,Holiday,prep,...,isHoliday,TotalMinutes,season,hour_sin,hour_cos,day_of_week_sin,day_of_week_cos,value_lag_1,value_lag_2,value_lag_3
0,2014-07-01 00:00:00,10844.0,7,1,0,0,Tuesday,False,No Public Holiday,0.00,...,False,0,Summer,0.000000,1.000000,0.781831,0.623490,0.0,0.0,0.0
1,2014-07-01 00:30:00,8127.0,7,1,0,30,Tuesday,False,No Public Holiday,0.00,...,False,30,Summer,0.000000,1.000000,0.781831,0.623490,10844.0,0.0,0.0
2,2014-07-01 01:00:00,6210.0,7,1,1,0,Tuesday,False,No Public Holiday,0.00,...,False,60,Summer,0.258819,0.965926,0.781831,0.623490,8127.0,10844.0,0.0
3,2014-07-01 01:30:00,4656.0,7,1,1,30,Tuesday,False,No Public Holiday,0.00,...,False,90,Summer,0.258819,0.965926,0.781831,0.623490,6210.0,8127.0,10844.0
4,2014-07-01 02:00:00,3820.0,7,1,2,0,Tuesday,False,No Public Holiday,0.00,...,False,120,Summer,0.500000,0.866025,0.781831,0.623490,4656.0,6210.0,8127.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10316,2015-01-31 22:00:00,25721.0,1,31,22,0,Saturday,True,Martin Luther King Jr. Day,0.00,...,False,1320,Winter,-0.500000,0.866025,-0.974928,-0.222521,24670.0,23719.0,23291.0
10317,2015-01-31 22:30:00,27309.0,1,31,22,30,Saturday,True,Martin Luther King Jr. Day,0.00,...,False,1350,Winter,-0.500000,0.866025,-0.974928,-0.222521,25721.0,24670.0,23719.0
10318,2015-01-31 23:00:00,26591.0,1,31,23,0,Saturday,True,Martin Luther King Jr. Day,0.00,...,False,1380,Winter,-0.258819,0.965926,-0.974928,-0.222521,27309.0,25721.0,24670.0
10319,2015-01-31 23:30:00,26288.0,1,31,23,30,Saturday,True,Martin Luther King Jr. Day,0.00,...,False,1410,Winter,-0.258819,0.965926,-0.974928,-0.222521,26591.0,27309.0,25721.0


: 