Preprocessing a time-series dataset.

In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd

# Preprocess a power-consumption time series: build a datetime index, fill
# gaps, resample to daily/monthly frequency, min-max scale, and plot each view.
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Load the dataset.
# Assumes a CSV file named 'time_series_data.csv' with columns 'Date', 'Time', and 'Power'.
data = pd.read_csv('time_series_data.csv')

# Combine the date and time columns into a single datetime column.
# NOTE(review): if 'Date' is stored day-first (dd-mm-yyyy), pass dayfirst=True
# here; the default parser may read ambiguous dates as month-first.
data['DateTime'] = pd.to_datetime(data['Date'] + ' ' + data['Time'])

# Replace the original 'Date'/'Time' columns with a sorted DateTime index.
data = data.drop(columns=['Date', 'Time']).set_index('DateTime').sort_index()

# Handling missing values: report the per-column count first.
missing_values = data.isnull().sum()
print("Missing Values:")
print(missing_values)

# Forward fill (carry the last valid observation forward). DataFrame.ffill()
# replaces the deprecated fillna(method='ffill') spelling. Values missing at
# the very start of the series have nothing to carry forward and stay NaN.
data = data.ffill()

# Resampling.
# Daily frequency: total power consumption per day.
data_resampled_daily = data.resample('D').sum()

# Month-end frequency: mean power consumption per month. The offset object is
# used instead of the 'M' string alias, which newer pandas deprecates.
data_resampled_monthly = data.resample(pd.offsets.MonthEnd()).mean()

# Scaling: min-max scale every column, keeping the original labels and index.
scaler = MinMaxScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
    index=data.index,
)

# Plot the original and preprocessed data, one figure per view.
plot_specs = [
    (data, 'Original Data', 'Original Power Consumption Data', 'Power (kWh)'),
    (data_resampled_daily, 'Daily Resampled Data',
     'Resampled Daily Power Consumption Data', 'Power (kWh)'),
    (data_resampled_monthly, 'Monthly Resampled Data',
     'Resampled Monthly Power Consumption Data', 'Power (kWh)'),
    (data_scaled, 'Scaled Data', 'Scaled Power Consumption Data', 'Scaled Power'),
]
for frame, label, title, ylabel in plot_specs:
    plt.figure(figsize=(15, 6))
    plt.plot(frame.index, frame['Power'], label=label)
    plt.title(title)
    plt.xlabel('DateTime')
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()


Detailed Python code (.ipynb) for Singular Spectrum Analysis of two months of electricity consumption data (power in kWh) to forecast the next month's power consumption, including preprocessing the data to fit the model. Show the accuracy of the forecast against the actual third month's power consumption data.

In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Singular Spectrum Analysis implementation
def ssa_forecast(series, window_size, forecast_steps, n_components=None):
    """Forecast a univariate series with recurrent Singular Spectrum Analysis.

    The series is embedded into an ``L x K`` trajectory (Hankel) matrix with
    ``L = window_size``, decomposed by SVD, reduced to its leading components,
    turned back into a smoothed series by diagonal averaging, and continued
    with the linear recurrence relation implied by the retained subspace.

    Args:
        series: 1-D sequence of finite numbers (list, array, or Series).
        window_size: Embedding length ``L``; must satisfy
            ``2 <= window_size < len(series)``.
        forecast_steps: Number of future values to produce (``>= 0``).
        n_components: Number of leading SVD components to keep. When ``None``
            the smallest leading group holding 95% of the squared singular
            value mass is used; this is only a heuristic, so pass an explicit
            value when the signal structure is known. The value is capped at
            the numerical rank and at ``window_size - 1``.

    Returns:
        ``numpy.ndarray`` of shape ``(forecast_steps,)`` with the forecasts,
        in the same scale as ``series``.

    Raises:
        ValueError: On an invalid window, a negative horizon, non-finite
            input, ``n_components < 1``, or when the retained subspace does
            not admit a linear recurrence.
    """
    values = np.asarray(series, dtype=float).ravel()
    n_obs = values.size

    if not 2 <= window_size < n_obs:
        raise ValueError(
            "window_size must satisfy 2 <= window_size < len(series); "
            f"got window_size={window_size}, len(series)={n_obs}"
        )
    if forecast_steps < 0:
        raise ValueError(f"forecast_steps must be >= 0; got {forecast_steps}")
    if n_components is not None and n_components < 1:
        raise ValueError(f"n_components must be >= 1; got {n_components}")
    if not np.isfinite(values).all():
        raise ValueError("series must not contain NaN or infinite values")

    # Embedding: column j of the trajectory matrix is values[j:j + L].
    n_cols = n_obs - window_size + 1
    lag_index = np.arange(window_size)[:, None] + np.arange(n_cols)[None, :]
    trajectory = values[lag_index]

    # Decomposition: the left singular vectors span the signal subspace.
    left_vectors, singular_values, _ = np.linalg.svd(
        trajectory, full_matrices=False
    )
    tolerance = singular_values[0] * max(trajectory.shape) * np.finfo(float).eps
    rank = int(np.sum(singular_values > tolerance))
    if rank == 0:
        # An all-zero series carries no structure; its continuation is zero.
        return np.zeros(forecast_steps)

    # Grouping: at most L - 1 components, otherwise the subspace is the whole
    # space and no recurrence can be derived from it.
    if n_components is None:
        energy = np.cumsum(singular_values ** 2)
        energy /= energy[-1]
        n_keep = int(np.searchsorted(energy, 0.95)) + 1
    else:
        n_keep = int(n_components)
    n_keep = min(n_keep, rank, window_size - 1)

    basis = left_vectors[:, :n_keep]
    last_row = basis[-1]
    verticality = float(last_row @ last_row)
    if verticality >= 1.0 - 1e-12:
        raise ValueError(
            "the selected components do not define a linear recurrence; "
            "try a different window_size or n_components"
        )
    # Recurrence coefficients, ordered oldest lag first, so that
    # x[t] = lrr @ x[t - L + 1:t] for every vector in the retained subspace.
    lrr = basis[:-1] @ last_row / (1.0 - verticality)

    # Reconstruction: project onto the subspace, then average anti-diagonals
    # (entry [i, j] of the trajectory matrix belongs to time index i + j).
    approximation = basis @ (basis.T @ trajectory)
    totals = np.zeros(n_obs)
    counts = np.zeros(n_obs)
    for lag in range(window_size):
        totals[lag:lag + n_cols] += approximation[lag]
        counts[lag:lag + n_cols] += 1.0

    # Recurrent forecasting: each new value feeds the following steps.
    extended = np.empty(n_obs + forecast_steps)
    extended[:n_obs] = totals / counts
    for pos in range(n_obs, n_obs + forecast_steps):
        extended[pos] = lrr @ extended[pos - window_size + 1:pos]
    return extended[n_obs:]

# Forecast the third month's consumption from the first two months with SSA
# and score the forecast against the third month's actual values.

# Load the data.
# Assumes a CSV file named 'electricity_data.csv' with columns 'Date' and 'Power'.
data = pd.read_csv('electricity_data.csv')
# read_csv does not parse dates by default, and the .dt accessor used below
# requires a datetime column, so convert explicitly.
# NOTE(review): pass dayfirst=True if the file stores dates as dd-mm-yyyy.
data['Date'] = pd.to_datetime(data['Date'])
# Keep observations in chronological order so the concatenated series is a
# proper time series.
data = data.sort_values('Date', kind='stable')

# Preprocess the data.
# Assumes 'Power' has no missing values; impute beforehand if it does.

# Split the data by calendar month (1 = first, 2 = second, 3 = held-out third).
month = data['Date'].dt.month
first_month_data = data.loc[month == 1, 'Power'].to_numpy()
second_month_data = data.loc[month == 2, 'Power'].to_numpy()
third_month_data = data.loc[month == 3, 'Power'].to_numpy()

# Concatenate the two training months.
two_months_data = np.concatenate([first_month_data, second_month_data])

# Scale the training data to [0, 1].
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(two_months_data.reshape(-1, 1)).flatten()

# Singular Spectrum Analysis (SSA)
window_size = 30  # Choose an appropriate window size
# Forecast one value per third-month observation so the forecast lines up with
# the actuals; fall back to the second month's length when no actuals exist.
forecast_steps = len(third_month_data) or len(second_month_data)
forecasted_values = ssa_forecast(scaled_data, window_size, forecast_steps)

# Inverse transform to get the forecasted values in the original scale.
forecasted_values = scaler.inverse_transform(forecasted_values.reshape(-1, 1)).flatten()

# Plot the original data and the forecast; the forecast starts right after
# the last training observation and has exactly forecast_steps points.
history_axis = np.arange(len(two_months_data))
forecast_axis = np.arange(len(two_months_data), len(two_months_data) + forecast_steps)
plt.figure(figsize=(10, 6))
plt.plot(history_axis, two_months_data, label='Original Data')
plt.plot(forecast_axis, forecasted_values, label='Forecasted Data')
plt.xlabel('Time')
plt.ylabel('Power Consumption (KWh)')
plt.title('Electricity Consumption Forecast')
plt.legend()
plt.show()

# Evaluate forecast accuracy against the third month's actual values.
if len(third_month_data):
    mse = mean_squared_error(third_month_data, forecasted_values[:len(third_month_data)])
    print(f"Mean Squared Error (MSE) between forecast and actual third month's data: {mse}")
else:
    print("No third-month data available; skipping forecast evaluation.")


.ipynb code to preprocess a time-series dataset with date [dd-mm-yyyy], time [hr-mins-seconds], and power consumed in kWh. Write the code with all possible preprocessing techniques, scenarios, and resampling. The preprocessed dataset should be usable with a Singular Spectrum Analysis model, ARIMA, and SARIMA to forecast the next month's power consumption based on the last two months' power consumption.

In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Resample the raw readings to hourly totals, then fit ARIMA and SARIMA on the
# first 80% of the hours and score both forecasts on the remaining 20%.

# Load the dataset
data = pd.read_csv('time_series_data.csv')

# Preprocessing the dataset
# Build the 'DateTime' index from the 'Date' and 'Time' columns. Dates are
# given as dd-mm-yyyy, so parse day-first; the default would read ambiguous
# dates such as 01-02-2023 as month-first.
# NOTE(review): confirm the separator used in 'Time' is one the parser accepts.
data['DateTime'] = pd.to_datetime(data['Date'] + ' ' + data['Time'], dayfirst=True)
data.set_index('DateTime', inplace=True)
# Drop 'Date' and 'Time' columns as they are no longer needed
data.drop(['Date', 'Time'], axis=1, inplace=True)
data = data.sort_index()

# Resample the data to hourly frequency. min_count=1 keeps hours without any
# reading as NaN instead of a misleading total of 0, so the forward fill below
# has gaps to fill. A Timedelta is used for the rule because the 'H' string
# alias is deprecated in newer pandas.
data_hourly = data.resample(pd.Timedelta(hours=1)).sum(min_count=1)

# Fill missing hours with the last valid observation (ffill() replaces the
# deprecated fillna(method='ffill') spelling).
data_hourly = data_hourly.ffill()

# Plot the resampled data
plt.figure(figsize=(10, 6))
plt.plot(data_hourly.index, data_hourly['Power'], label='Resampled Data')
plt.xlabel('Date')
plt.ylabel('Power Consumption (KWh)')
plt.title('Resampled Time Series Data')
plt.legend()
plt.show()

# Singular Spectrum Analysis (SSA)
# Implement SSA to decompose the time series into components
# You can use the previously provided SSA implementation

# ARIMA model
# Split the data chronologically into training and testing sets
train_size = int(len(data_hourly) * 0.8)
train, test = data_hourly.iloc[:train_size], data_hourly.iloc[train_size:]

# Fit ARIMA model
model_arima = ARIMA(train, order=(5,1,0))
arima_results = model_arima.fit()

# Forecast using ARIMA model. forecast() already returns the predicted values
# for every step; indexing it with [0] would keep only a single element.
arima_forecast = arima_results.forecast(steps=len(test))

# SARIMA model
# Fit SARIMA model (seasonal period of 24 hourly observations)
model_sarima = SARIMAX(train, order=(1, 1, 1), seasonal_order=(1, 1, 1, 24))
sarima_results = model_sarima.fit()

# Forecast using SARIMA model
sarima_forecast = sarima_results.forecast(steps=len(test))

# Evaluate forecast accuracy
mse_arima = mean_squared_error(test, arima_forecast)
mse_sarima = mean_squared_error(test, sarima_forecast)
print(f"MSE for ARIMA: {mse_arima}")
print(f"MSE for SARIMA: {mse_sarima}")
