In [None]:
import re
import pandas as pd

def remove_unnamed_columns(df):
    """Drop auto-generated ``Unnamed: N`` columns and renumber the rows.

    A column is removed when its label is a string that starts with
    ``Unnamed:`` followed by optional whitespace and at least one digit
    (e.g. ``"Unnamed: 0"``). Non-string labels are never removed.

    Note:
        The index of the result is always a fresh ``0..n-1`` range and the
        previous index values are discarded. If the index carries data
        (for example a date index), call this function *before*
        ``set_index``.

    Args:
        df: DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame without the ``Unnamed: N`` columns and with a
        reset index.
    """
    # Only string labels can be "Unnamed: N". Checking the type first keeps
    # re.match from raising TypeError on integer or tuple column labels.
    unnamed_columns = [
        col for col in df.columns
        if isinstance(col, str) and re.match(r'Unnamed:\s*\d+', col)
    ]

    # If there are such columns, drop them
    if unnamed_columns:
        df = df.drop(columns=unnamed_columns)

    # Renumber rows 0..n-1; the old index is dropped, not kept as a column
    return df.reset_index(drop=True)

# Usage example:
# df = remove_unnamed_columns(your_dataframe)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the aggregated time series data
data = pd.read_csv('./data/aggregated_data.csv')

# Drop stray "Unnamed: N" columns first. remove_unnamed_columns() resets the
# index and discards the old one, so it has to run before 'Date' becomes the
# index; otherwise the dates are lost and the x-axis shows row numbers.
data = remove_unnamed_columns(data)

# Specify the date format 'DD/MM/YYYY' explicitly
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')

# Set 'Date' as the index for time series data
data = data.set_index('Date')

plt.figure(figsize=(12, 6))
sns.lineplot(x=data.index, y='TotalPrice', data=data, color='blue', marker='o', linestyle='-', markersize=5)
plt.title('Time Series Data')
plt.xlabel('Date')
plt.ylabel('TotalPrice')
plt.grid(True)  # Add grid lines
plt.legend(['TotalPrice'])
plt.tight_layout()  # Ensure labels and plot elements fit nicely
plt.show()


In [None]:
import pandas as pd
import numpy as np
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Load the data and drop stray "Unnamed: N" columns
data = pd.read_csv('./data/aggregated_data.csv')
data = remove_unnamed_columns(data)

# Parse the 'DD/MM/YYYY' dates once, on the full frame, instead of assigning
# into slices of `data` afterwards (which triggers SettingWithCopyWarning).
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')

# The split below is positional, so the rows must be in chronological order.
# mergesort is stable: rows sharing a date keep their original file order.
data = data.sort_values('Date', kind='mergesort').reset_index(drop=True)

# Specify the training and testing split percentage
train_percent = 0.8  # 80% for training, 20% for testing
total_days = len(data)

# Number of leading rows that go to the training set
train_rows = int(total_days * train_percent)

# Training rows use the 'ds' / 'y' column names that Prophet expects;
# rename() returns a new frame, so `data` itself keeps its column names.
train_data = data.iloc[:train_rows].rename(columns={'Date': 'ds', 'TotalPrice': 'y'})

# Test rows keep the original column names and are renumbered from 0
test_data = data.iloc[train_rows:].reset_index(drop=True)


In [None]:
# Create and fit the Prophet model using training data
model = Prophet()
model.fit(train_data)

# Last date the model was trained on
max_train_date = train_data['ds'].max()

# Forecast at least one year past the training data. If the held-out period
# is longer than that, extend the horizon so every test date gets a
# prediction; otherwise the evaluation merge would produce NaN forecasts.
# make_future_dataframe uses a daily frequency by default.
test_span_days = (test_data['Date'].max() - max_train_date).days
future = model.make_future_dataframe(periods=max(365, test_span_days))
forecast = model.predict(future)

# Keep only forecast rows from the last training date onwards whose date
# also occurs in the test set.
in_test_window = (forecast['ds'] >= max_train_date) & forecast['ds'].isin(test_data['Date'])
eval_forecast = forecast[in_test_window].reset_index(drop=True)
test_data = test_data.reset_index(drop=True)






In [None]:
import os

# Probe ./exp/prophet_1, ./exp/prophet_2, ... and stop at the first path that
# does not exist yet; that path becomes this run's output directory.
exp_number = 1
exp_directory = f"./exp/prophet_{exp_number}"
while os.path.exists(exp_directory):
    exp_number += 1
    exp_directory = f"./exp/prophet_{exp_number}"

# Create the experiment directory (and any missing parent directories)
os.makedirs(exp_directory)

# Persist the full forecast and both data splits. to_csv is called with its
# defaults, so each frame's index is written as the first column.
for frame, csv_name in (
    (forecast, 'prophet_forecast.csv'),
    (train_data, 'train_data.csv'),
    (test_data, 'test_data.csv'),
):
    frame.to_csv(exp_directory + '/' + csv_name)

In [None]:
import matplotlib.pyplot as plt



# Attach the forecast to each test row. A left merge keeps every test row, so
# test dates the forecast does not cover end up with NaN in 'yhat'.
merged_data = pd.merge(test_data, eval_forecast[['ds', 'yhat']], left_on='Date', right_on='ds', how='left')

plt.figure(figsize=(12, 6))
plt.plot(test_data['Date'], test_data['TotalPrice'], label='Actual', color='b')
plt.plot(merged_data['ds'], merged_data['yhat'], label='Predicted', color='r')
plt.title('Actual vs. Predicted Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend()
plt.savefig(os.path.join(exp_directory, 'evaluate_plot.png'), dpi=500, bbox_inches='tight')  # Save the plot
plt.show()

# Score only rows that have both an actual value and a prediction: the
# sklearn metrics raise on NaN input.
scored = merged_data.dropna(subset=['TotalPrice', 'yhat'])
skipped_rows = len(merged_data) - len(scored)
if skipped_rows:
    print(f"Skipped {skipped_rows} test row(s) with a missing actual value or forecast.")
if scored.empty:
    raise ValueError("No test rows have a matching forecast; cannot compute evaluation metrics.")

# Calculate evaluation metrics
mae = mean_absolute_error(scored['TotalPrice'], scored['yhat'])
mse = mean_squared_error(scored['TotalPrice'], scored['yhat'])
rmse = np.sqrt(mse)

# Absolute percentage error per scored row. Both operands come from the same
# merged frame, so they are guaranteed to describe the same row.
ape = ((scored['TotalPrice'] - scored['yhat']) / scored['TotalPrice']).abs() * 100
# A zero actual makes the percentage error undefined (inf or NaN); mask those
# rows so they do not turn the mean into inf.
ape = ape.where(scored['TotalPrice'] != 0)

# Keep the per-row values on test_data as before. Assignment aligns on the
# index; rows that were not scored get NaN.
test_data['APE'] = ape

# Mean of the APE values (NaN entries are skipped) gives MAPE
mape = ape.mean()

metric_lines = [
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%",
]
print("\n".join(metric_lines))

# Save the evaluation metrics to a text file in exp_directory
with open(os.path.join(exp_directory, 'evaluation_metrics.txt'), 'w') as file:
    file.write(f"Train, Test split: {train_percent*100}% : {100-train_percent*100}%\n")
    file.writelines(line + "\n" for line in metric_lines)
    
    

In [None]:
import matplotlib.pyplot as plt

# Draw the model's forecast plot on a 16x8 figure with custom axis labels
fig = model.plot(
    forecast,
    xlabel='Year',
    ylabel='Total Price',
    figsize=(16, 8),
)
plt.title('Yearly Sales Forecast', fontsize=16)

# Write the figure into the experiment directory, then display it
sales_plot_path = os.path.join(exp_directory, 'sales_plot.png')
plt.savefig(sales_plot_path, dpi=500, bbox_inches='tight')
plt.show()


In [None]:
import pandas as pd

# For each frame: make sure its date column is datetime, then tag every row
# with the calendar year and month of that date. The frames are updated in
# place (they gain 'Year' and 'Month' columns).
for frame, date_col in ((forecast, 'ds'), (train_data, 'ds'), (test_data, 'Date')):
    frame[date_col] = pd.to_datetime(frame[date_col])
    frame['Year'] = frame[date_col].dt.year
    frame['Month'] = frame[date_col].dt.month

# Average the value column within each (Year, Month) group
month_keys = ['Year', 'Month']
df_monthly_total_forecast = forecast.groupby(month_keys)['yhat'].mean().reset_index()
df_monthly_total_train = train_data.groupby(month_keys)['y'].mean().reset_index()
df_monthly_total_test = test_data.groupby(month_keys)['TotalPrice'].mean().reset_index()

# Give all three monthly frames the same column names
for monthly in (df_monthly_total_forecast, df_monthly_total_train, df_monthly_total_test):
    monthly.columns = ['Year', 'Month', 'TotalPrice']


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Inputs: df_monthly_total_train, df_monthly_total_test and
# df_monthly_total_forecast, each with columns ['Year', 'Month', 'TotalPrice'].

# Stack the monthly train and test aggregates into one "actual" series.
# NOTE(review): if the train/test split falls inside a month, that month
# appears twice here (one row per split) and seaborn's lineplot aggregates
# rows that share an x value -- confirm this is acceptable.
combined_data = pd.concat([df_monthly_total_train, df_monthly_total_test], ignore_index=True)

# Represent each (Year, Month) pair as the first day of that month
combined_data['Date'] = pd.to_datetime(combined_data[['Year', 'Month']].assign(day=1))
df_monthly_total_forecast['Date'] = pd.to_datetime(df_monthly_total_forecast[['Year', 'Month']].assign(day=1))

# Sort the data by Date
combined_data = combined_data.sort_values('Date')
df_monthly_total_forecast = df_monthly_total_forecast.sort_values('Date')

# Initialize the plot
plt.figure(figsize=(12, 6))

# Plot actual values and the forecast on the same axes
sns.lineplot(data=combined_data, x='Date', y='TotalPrice', label='Actual', color='dodgerblue', linewidth=2)
sns.lineplot(data=df_monthly_total_forecast, x='Date', y='TotalPrice', label='Forecast', color='tomato', linewidth=2)

# Customize the plot
plt.title('Monthly Total Price Time Series', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Total Price', fontsize=12)
plt.xticks(rotation=45)
plt.legend(fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)

# Save the plot into the experiment directory
plt.savefig(os.path.join(exp_directory, 'mean_quartile_plot.png'), dpi=500, bbox_inches='tight')

# Show the plot
plt.show()
