In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from prophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [7]:
data = pd.read_csv("outlier_removed.csv")
data

Unnamed: 0,ds,y,Max Temp (°C),Min Temp (°C),Mean Temp (°C),Sunshine Hours,Evaporation (mm),year,month,day
0,2014-01-01,0.0,34.0,23.0,28.5,7.2,4.0,2014,1,1
1,2014-01-02,0.0,35.0,24.0,29.5,6.6,4.0,2014,1,2
2,2014-01-03,0.0,32.0,23.0,27.5,7.2,6.0,2014,1,3
3,2014-01-04,0.0,33.0,21.0,27.0,6.1,4.0,2014,1,4
4,2014-01-05,0.0,31.5,22.0,26.8,7.3,4.0,2014,1,5
...,...,...,...,...,...,...,...,...,...,...
3647,2023-12-27,0.0,30.0,21.0,25.5,4.2,3.4,2023,12,27
3648,2023-12-28,0.0,31.0,22.0,26.5,5.5,4.1,2023,12,28
3649,2023-12-29,0.0,32.0,22.0,27.0,5.7,3.8,2023,12,29
3650,2023-12-30,0.0,32.0,21.0,26.5,4.1,3.2,2023,12,30


In [8]:
# Sample data setup (Ensure your 'data' dataframe is already defined)
# df = pd.read_csv('your_data.csv')

# Rename columns for Prophet
df = data.rename(columns={'DATE': 'ds', 'RAINFALL (mm)': 'y'})

# Split data into training (80%), testing (16%), and validation (4%) sets
train_size = 0.80
test_size = 0.16
validation_size = 0.04

train_end = int(train_size * len(df))
test_end = int((train_size + test_size) * len(df))

X_train = df.iloc[:train_end]
X_test = df.iloc[train_end:test_end]
X_val = df.iloc[test_end:]

In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Select numeric columns
numeric_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the numeric training data
scaler.fit(X_train[numeric_cols])

# Transform the numeric training data and convert back to DataFrame
X_train_scaled = pd.DataFrame(scaler.transform(X_train[numeric_cols]), columns=numeric_cols, index=X_train.index)

# Transform the numeric validation data and convert back to DataFrame
X_val_scaled = pd.DataFrame(scaler.transform(X_val[numeric_cols]), columns=numeric_cols, index=X_val.index)

# Transform the numeric test data and convert back to DataFrame
X_test_scaled = pd.DataFrame(scaler.transform(X_test[numeric_cols]), columns=numeric_cols, index=X_test.index)

# Combine the scaled numeric data with the non-numeric data
train_df = pd.concat([X_train[['ds']].reset_index(drop=True), X_train_scaled.reset_index(drop=True)], axis=1)
validation_df = pd.concat([X_val[['ds']].reset_index(drop=True), X_val_scaled.reset_index(drop=True)], axis=1)
test_df = pd.concat([X_test[['ds']].reset_index(drop=True), X_test_scaled.reset_index(drop=True)], axis=1)

In [14]:
# Initialize and fit the Prophet model on the training set
model = Prophet()
model.fit(train_df)

# Create a dataframe for future dates including the test and validation periods
# Note: Only the number of periods is passed to make_future_dataframe
future_periods = len(test_df) + len(validation_df)
future = model.make_future_dataframe(periods=future_periods)

# Predict future values
forecast = model.predict(future)

# Extract the forecasted values for the test period
forecast_test = forecast.iloc[train_end:train_end + len(test_df)]
predicted = forecast_test['yhat'].values

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(test_df['y'], predicted))

# Calculate MAE
mae = mean_absolute_error(test_df['y'], predicted)

# Calculate Mean Percentage Error (MPE)
def mean_percentage_error(y_true, y_pred):
    return np.mean((y_true - y_pred) / y_true) * 100

mpe = mean_percentage_error(test_df['y'].values, predicted)
print(f'RMSE: {rmse} \n MAE: {mae} \n Mean Percentage Error (MPE): {mpe:.2f}%')


13:47:52 - cmdstanpy - INFO - Chain [1] start processing
13:47:53 - cmdstanpy - INFO - Chain [1] done processing


RMSE: 0.7829010384831351 
 MAE: 0.43839537698043085 
 Mean Percentage Error (MPE): 86.31%


In [1]:
pip install streamlit


Note: you may need to restart the kernel to use updated packages.
