In [2]:
# Install the prophet package the first time this notebook is used (run once per environment)
# %pip install prophet

In [3]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from prophet import Prophet
from sklearn.metrics import mean_squared_error, r2_score
import datetime

## Load the Data

In [4]:
# Load the raw CPI workbook. header=11 tells pandas to take the column names
# from zero-based row 11 of the sheet; the rows above it are not read as data.
file_path = Path("./Resource/CPI_report_origin.xlsx")
df_CPI = pd.read_excel(io=file_path, header=11)

# Show the loaded table
df_CPI

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual,HALF1,HALF2
0,1913,9.800,9.800,9.800,9.800,9.700,9.800,9.900,9.900,10.000,10.000,10.100,10.000,9.900,,
1,1914,10.000,9.900,9.900,9.800,9.900,9.900,10.000,10.200,10.200,10.100,10.200,10.100,10.000,,
2,1915,10.100,10.000,9.900,10.000,10.100,10.100,10.100,10.100,10.100,10.200,10.300,10.300,10.100,,
3,1916,10.400,10.400,10.500,10.600,10.700,10.800,10.800,10.900,11.100,11.300,11.500,11.600,10.900,,
4,1917,11.700,12.000,12.000,12.600,12.800,13.000,12.800,13.000,13.300,13.500,13.500,13.700,12.800,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,2019,251.712,252.776,254.202,255.548,256.092,256.143,256.571,256.558,256.759,257.346,257.208,256.974,255.657,254.412,256.903
107,2020,257.971,258.678,258.115,256.389,256.394,257.797,259.101,259.918,260.280,260.388,260.229,260.474,258.811,257.557,260.065
108,2021,261.582,263.014,264.877,267.054,269.195,271.696,273.003,273.567,274.310,276.589,277.948,278.802,270.970,266.236,275.703
109,2022,281.148,283.716,287.504,289.109,292.296,296.311,296.276,296.171,296.808,298.012,297.711,296.797,292.655,288.347,296.963


## Prepare the Data

In [5]:
# Build one month-end date for every month of every year in the CPI table.
# Month-end dates are used so the history lines up with the dates produced by
# Prophet's make_future_dataframe(freq="M"), which are also month ends.
years = df_CPI['Year']
date = []
for year in years:
    for j in range(1, 13):
        # The day before the first of the following month is the last day of
        # month j. This follows the real calendar, so leap years (including
        # century years such as 1900 and 2100) need no special-casing.
        if j == 12:
            first_of_next_month = datetime.date(int(year) + 1, 1, 1)
        else:
            first_of_next_month = datetime.date(int(year), j + 1, 1)
        date.append(first_of_next_month - datetime.timedelta(days=1))

In [6]:
# Keep only the monthly columns, then flatten them row by row (year by year)
# into a single (n_months, 1) column vector
monthly_only = df_CPI.drop(columns=["Year", "Annual", "HALF1", "HALF2"])
cpi_value = monthly_only.to_numpy().reshape(-1, 1)

In [7]:
# Assemble the long-format table: one row per month-end date with its CPI value
df_reorganized_CPI = pd.DataFrame({"Date": date})
df_reorganized_CPI["CPI Value"] = cpi_value
df_reorganized_CPI

Unnamed: 0,Date,CPI Value
0,1913-01-31,9.800
1,1913-02-28,9.800
2,1913-03-31,9.800
3,1913-04-30,9.800
4,1913-05-31,9.700
...,...,...
1327,2023-08-31,307.026
1328,2023-09-30,307.789
1329,2023-10-31,307.671
1330,2023-11-30,307.051


In [8]:
# Inspect column dtypes and non-null counts before converting the Date column
df_reorganized_CPI.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1332 entries, 0 to 1331
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       1332 non-null   object 
 1   CPI Value  1332 non-null   float64
dtypes: float64(1), object(1)
memory usage: 20.9+ KB


In [9]:
# Convert the Date column from Python date objects to pandas datetime64 values
df_reorganized_CPI["Date"] = pd.to_datetime(df_reorganized_CPI["Date"])

In [10]:
# Inspect the dtypes again to confirm the Date column conversion took effect
df_reorganized_CPI.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1332 entries, 0 to 1331
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       1332 non-null   datetime64[ns]
 1   CPI Value  1332 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 20.9 KB


In [11]:
# Rename the columns to "ds" (date) and "y" (value) for use with Prophet
prophet_column_names = {"Date": "ds", "CPI Value": "y"}
df_reorganized_CPI = df_reorganized_CPI.rename(columns=prophet_column_names)

# Prophet

## Build the Prophet Model with the Training Data

In [12]:
# Create a Prophet forecasting model with its default settings
model = Prophet()

In [13]:
# Fit the model on the full reorganized CPI table (columns "ds" and "y")
model.fit(df_reorganized_CPI)

18:52:35 - cmdstanpy - INFO - Chain [1] start processing
18:52:35 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x271abfbe560>

In [14]:
# Take the "ds" (date) column as the model input for the in-sample r2 check.
# NOTE(review): this Series is replaced by a DataFrame version in the next cell.
X = df_reorganized_CPI["ds"]

In [15]:
# Wrap the "ds" column in a one-column DataFrame to pass to model.predict
X = df_reorganized_CPI["ds"].to_frame()

In [16]:
# Observed CPI values ("y" column), used as ground truth when scoring the fit
y = df_reorganized_CPI["y"]

## Assess the Prophet Model

In [17]:
# Predict on the historical dates in X (the same dates the model was fitted on)
train_X_forecast = model.predict(X)

In [18]:
# Pull the point predictions ("yhat") for the historical dates out of the forecast
predicted_train_y_values = train_X_forecast["yhat"]

In [19]:
# In-sample fit metrics for the Prophet model: the predictions scored here were
# made for the same rows the model was fitted on, so these numbers describe how
# well the model fits the history, not how well it forecasts unseen data.
mse = mean_squared_error(y, predicted_train_y_values)
rmse = np.sqrt(mse)
r2_LR = r2_score(y, predicted_train_y_values)
# Standard deviation of the observed CPI values, for scale next to the RMSE
std = np.std(y)

# Report the metrics, one per line
print(
    f"The r2 is {r2_LR}.",
    f"The mean squared error is {mse}.",
    f"The root mean squared error is {rmse}.",
    f"The standard deviation is {std}.",
    sep="\n",
)

The r2 is 0.9983177155533306.
The mean squared error is 12.238741677107415.
The root mean squared error is 3.4983912984552505.
The standard deviation is 85.29403680892736.


## Predict the CPI for the Next 80 Years

In [20]:
# Build the forecast horizon: 80 years of monthly (month-end) dates that
# continue after the last date in the fitted history; the history itself is excluded
forecast_months = 80 * 12
X_future = model.make_future_dataframe(periods=forecast_months, freq="M", include_history=False)
X_future


Unnamed: 0,ds
0,2024-01-31
1,2024-02-29
2,2024-03-31
3,2024-04-30
4,2024-05-31
...,...
955,2103-08-31
956,2103-09-30
957,2103-10-31
958,2103-11-30


In [21]:
# Run the fitted model over the future dates to get the 80-year forecast
predicted_future_forecast = model.predict(X_future)

In [22]:
# Display the key forecast columns for the 80-year horizon:
#   ds         - the date
#   yhat       - the predicted y value
#   yhat_lower - the lower bound of the predicted y value
#   yhat_upper - the upper bound of the predicted y value
forecast_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
predicted_future_forecast[forecast_columns]

Unnamed: 0,ds,yhat,yhat_lower,yhat_upper
0,2024-01-31,284.792081,280.175077,289.001618
1,2024-02-29,285.580900,281.148270,289.806483
2,2024-03-31,286.133906,281.945641,290.751574
3,2024-04-30,286.634446,282.317489,290.822314
4,2024-05-31,287.137308,282.700246,291.721537
...,...,...,...,...
955,2103-08-31,675.301974,494.058753,828.698069
956,2103-09-30,675.778127,495.104947,828.595013
957,2103-10-31,676.281283,494.202133,830.210497
958,2103-11-30,676.860228,496.900655,829.030579


In [23]:
# Pull the point predictions ("yhat") for the future dates out of the forecast
predicted_y = predicted_future_forecast["yhat"]

## Create DataFrames for Export and Plotting

In [24]:
# Observed history (1913 to 2023): just the date and CPI value columns
history_columns = ["ds", "y"]
df_to_2023 = df_reorganized_CPI[history_columns]


In [25]:
# Observed history (1913 to 2023) plus the model's in-sample prediction per date.
# assign() returns a new frame, so df_to_2023 itself is left untouched.
df_to_2023_predict = df_to_2023.assign(**{"Predicted CPI": predicted_train_y_values})

In [26]:
# Forecast table (2024 to 2103): future dates paired with the predicted CPI,
# stored under "y" so it shares column names with the historical table
df_to_2103 = X_future[["ds"]].assign(y=predicted_y)



In [36]:
# Work on a copy so adding helper columns does not modify df_to_2103
df_to_2103_copy = df_to_2103.copy()

In [43]:
# Derive the calendar year of each forecast date so rows can be grouped per year
df_to_2103_copy['year'] = df_to_2103_copy["ds"].dt.year
df_to_2103_copy.head()

Unnamed: 0,ds,y,year
0,2024-01-31,284.792081,2024
1,2024-02-29,285.5809,2024
2,2024-03-31,286.133906,2024
3,2024-04-30,286.634446,2024
4,2024-05-31,287.137308,2024


In [44]:
# Average the monthly forecast values within each calendar year
df_to_2103_annual = df_to_2103_copy.groupby('year', as_index=False)['y'].mean()
df_to_2103_annual.head()


Unnamed: 0,year,y
0,2024,287.645327
1,2025,292.602764
2,2026,297.49999
3,2027,302.397131
4,2028,307.239115


In [46]:
# Save the annual forecast averages to CSV, without the index column
df_to_2103_annual.to_csv("./Resource/df_to_2103_annual.csv", index=False)

In [28]:
# Array of the forecast years, 2024 through 2103 inclusive
future_years = np.arange(2024, 2104)


In [29]:
# Stack the observed history (1913-2023) on top of the forecast (2024-2103).
# ignore_index=True gives the combined frame one continuous 0..n-1 index;
# without it each input keeps its own row labels, so the labels shared by
# both frames (0..959) would appear twice.
df_all_year = pd.concat([df_to_2023, df_to_2103], ignore_index=True)


## Visualize the Data

In [30]:
# Scatter plot of the observed CPI values against their dates
scatter_plot_for_year_vs_CPI = df_to_2023.hvplot.scatter(
    x="ds", y="y",
    xlabel="Date", ylabel="CPI Value",
    title="CPI Values Trend",
    color="blue",
)
scatter_plot_for_year_vs_CPI

In [31]:
# Line plot of the model's in-sample predictions over the historical dates
line_plot_model = df_to_2023_predict.hvplot.line(
    x="ds", y="Predicted CPI",
    xlabel="Date",
    color="red",
)
line_plot_model

In [32]:
# Overlay the model's fitted line on the observed data points to compare them
scatter_plot_for_year_vs_CPI * line_plot_model

In [33]:
# Line plot of the forecast CPI values for the 80 future years
line_plot_future = df_to_2103.hvplot.line(
    x="ds", y="y",
    xlabel="Date", ylabel="CPI Value",
    color="green",
)
line_plot_future

In [34]:
# Overlay all three plots: observed data, in-sample fit, and the future forecast
scatter_plot_for_year_vs_CPI * line_plot_model * line_plot_future