In [45]:
# Install the prophet package if this is the first time you are using it
# %pip install prophet

In [46]:
# Import required libraries
import datetime
import os
from pathlib import Path

import hvplot.pandas
import numpy as np
import pandas as pd
import sqlalchemy
from prophet import Prophet
from sklearn.metrics import mean_squared_error, r2_score
from sqlalchemy import URL, create_engine

## Load the Data

In [47]:
# Define url object
# Connection settings come from the standard PostgreSQL environment variables
# (PGUSER, PGPASSWORD, PGHOST, PGDATABASE) so credentials do not have to live
# in the notebook. The previous literal values remain as fallbacks, so a
# default local setup keeps working without any environment configuration.
url_object = URL.create(
    "postgresql+psycopg2",
    username=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "postgres"),  # plain (unescaped) text
    host=os.environ.get("PGHOST", "localhost"),
    database=os.environ.get("PGDATABASE", "project4"),
)

In [48]:
# Create a SQLAlchemy engine
engine = create_engine(url_object)
# Open a short-lived connection only to confirm the database is reachable.
# The context manager closes it again; later queries go through `engine`
# directly, so no long-lived connection needs to stay open.
with engine.connect() as conn:
    pass
print(type(engine))

<class 'sqlalchemy.engine.base.Engine'>


In [49]:
# Load every row of the inflation table into a DataFrame
cpi_query = 'SELECT * FROM inflation_table'
df_CPI = pd.read_sql(cpi_query, con=engine)

# Preview the first rows
df_CPI.head()

Unnamed: 0,index,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual,HALF1,HALF2
0,0,1913,9.8,9.8,9.8,9.8,9.7,9.8,9.9,9.9,10.0,10.0,10.1,10.0,9.9,,
1,1,1914,10.0,9.9,9.9,9.8,9.9,9.9,10.0,10.2,10.2,10.1,10.2,10.1,10.0,,
2,2,1915,10.1,10.0,9.9,10.0,10.1,10.1,10.1,10.1,10.1,10.2,10.3,10.3,10.1,,
3,3,1916,10.4,10.4,10.5,10.6,10.7,10.8,10.8,10.9,11.1,11.3,11.5,11.6,10.9,,
4,4,1917,11.7,12.0,12.0,12.6,12.8,13.0,12.8,13.0,13.3,13.5,13.5,13.7,12.8,,


## Prepare the Data

In [50]:
# Build one month-end date per (year, month) pair, in row order, so the list
# lines up with the monthly CPI columns once they are flattened.
# Month-end dates are used because Prophet's future dataframe (freq="M") is
# also made of month-end dates, keeping history and forecast on the same grid.
# The last day of each month is derived from the calendar (MonthEnd offset)
# rather than a hand-written leap-year rule, so century years such as 1900 or
# 2100 (not leap years) are handled correctly.
years = df_CPI['Year']
date = [
    (pd.Timestamp(year=int(year), month=month, day=1) + pd.offsets.MonthEnd(0)).date()
    for year in years
    for month in range(1, 13)
]

In [51]:
# Flatten the remaining columns (the monthly values in the displayed frame)
# row by row into a single (n, 1) column vector
non_monthly_columns = ["index", "Year", "Annual", "HALF1", "HALF2"]
monthly_cpi = df_CPI.drop(columns=non_monthly_columns)
cpi_value = monthly_cpi.to_numpy().reshape(-1, 1)

In [52]:
# Assemble the long-format frame used by the model: one row per month,
# holding the month-end date and the CPI value for that month
df_reorganized_CPI = pd.DataFrame({"Date": date})
df_reorganized_CPI["CPI Value"] = cpi_value
df_reorganized_CPI

Unnamed: 0,Date,CPI Value
0,1913-01-31,9.800
1,1913-02-28,9.800
2,1913-03-31,9.800
3,1913-04-30,9.800
4,1913-05-31,9.700
...,...,...
1327,2023-08-31,307.026
1328,2023-09-30,307.789
1329,2023-10-31,307.671
1330,2023-11-30,307.051


In [53]:
# Inspect column dtypes and non-null counts before converting the Date column
df_reorganized_CPI.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1332 entries, 0 to 1331
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       1332 non-null   object 
 1   CPI Value  1332 non-null   float64
dtypes: float64(1), object(1)
memory usage: 20.9+ KB


In [54]:
# Convert the Date column from Python date objects to pandas datetimes
date_index = pd.DatetimeIndex(df_reorganized_CPI["Date"])
df_reorganized_CPI["Date"] = date_index

In [55]:
# Re-check the dtypes to confirm the Date column was converted
df_reorganized_CPI.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1332 entries, 0 to 1331
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       1332 non-null   datetime64[ns]
 1   CPI Value  1332 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 20.9 KB


In [56]:
# Prophet expects the date column to be called "ds" and the target "y"
prophet_column_names = {"Date": "ds", "CPI Value": "y"}
df_reorganized_CPI = df_reorganized_CPI.rename(columns=prophet_column_names)

# Prophet

## Build the Prophet Model with the Training Data

In [57]:
# Create a Prophet model with its default settings
model = Prophet()

In [58]:
# Fit the model on the full historical frame (columns "ds" and "y")
model.fit(df_reorganized_CPI)

19:34:20 - cmdstanpy - INFO - Chain [1] start processing
19:34:21 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x1968a9dedd0>

In [59]:
# Take the date column as X for computing the r2 value.
# NOTE(review): the next cell rebinds X as a one-column DataFrame, so this
# Series binding is immediately superseded.
X = df_reorganized_CPI["ds"]

In [60]:
# The model's predict call is given a DataFrame with a "ds" column,
# so wrap the date column in a one-column frame
X = df_reorganized_CPI["ds"].to_frame()

In [61]:
# Observed CPI values, used as the ground truth when scoring the model
y = df_reorganized_CPI["y"]

## Assess the Prophet Model

In [62]:
# Predict on the historical dates in X (the same dates the model was fitted on)
train_X_forecast = model.predict(X)

In [63]:
# Keep only the point forecast ("yhat") for the historical dates
predicted_train_y_values = train_X_forecast["yhat"]

In [64]:
# Score the Prophet model against the observed CPI values.
# The predictions were made on the same dates the model was fitted on, so
# these are in-sample (training) metrics, not a hold-out evaluation.
mse = mean_squared_error(y, predicted_train_y_values)
r2_LR = r2_score(y, predicted_train_y_values)
std = np.std(y)  # spread of the observed CPI, for scale against the RMSE
rmse = np.sqrt(mse)

# Print relevant metrics.
print(
    f"The r2 is {r2_LR}.",
    f"The mean squared error is {mse}.",
    f"The root mean squared error is {rmse}.",
    f"The standard deviation is {std}.",
    sep="\n",
)

The r2 is 0.9983182452592712.
The mean squared error is 12.234888027871618.
The root mean squared error is 3.497840480621096.
The standard deviation is 85.29403680892736.


## Predict the CPI for the Next 80 Years

In [65]:
# Build the forecast horizon: 960 monthly (month-end) dates following the
# history, i.e. 12 months x 80 years, without the historical dates themselves
FORECAST_YEARS = 80
MONTHS_PER_YEAR = 12
X_future = model.make_future_dataframe(
    periods=FORECAST_YEARS * MONTHS_PER_YEAR,
    freq="M",
    include_history=False,
)
X_future


Unnamed: 0,ds
0,2024-01-31
1,2024-02-29
2,2024-03-31
3,2024-04-30
4,2024-05-31
...,...
955,2103-08-31
956,2103-09-30
957,2103-10-31
958,2103-11-30


In [66]:
# Run the fitted model over the future dates to forecast the next 80 years of CPI
predicted_future_forecast = model.predict(X_future)

In [67]:
# Display the key forecast columns for the next 80 years:
#   ds         - the date
#   yhat       - the predicted y value
#   yhat_lower - the lower bound of the predicted y value
#   yhat_upper - the upper bound of the predicted y value
forecast_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
predicted_future_forecast[forecast_columns]

Unnamed: 0,ds,yhat,yhat_lower,yhat_upper
0,2024-01-31,284.799084,280.416006,289.329985
1,2024-02-29,285.586674,280.965568,290.128759
2,2024-03-31,286.146847,281.360422,290.982330
3,2024-04-30,286.640645,282.260618,291.275556
4,2024-05-31,287.142381,282.520985,291.886181
...,...,...,...,...
955,2103-08-31,675.368510,509.407993,838.007634
956,2103-09-30,675.847434,508.137058,841.811536
957,2103-10-31,676.329503,507.062328,840.725777
958,2103-11-30,676.935295,507.990685,843.508202


In [68]:
# Keep only the point forecast ("yhat") for the future dates
predicted_y = predicted_future_forecast["yhat"]

## Create DataFrames for Later Use

In [69]:
# Historical frame (1913 to 2023): date and observed CPI only
df_to_2023 = df_reorganized_CPI.loc[:, ["ds", "y"]]


In [70]:
# Historical frame (1913 to 2023) with the model's in-sample prediction added
# as an extra column; df_to_2023 itself is left unchanged
df_to_2023_predict = df_to_2023.assign(**{"Predicted CPI": predicted_train_y_values})

In [71]:
# Forecast frame (2024 to 2103): future dates paired with the predicted CPI
df_to_2103 = X_future[["ds"]].assign(y=predicted_y)



In [72]:
# Work on a copy so the helper column added next does not end up in df_to_2103
df_to_2103_copy = df_to_2103.copy()

In [73]:
# Extract the calendar year of each forecast date, used for the annual grouping
df_to_2103_copy['year'] = df_to_2103_copy["ds"].dt.year
df_to_2103_copy.head()

Unnamed: 0,ds,y,year
0,2024-01-31,284.799084,2024
1,2024-02-29,285.586674,2024
2,2024-03-31,286.146847,2024
3,2024-04-30,286.640645,2024
4,2024-05-31,287.142381,2024


In [74]:
# Average the monthly forecasts within each year to get one CPI value per year
df_to_2103_annual = df_to_2103_copy.groupby('year', as_index=False)['y'].mean()
df_to_2103_annual.head()


Unnamed: 0,year,y
0,2024,287.652767
1,2025,292.611108
2,2026,297.509082
3,2027,302.406971
4,2028,307.249524


In [75]:
# Save the annual forecast as CSV. The output folder is created first so the
# write does not fail on a fresh checkout where ./Resource does not exist yet.
annual_csv_path = Path("./Resource/df_to_2103_annual.csv")
annual_csv_path.parent.mkdir(parents=True, exist_ok=True)
df_to_2103_annual.to_csv(annual_csv_path, index=False)

In [76]:
# Array of the forecast years, 2024 through 2103 inclusive
future_years = np.arange(2024, 2104)


In [77]:
# Stack the observed history (1913 to 2023) on top of the forecast
# (2024 to 2103) into one frame of dates and CPI values; the "y" column holds
# observed values for the history rows and predicted values for the future rows.
# Both inputs carry their own 0-based index, so ignore_index=True is used to
# give the combined frame a single, non-repeating 0..n-1 index.
df_all_year = pd.concat([df_to_2023, df_to_2103], ignore_index=True)


## Visualize the Data

In [78]:
# Scatter plot of the observed CPI values against their dates
observed_scatter_options = {
    "x": "ds",
    "xlabel": "Date",
    "y": "y",
    "ylabel": "CPI Value",
    "title": "CPI Values Trend",
    "color": "blue",
}
scatter_plot_for_year_vs_CPI = df_to_2023.hvplot.scatter(**observed_scatter_options)
scatter_plot_for_year_vs_CPI

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [79]:
# Line plot of the model's in-sample predictions over the historical dates
model_line_options = {
    "x": "ds",
    "xlabel": "Date",
    "y": "Predicted CPI",
    "color": "red",
}
line_plot_model = df_to_2023_predict.hvplot.line(**model_line_options)
line_plot_model

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [80]:
# Overlay the model's fitted line on the observed data to compare them visually
scatter_plot_for_year_vs_CPI * line_plot_model

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [81]:
# Line plot of the forecast CPI for the next 80 years
future_line_options = {
    "x": "ds",
    "xlabel": "Date",
    "y": "y",
    "ylabel": "CPI Value",
    "color": "green",
}
line_plot_future = df_to_2103.hvplot.line(**future_line_options)
line_plot_future

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [82]:
# Overlay all three plots: observed data, the model's fit, and the 80-year forecast
scatter_plot_for_year_vs_CPI * line_plot_model * line_plot_future

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type
