In [2]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## Load the Data

In [3]:
# Locate the CPI workbook relative to the notebook's working directory
file_path = Path("./Resource/CPI_report_origin.xlsx")

# header=11 tells pandas to take the column names from zero-based row 11 of the sheet
df_CPI = pd.read_excel(io=file_path, header=11)

# Show the loaded table as the cell output
df_CPI

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual,HALF1,HALF2
0,1913,9.800,9.800,9.800,9.800,9.700,9.800,9.900,9.900,10.000,10.000,10.100,10.000,9.900,,
1,1914,10.000,9.900,9.900,9.800,9.900,9.900,10.000,10.200,10.200,10.100,10.200,10.100,10.000,,
2,1915,10.100,10.000,9.900,10.000,10.100,10.100,10.100,10.100,10.100,10.200,10.300,10.300,10.100,,
3,1916,10.400,10.400,10.500,10.600,10.700,10.800,10.800,10.900,11.100,11.300,11.500,11.600,10.900,,
4,1917,11.700,12.000,12.000,12.600,12.800,13.000,12.800,13.000,13.300,13.500,13.500,13.700,12.800,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,2019,251.712,252.776,254.202,255.548,256.092,256.143,256.571,256.558,256.759,257.346,257.208,256.974,255.657,254.412,256.903
107,2020,257.971,258.678,258.115,256.389,256.394,257.797,259.101,259.918,260.280,260.388,260.229,260.474,258.811,257.557,260.065
108,2021,261.582,263.014,264.877,267.054,269.195,271.696,273.003,273.567,274.310,276.589,277.948,278.802,270.970,266.236,275.703
109,2022,281.148,283.716,287.504,289.109,292.296,296.311,296.276,296.171,296.808,298.012,297.711,296.797,292.655,288.347,296.963


## Prepare the Data

In [4]:
# Feature matrix: the "Year" column as an (n_rows, 1) array.
# Selecting with a list of columns keeps the data two-dimensional.
X = np.array(df_CPI[["Year"]])

In [5]:
# Target: the "Annual" column as an (n_rows, 1) array.
# The 2-D shape is relied on later, where coef_ and intercept_ are indexed as arrays.
y = np.array(df_CPI[["Annual"]])

# Linear Regression

## Build the Linear Regression Model with the Training Data

In [6]:
# Instantiate an ordinary least-squares regressor from scikit-learn
# (fit_intercept=True is the library default, spelled out here for the reader)
model = LinearRegression(fit_intercept=True)

In [7]:
# Train the regressor on the year feature (X) and the annual CPI target (y)
model.fit(X=X, y=y)

In [8]:
# Report the fitted slope; coef_ is 2-D here because y was passed as a column array
print(f"Model's slope: {model.coef_[0, 0]}")

Model's slope: 2.4356782643032653


In [9]:
# Report the fitted y-intercept (first and only entry of intercept_)
y_intercept = model.intercept_[0]
print(f"Model's y-intercept: {y_intercept}")

Model's y-intercept: -4706.154373698376


In [10]:
# Write out the fitted line as y = intercept + slope * X
intercept = model.intercept_[0]
slope = model.coef_[0][0]
print(f"Model's formula: y = {intercept} + {slope}X")

Model's formula: y = -4706.154373698376 + 2.4356782643032653X


## Assess the Linear Regression Model

In [11]:
# Evaluate the fitted line at every observed year (in-sample predictions)
predicted_y_values = model.predict(X=X)

In [12]:
# Evaluate the fit on the same data the model was trained on.
# `score` and `r2_LR` are computed two different ways; the recorded output shows
# that they report the same value.
r2_LR = r2_score(y_true=y, y_pred=predicted_y_values)
score = model.score(X, y)
mse = mean_squared_error(y_true=y, y_pred=predicted_y_values)
rmse = np.sqrt(mse)
# Spread of the observed target values, shown next to the RMSE for comparison
std = y.std()

# Report the metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2_LR}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.837371329913151.
The r2 is 0.837371329913151.
The mean squared error is 1182.9008015048475.
The root mean squared error is 34.3933249556487.
The standard deviation is 85.28558009161256.


## Predict the CPI for the Next 80 Years

In [18]:
# Future feature matrix: the 80 years 2024..2103 as an (80, 1) column array.
# NOTE(review): the table displayed after loading ends at 2022, so 2023 is neither
# observed nor forecast — confirm that starting at 2024 is intended.
X_future = np.arange(2024, 2104)[:, np.newaxis]


In [19]:
# Extrapolate the fitted line over the future years held in X_future
predicted_y = model.predict(X=X_future)

## Create DataFrames for Visualization

In [38]:
# Keep only the "Year" and "Annual" columns of the observed data.
# NOTE(review): the name says 2023, but the table displayed after loading ends at
# 2022 — confirm which end year is correct.
observed_columns = ["Year", "Annual"]
df_to_2023 = df_CPI[observed_columns]


In [42]:
# Observed data plus the model's in-sample prediction for each year.
# assign() returns a new frame, so df_to_2023 itself is left untouched.
df_to_2023_predict = df_to_2023.assign(**{"Predicted CPI": predicted_y_values})

In [39]:
# Forecast table for 2024-2103: one row per future year with its predicted CPI.
# Both inputs are column arrays, so they are flattened to 1-D before building the frame.
df_to_2103 = pd.DataFrame(
    {
        "Year": X_future.ravel(),
        "Annual": predicted_y.ravel(),
    }
)



In [40]:
# Stack the observed years and the forecast years (through 2103) into one table.
# ignore_index=True renumbers the rows 0..n-1. Without it each input keeps its own
# 0-based labels, so the combined frame would contain duplicate index labels and a
# label lookup such as df_all_year.loc[0] would return two rows.
df_all_year = pd.concat([df_to_2023, df_to_2103], ignore_index=True)


## Visualize the Data

In [54]:
# Scatter of the observed data: "Year" on the x-axis, "Annual" CPI on the y-axis, in blue
scatter_plot_for_year_vs_CPI = df_to_2023.hvplot.scatter(
    "Year",
    "Annual",
    title="CPI Values Trend",
    ylabel="CPI Value",
    color="blue",
)
scatter_plot_for_year_vs_CPI

In [44]:
# Red line tracing the model's in-sample predictions over the observed years
line_plot_model = df_to_2023_predict.hvplot.line(
    "Year",
    "Predicted CPI",
    color="red",
)
line_plot_model

In [57]:
# Compare the model's predictions with the original data:
# the * operator combines the scatter plot and the fitted line into one plot
scatter_plot_for_year_vs_CPI * line_plot_model

In [55]:
# Green line showing the predicted CPI for the future years held in df_to_2103
line_plot_future = df_to_2103.hvplot.line(
    "Year",
    "Annual",
    color="green",
)
line_plot_future

In [56]:
# Combine the plots: observed data (scatter), the model's fitted line over the
# observed years, and the forecast line for the future years
scatter_plot_for_year_vs_CPI * line_plot_model * line_plot_future