# Linear Regression Version

In [90]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## Load the Data

In [91]:
# Read the CPI data
file_path = Path("./Resource/CPI_report_year_month_only.csv")
df_CPI = pd.read_csv(file_path)

# Display sample data
df_CPI

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,1913,9.800,9.800,9.800,9.800,9.700,9.800,9.900,9.900,10.000,10.000,10.100,10.000
1,1914,10.000,9.900,9.900,9.800,9.900,9.900,10.000,10.200,10.200,10.100,10.200,10.100
2,1915,10.100,10.000,9.900,10.000,10.100,10.100,10.100,10.100,10.100,10.200,10.300,10.300
3,1916,10.400,10.400,10.500,10.600,10.700,10.800,10.800,10.900,11.100,11.300,11.500,11.600
4,1917,11.700,12.000,12.000,12.600,12.800,13.000,12.800,13.000,13.300,13.500,13.500,13.700
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,2019,251.712,252.776,254.202,255.548,256.092,256.143,256.571,256.558,256.759,257.346,257.208,256.974
107,2020,257.971,258.678,258.115,256.389,256.394,257.797,259.101,259.918,260.280,260.388,260.229,260.474
108,2021,261.582,263.014,264.877,267.054,269.195,271.696,273.003,273.567,274.310,276.589,277.948,278.802
109,2022,281.148,283.716,287.504,289.109,292.296,296.311,296.276,296.171,296.808,298.012,297.711,296.797


## Prepare the Data to Fit the Linear Regression Model

In [92]:
# Get the year and month as the feature data
date = []
for i in df_CPI["Year"]:
    for j in range(1,13):
        date.append(f"{i}_{j}")

# It is a bit weird here, since I look at the practice examples and search online that the X value has to be numerical, 
# and I did not find a good way to represent year and month as numerical.
# So I use 1 to 1332 to represent it for now,
# where 1 stand for 1913 Jan, 1332 stand for 2023 Dec
X = np.array(range(1,1333)).reshape(-1, 1)

In [93]:
# Get the CPI value as target value
y = np.array([df_CPI.drop(columns="Year")]).reshape(-1,1)

In [94]:
# Check the data type
print(f'X date type: {X.dtype}')
print(f'y date type: {y.dtype}')

X date type: int32
y date type: float64


## Build the Linear Regression Model with Train data

In [95]:
# Create a model with scikit-learn
model = LinearRegression()

In [96]:
# Fit the data into the model
model.fit(X, y)

In [97]:
# Display the slope
print(f"Model's slope: {model.coef_[0][0]}")

Model's slope: 0.20297179818133607


In [98]:
# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_[0]}")

Model's y-intercept: -48.023038322695314


In [99]:
# Display the model's best fit line formula
print(f"Model's formula: y = {model.intercept_[0]} + {model.coef_[0][0]}X")

Model's formula: y = -48.023038322695314 + 0.20297179818133607X


## Assess the Linear Regression Model

In [100]:
# Make predictions using the X set
predicted_y_values = model.predict(X)

In [101]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y, sample_weight=None)
r2_LR = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2_LR}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.8372613023481741.
The r2 is 0.8372613023481741.
The mean squared error is 1183.935858987904.
The root mean squared error is 34.408369025397064.
The standard deviation is 85.29403680892737.


## Build the Ridge Regression Model with Train data

In [102]:
from sklearn.linear_model import Ridge
# Create a model with scikit-learn
model = Ridge()

In [103]:
# Fit the data into the model
model.fit(X, y)

In [104]:
# Display the slope
print(f"Model's slope: {model.coef_[0][0]}")

Model's slope: 0.20297179715070202


In [105]:
# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_[0]}")

Model's y-intercept: -48.023037635777726


In [106]:
# Display the model's best fit line formula
print(f"Model's formula: y = {model.intercept_[0]} + {model.coef_[0][0]}X")

Model's formula: y = -48.023037635777726 + 0.20297179715070202X


## Assess the Ridge Regression Model

In [107]:
# Make predictions using the X set
predicted_y_values = model.predict(X)

In [108]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y, sample_weight=None)
r2_RR = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2_RR}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.8372613023481741.
The r2 is 0.8372613023481741.
The mean squared error is 1183.9358589879046.
The root mean squared error is 34.40836902539707.
The standard deviation is 85.29403680892737.


## Build the Lasso Regression Model with Train data

In [109]:
from sklearn.linear_model import Lasso
# Create a model with scikit-learn
model = Lasso()

In [110]:
# Fit the data into the model
model.fit(X, y)

## Assess the Lasso Regression Model

In [None]:
# Make predictions using the X set
predicted_y_values = model.predict(X)

In [None]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y, sample_weight=None)
r2_LaR = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2_LaR}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.8372613014184893.
The r2 is 0.8372613014184893.
The mean squared error is 1183.9358657514283.
The root mean squared error is 34.40836912368019.
The standard deviation is 85.29403680892737.


## Build the RandomForestRegressor Model with Train data

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Create a model with scikit-learn
model = RandomForestRegressor(n_estimators=100, random_state=78)

In [None]:
# Fit the data into the model
model.fit(X, y)

  model.fit(X, y)


## Assess the RandomForestRegressor Model

In [None]:
# Make predictions using the X set
predicted_y_values = model.predict(X)

In [None]:
print(predicted_y_values)

[  9.8       9.8       9.8     ... 307.60993 307.15727 306.92427]


In [None]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y, sample_weight=None)
r2_LR = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2_LR}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.9999974385376819.
The r2 is 0.9999974385376819.
The mean squared error is 0.018634824621694142.
The root mean squared error is 0.13650943052292813.
The standard deviation is 85.29403680892737.


In [None]:
# The X for future 80 years
X_to_predict = np.array(range(1333, 2293)).reshape(-1,1)

In [None]:
# Make predictions using the X set
predicted_y_values = model.predict(X_to_predict)

In [None]:
print(predicted_y_values)

[306.92427 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427
 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427
 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427
 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427
 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427
 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427
 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427
 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427
 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427
 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427
 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427
 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427
 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427
 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427 306.92427
 306.9