In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression

## Data Loading and Visualization

In [2]:
# Read inflation data
file_path = Path("C:/Users/vdumlao/Documents/GitHub/Project_4/Resources/SeriesReport-20240204220554_5aaed8.csv")
df_inflation = pd.read_csv(file_path)
# df_inflation = df_inflation.set_index("Year")

# Display sample data
df_inflation.head()

Unnamed: 0,Year,Annual
0,1913,9.9
1,1914,10.0
2,1915,10.1
3,1916,10.9
4,1917,12.8


In [3]:
# Create a scatter plot of years versus the inflation information
inflation_plot = df_inflation.hvplot.scatter(
    x="Year",
    y="Annual",
    title="Inflation",
    color="blue"
)
inflation_plot

## Data Preparation

In [4]:
# Reformat data of the independent variable X as a single-column array
X = df_inflation["Year"].values.reshape(-1, 1)

# Display sample data
X[:5]

array([[1913],
       [1914],
       [1915],
       [1916],
       [1917]], dtype=int64)

In [5]:
# The shape of X with a single feature (column)
X.shape

(111, 1)

In [6]:
# Create an array for the dependent variable y
y = df_inflation["Annual"]
y.shape

(111,)

## Building the Linear Regression Model

In [7]:
# Create a model with scikit-learn
model = LinearRegression()

In [8]:
# Fit the data into the model
model.fit(X, y)

In [9]:
# Display the slope
print(f"Model's slope: {model.coef_}")

Model's slope: [2.43567826]


In [10]:
# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: -4706.154373698375


In [11]:
# Display the model's best fit line formula
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = -4706.154373698375 + 2.435678264303265X


## Plot the Best Fit Line for the Inflation Prediction Model

In [12]:
# Make predictions using the X set
predicted_y_values = model.predict(X)

In [13]:
# Create a copy of the original data
df_inflation_predicted = df_inflation.copy()

# Add a column with the predicted sales values
df_inflation_predicted["inflation_predicted"] = predicted_y_values

# Display sample data
df_inflation_predicted.head()

Unnamed: 0,Year,Annual,inflation_predicted
0,1913,9.9,-46.701854
1,1914,10.0,-44.266176
2,1915,10.1,-41.830498
3,1916,10.9,-39.394819
4,1917,12.8,-36.959141


In [14]:
# Create a line plot of ads versus the predicted sales values
best_fit_line = df_inflation_predicted.hvplot.line(
    x = "Year",
    y = "inflation_predicted",
    color = "red"
)
best_fit_line

In [15]:
# Superpose the original data and the best fit line
inflation_plot * best_fit_line

## Make Manual Predictions

In [16]:
# Display the formula to predict the inflaction
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}")

# Predict inflation
y_predict = model.intercept_ + model.coef_[0]

# Display the prediction
print(f"Predicted inflation: {y_predict:.2f}")

Model's formula: y = -4706.154373698375 + 2.435678264303265
Predicted inflation: -4703.72


## Make Predictions Using the `predict` Function

In [17]:
# Create an array to predict sales for 100, 150, 200, 250, and 300 ads
X_inflation = np.array([2024, 2025, 2026, 2027, 2028])

# Format the array as a one-column array
X_inflation = X_inflation.reshape(-1,1)

# Display sample data
X_inflation

array([[2024],
       [2025],
       [2026],
       [2027],
       [2028]])

In [18]:
# Predict sales for 100, 150, 200, 250, and 300 ads
predicted_inflation = model.predict(X_inflation)

In [19]:
# Create a DataFrame for the predicted sales
df_predicted_inflation = pd.DataFrame(
    {
        "ads": X_inflation.reshape(1, -1)[0],
        "predicted_inflation": predicted_inflation
    }
)

# Display data
df_inflation_predicted

Unnamed: 0,Year,Annual,inflation_predicted
0,1913,9.900,-46.701854
1,1914,10.000,-44.266176
2,1915,10.100,-41.830498
3,1916,10.900,-39.394819
4,1917,12.800,-36.959141
...,...,...,...
106,2019,255.657,211.480042
107,2020,258.811,213.915720
108,2021,270.970,216.351398
109,2022,292.655,218.787077


## Assess the Linear Regression Model

In [20]:
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [21]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.837371329913151.
The r2 is 0.837371329913151.
The mean squared error is 1182.9008015048475.
The root mean squared error is 34.3933249556487.
The standard deviation is 85.28558009161256.
