# Business Understanding

Purpose: Model the relationship between the target variable, Test_Score, and the independent variable(s): Hours_Studied.



# Library Loading

Import the Libraries we need


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
# NOTE: scikit-learn estimators require numeric inputs; categorical variables stored as strings must be encoded as numbers first.
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, root_mean_squared_error

# Data Loading

Set up the initial data for our model

In [None]:
# Toy dataset: five observations pairing hours studied with the resulting test score
data = dict(
    Hours_Studied=[1, 2, 3, 4, 5],
    Test_Score=[50, 56, 60, 63, 68],
)
df = pd.DataFrame(data)
df.head()  # preview the first rows (all five here)


# Data Understanding: Exploratory Data Analysis

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column of df
df.describe()

# Data Preparation

Set up our variables

In [None]:
# Split the frame into the model's output and inputs.
# y: the target (also called the "dependent variable") -- exactly one column, selected as a 1-D Series
y = df.loc[:, 'Test_Score']
# X: the predictors (also called "independent variables") -- selected with a list so the result
# stays a 2-D DataFrame even though this example has only one predictor
X = df.loc[:, ['Hours_Studied']]


# Modeling

Initialize and fit the model

In [None]:
# Ordinary least squares: create the estimator and fit it to (X, y) in one step.
# fit() returns the estimator itself, so `model` is the fitted LinearRegression.
model = LinearRegression().fit(X, y)
model  # show the fitted estimator as the cell output


Calculate slope and intercept

In [None]:
# Read the fitted line's parameters off the model:
#   coef_ has one entry per predictor column, so index 0 is the slope for Hours_Studied;
#   intercept_ is the predicted score at zero hours studied.
slope, intercept = model.coef_[0], model.intercept_
print("Slope (beta_1):", slope)
print("Intercept (beta_0):", intercept)


# Evaluation

Test the model by predicting scores for the initial hours studied

In [None]:
# In-sample predictions: run the fitted model on the same X it was trained on,
# giving one predicted score per training row
predictions = model.predict(X)
print("Predicted Test Scores:", predictions)


Plot the model

In [None]:
# Scatter the observed scores and overlay the fitted regression line
fig, ax = plt.subplots()
ax.scatter(X, y, color='blue', label='Actual Data')
ax.plot(X, predictions, color='red', label='OLS Regression Line')
ax.set_xlabel("Hours Studied")
ax.set_ylabel("Test Score")
ax.legend()
plt.show()


Now let's give the model a new data point and see where it fits

In [None]:
# New data point: wrap it in a DataFrame with the same column name used for training.
# The model was fitted on a DataFrame, so scikit-learn recorded the feature name
# "Hours_Studied"; predicting from a bare nested list ([[6]]) triggers a
# "X does not have valid feature names" UserWarning. A one-row, one-column frame is
# still the 2-D input sklearn expects.
new_hours_studied = pd.DataFrame({"Hours_Studied": [6]})

# Predict the test score for the new data point.
# predict() returns an array with one value per input row, hence the [0].
predicted_score = model.predict(new_hours_studied)
print("Predicted Test Score for 6 hours studied:", predicted_score[0])


Let's visualize where the new data point is predicted to fall

In [None]:
# Same chart as the fitted-line plot, plus a green "x" marking the prediction for the new data point
fig, ax = plt.subplots()
ax.scatter(X, y, color='blue', label='Actual Data')
ax.plot(X, predictions, color='red', label='OLS Regression Line')
ax.scatter(new_hours_studied, predicted_score, color='green', marker='x', s=100, label='New Prediction')
ax.set_xlabel("Hours Studied")
ax.set_ylabel("Test Score")
ax.legend()
plt.show()


Calculate the residuals for the training data (r1, r2, ..., rn), i.e. actual score minus predicted score

In [None]:
# Calculate residuals: actual score minus predicted score, one per training row
# (a positive residual means the model under-predicted that observation)
residuals = y - predictions
print("Residuals:", residuals)


Summarize the residuals (mean and standard deviation)

In [None]:
# Summary statistics for residuals
# NOTE: for a least-squares fit with an intercept the mean residual should be ~0,
#       up to floating-point error
# NOTE: pandas' Series.std() uses the sample formula (ddof=1) by default
mean_residual = residuals.mean()
std_residual = residuals.std()

print("Mean Residual:", mean_residual)
print("Standard Deviation of Residuals:", std_residual)


In [None]:
# Calculate R-squared (coefficient of determination) on the training data:
# the share of the variance in y that the fitted line explains
r2 = r2_score(y, predictions)
print("R-squared:", r2)


In [None]:
# Understandable R-squared: express r2 as a percentage (multiply by 100), rounded to 2 decimal places
r2_percentage = round(r2 * 100, 2)
print("R-squared percentage: ", r2_percentage)


In [None]:
# Calculate MAE (mean absolute error) for the model, printed to 4 decimal places:
# the average absolute gap between actual and predicted scores, in test-score units
mae = mean_absolute_error(y, predictions)
print(f"Mean Absolute Error (MAE): {mae:.4f}")


In [None]:
# Calculate MSE (mean squared error) for the model, printed to 4 decimal places:
# the average squared gap between actual and predicted scores, in squared test-score units
mse = mean_squared_error(y, predictions)
print(f"Mean Squared Error (MSE): {mse:.4f}")


In [None]:
# Calculate MAPE (mean absolute percentage error) for the model, printed to 4 decimal places
# NOTE: scikit-learn returns MAPE as a fraction (e.g. 0.01 means 1%), not scaled to 0-100,
#       so the printed value must be multiplied by 100 to read it as a percent
mape = mean_absolute_percentage_error(y, predictions)
print(f"Mean Absolute Percent Error (MAPE): {mape:.4f}")


In [None]:
# Calculate RMSE (root mean squared error) for the model, printed to 4 decimal places:
# the square root of the MSE, which brings the error back to test-score units
rmse = root_mean_squared_error(y, predictions)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
