In [1]:
import numpy as np
import pandas as pd 

import seaborn as sbn 
import matplotlib.pyplot as plt

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

In [2]:
# Load the truck maintenance workbook into a DataFrame and preview the first rows.
df = pd.read_excel("MLR_Q08_TruckMaintenance.xlsx")
df.head()

Unnamed: 0,Truck,Maintenance Expense,Miles Driven,Age of Truck
0,1,908.56,10500,10
1,2,751.12,9700,7
2,3,793.55,9200,8
3,4,619.61,8300,9
4,5,380.11,6500,5


In [3]:
# Summary statistics for each numeric column of the loaded data.
df.describe()

Unnamed: 0,Truck,Maintenance Expense,Miles Driven,Age of Truck
count,23.0,23.0,23.0,23.0
mean,12.0,519.349565,6321.73913,5.391304
std,6.78233,285.902217,3476.800583,3.257706
min,1.0,142.53,1100.0,1.0
25%,6.5,233.39,2850.0,2.5
50%,12.0,436.04,6600.0,5.0
75%,17.5,801.68,9750.0,8.0
max,23.0,915.53,10500.0,11.0


## Without standardizing X variables

In [4]:
# Response variable, kept as a one-column DataFrame.
Y = df[["Maintenance Expense"]]
# Predictors on their original scales, with an intercept ("const") column added.
X = sm.add_constant(df[['Miles Driven', 'Age of Truck']])
# Ordinary least squares fit of Y on the intercept and both predictors.
reg_model = sm.OLS(endog=Y, exog=X).fit()
print(reg_model.summary())

                             OLS Regression Results                            
Dep. Variable:     Maintenance Expense   R-squared:                       0.951
Model:                             OLS   Adj. R-squared:                  0.946
Method:                  Least Squares   F-statistic:                     195.0
Date:                 Wed, 11 May 2022   Prob (F-statistic):           7.62e-14
Time:                         19:07:30   Log-Likelihood:                -127.47
No. Observations:                   23   AIC:                             260.9
Df Residuals:                       20   BIC:                             264.3
Df Model:                            2                                         
Covariance Type:             nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           11.4184     29.191    

## With standardizing (min-max scaling of the X variables)

In [6]:
# Same response and predictors as the unscaled model.
Y = df[["Maintenance Expense"]]
X = df[['Miles Driven', 'Age of Truck']]

# NOTE(review): MinMaxScaler is imported here but not used in this cell;
# the scaling below is done directly with pandas.
from sklearn.preprocessing import MinMaxScaler

# Min-max rescale each predictor column, (x - min) / (max - min),
# then add the intercept ("const") column.
X_std = sm.add_constant(
    X.sub(X.min()).div(X.max() - X.min())
)
# Ordinary least squares fit of Y on the rescaled predictors.
reg_model = sm.OLS(endog=Y, exog=X_std).fit()
print(reg_model.summary())

                             OLS Regression Results                            
Dep. Variable:     Maintenance Expense   R-squared:                       0.951
Model:                             OLS   Adj. R-squared:                  0.946
Method:                  Least Squares   F-statistic:                     195.0
Date:                 Wed, 11 May 2022   Prob (F-statistic):           7.62e-14
Time:                         19:08:21   Log-Likelihood:                -127.47
No. Observations:                   23   AIC:                             260.9
Df Residuals:                       20   BIC:                             264.3
Df Model:                            2                                         
Covariance Type:             nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const          100.4626     25.315    

### Conclusions:
  - The regression equation is<br>
     **Maintenance Expense = 11.4184 + 0.0712 * Miles Driven + 10.7086 * Age of Truck**
  - Coefficient magnitude depends on the scale of the variable. Miles Driven is typically in the thousands, while Age of Truck is mostly in single digits (max being 11), so the coefficient for Miles Driven is very small compared to the coefficient for Age of Truck. However, once we rescale the X variables to the [0, 1] range (min-max scaling), the coefficients are on a comparable scale (Miles Driven = 669.4140 and Age of Truck = 107.0859), because each one now measures the change in Maintenance Expense across that variable's full observed range.
  - R-squared for this model is 95.1%, which means that the model explains 95.1% of the variation in the Y variable (Maintenance Expense) using the X variables, on the training data.