### Import packages

In [544]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### Input Dataset

In [546]:
# Read the training table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='QB2022_MLR.csv')
df.head(n=5)

Unnamed: 0,Year,Player,Pass Yds,Yds/Att,Att,Cmp,Cmp %,TD,INT,Rate
0,2022,Jared Goff,4438,7.6,587,382,65.1,29,7,99.3
1,2022,Josh Allen,4283,7.6,567,359,63.3,35,14,96.6
2,2022,Geno Smith,4282,7.5,572,399,69.8,30,11,100.9
3,2022,Trevor Lawrence,4113,7.0,584,387,66.3,25,8,95.2
4,2022,Jalen Hurts,3701,8.0,460,306,66.5,22,6,101.6


In [547]:
# Discard the identifier columns; they are not used as regression inputs.
# NOTE: this rebinds `df`, so running the cell a second time raises KeyError
# because the columns are already gone.
df = df.drop(['Year', 'Player'], axis='columns')
df

Unnamed: 0,Pass Yds,Yds/Att,Att,Cmp,Cmp %,TD,INT,Rate
0,4438,7.6,587,382,65.1,29,7,99.3
1,4283,7.6,567,359,63.3,35,14,96.6
2,4282,7.5,572,399,69.8,30,11,100.9
3,4113,7.0,584,387,66.3,25,8,95.2
4,3701,8.0,460,306,66.5,22,6,101.6
...,...,...,...,...,...,...,...,...
65,90,6.0,15,10,66.7,0,0,82.6
66,59,11.8,5,3,60.0,0,1,61.7
67,58,7.2,8,6,75.0,1,1,94.8
68,52,4.3,12,8,66.7,1,0,103.5


In [548]:
# Read the held-out test table from disk and preview its first five rows.
test_data = pd.read_csv(filepath_or_buffer='QB2022_MLR_test.csv')
test_data.head(n=5)

Unnamed: 0,Year,Player,Pass Yds,Yds/Att,Att,Cmp,Cmp %,TD,INT,Rate
0,2022,Patrick Mahomes,5250,8.1,648,435,67.1,41,12,105.2
1,2022,Justin Herbert,4739,6.8,699,477,68.2,25,10,93.2
2,2022,Tom Brady,4694,6.4,733,490,66.8,25,9,90.7
3,2022,Kirk Cousins,4547,7.1,643,424,65.9,29,14,92.5
4,2022,Joe Burrow,4475,7.4,606,414,68.3,35,12,100.8


In [549]:
# Discard the identifier columns from the test table as well.
# NOTE: this rebinds `test_data`, so running the cell a second time raises
# KeyError because the columns are already gone.
test_data = test_data.drop(['Year', 'Player'], axis='columns')
test_data

Unnamed: 0,Pass Yds,Yds/Att,Att,Cmp,Cmp %,TD,INT,Rate
0,5250,8.1,648,435,67.1,41,12,105.2
1,4739,6.8,699,477,68.2,25,10,93.2
2,4694,6.4,733,490,66.8,25,9,90.7
3,4547,7.1,643,424,65.9,29,14,92.5
4,4475,7.4,606,414,68.3,35,12,100.8


In [550]:
# Test design matrix: every column except the target 'Rate', prefixed with a
# column of ones so the first fitted coefficient acts as the intercept.
x_test = np.hstack([np.ones((len(test_data), 1)), test_data.drop(columns=['Rate'])])
x_test.shape

(5, 8)

In [551]:
# Ground-truth target values for the test rows.
y_test = test_data.loc[:, 'Rate']
y_test.shape

(5,)

In [552]:
# Training features, selected by position: the first seven columns of `df`.
# NOTE(review): presumably these are all columns ahead of 'Rate' — confirm the
# column order if the CSV layout ever changes.
A = df.iloc[:, 0:7]
A.shape

(70, 7)

In [553]:
# Training target vector for the least-squares problem.
b = df.loc[:, 'Rate']
b.shape

(70,)

In [554]:
# Training design matrix: a leading column of ones (intercept term) followed by
# the first seven columns of `df`.
A = np.hstack([np.ones((len(df), 1)), df.iloc[:, 0:7]])
A.shape

(70, 8)

In [555]:
# Ordinary least-squares coefficients for A @ x ≈ b (intercept first, then one
# coefficient per feature column of A).
# Solved with np.linalg.lstsq instead of inv(A.T @ A) @ A.T @ b: forming and
# explicitly inverting the Gram matrix squares the condition number of A and
# loses precision whenever columns of A are nearly collinear.
x = np.linalg.lstsq(A, np.asarray(b, dtype=float), rcond=None)[0]
x

array([-4.27770064e+01, -6.69029754e-03,  4.52455396e+00,  2.65961280e-01,
       -3.34880718e-01,  1.47996495e+00,  1.40284500e+00, -2.28659449e+00])

In [556]:
# Sanity check before multiplying: the column count of x_test must equal the
# length of the coefficient vector `x`.
x_test.shape

(5, 8)

In [557]:
# Predicted target for each test row: design matrix times fitted coefficients.
pred1 = np.matmul(x_test, x)
pred1

array([114.8007751 ,  95.59226257,  98.98738212,  94.15009489,
       106.03957719])

In [558]:
# Residuals (prediction minus truth) for the test rows, then their per-row
# squares as the cell's displayed result. Only the last expression in a cell
# is displayed, so no intermediate bare `error` line is needed.
error = pred1 - y_test
np.square(error)

0    92.174883
1     5.722920
2    68.680702
3     2.722813
4    27.453169
Name: Rate, dtype: float64

In [559]:
# Mean squared error: squared Euclidean norm of the residual vector divided by
# the number of test rows.
mse = (np.linalg.norm(error) ** 2) / y_test.shape[0]
mse

39.35089751297715

### Multiple Linear Regression Model

In [561]:
# Feature matrix = every column of `df` except the target; target = 'Rate'.
X = df.drop(columns=['Rate'])
y = df.loc[:, 'Rate']

In [562]:
# Fit scikit-learn's ordinary least-squares regressor on the training data and
# display the fitted estimator.
model = LinearRegression().fit(X, y)
model

In [563]:
# Fitted coefficient for each feature column of X, in column order.
model.coef_

array([-0.0066903 ,  4.52455396,  0.26596128, -0.33488072,  1.47996495,
        1.402845  , -2.28659449])

In [564]:
# Fitted intercept (constant term) of the linear model.
model.intercept_

-42.777006364868726

In [565]:
# Test features as a DataFrame: all columns except the target 'Rate'.
# NOTE: this rebinds `x_test`, which previously held the ones-augmented NumPy
# design matrix used for the manual least-squares prediction.
x_test = test_data.drop('Rate', axis='columns')

In [566]:
# Score the scikit-learn model on the held-out rows and report its test MSE.
y_pred = model.predict(x_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 39.35089751300326


### Feature Engineering

In [568]:
# Add per-attempt ratio features to the test table (modifies test_data in place).
test_data['Cmp/Att'] = test_data['Cmp'].div(test_data['Att'])
test_data['TD/Att'] = test_data['TD'].div(test_data['Att'])

In [569]:
# Add the same per-attempt ratio features to the training table (modifies df in place).
df['TD/Att'] = df['TD'].div(df['Att'])
df['Cmp/Att'] = df['Cmp'].div(df['Att'])


In [570]:
# Reduced training set: three per-attempt features and the 'Rate' target.
X_new = df.loc[:, ['Yds/Att', 'Cmp/Att', 'TD/Att']]
y_new = df.loc[:, 'Rate']

In [571]:
# Reduced test set: the same three feature columns and the 'Rate' target.
X_new_test = test_data.loc[:, ['Yds/Att', 'Cmp/Att', 'TD/Att']]
y_new_test = test_data.loc[:, 'Rate']

In [572]:
# Confirm the reduced test matrix has the three selected feature columns.
X_new_test.shape

(5, 3)

In [573]:
# Fit a second linear regression on the reduced three-feature training set and
# display the fitted estimator.
model_new = LinearRegression().fit(X_new, y_new)
model_new

In [574]:
# Predictions using the new model
predictions_new = model_new.predict(X_new_test)
print("Predictions using new model:", predictions_new)

# MSE for the new model
mse_new = mean_squared_error(y_new_test, predictions_new)
print("MSE for new model:", mse_new)

Predictions using new model: [101.46333041  88.36948459  84.91186976  90.22565899  98.4606243 ]
MSE for new model: 16.28886722823315


#### A lower MSE means more accurate predictions. Since 16.288 < 39.351, the model with fewer features performed better on the test set.