## Model complexity, overfitting, and the bias-variance tradeoff

In [None]:
### Example data

import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression

# Number of synthetic samples to draw.
NUMS = 10

# Fixed seed so the same points are generated on every run.
np.random.seed(41)

# Inputs: 2 - 3*z, with z drawn from a standard normal.
x = 2 - 3 * np.random.normal(0, 1, NUMS)

# Targets: a cubic in x plus Gaussian noise (mean -25, std 25).
cubic = 4 * x - 2 * (x ** 2) + 0.5 * (x ** 3)
y = cubic + np.random.normal(-25, 25, NUMS)

# Order both arrays by ascending x so line plots run left to right.
indSort = np.argsort(x)
x, y = x[indSort], y[indSort]

plt.scatter(x, y, s=10)
plt.show()

In [None]:
## Linear regression (degree-1 least-squares fit with np.polyfit)

# polyfit returns the coefficients highest power first: [slope, intercept].
mdlfit = np.polyfit(x, y, deg=1)
# Evaluate the fitted line at the sample points.
y_pred = np.polyval(mdlfit, x)

print(mdlfit)

# Samples as dots, fitted line in red.
plt.scatter(x, y, s=10)
plt.plot(x, y_pred, color='r')
plt.show()

In [None]:
## Linear regression (same straight-line model, fitted with scikit-learn)

# Work on column-shaped copies (one sample per row) so the 1-D x and y
# used by the other cells stay untouched.
xx = x.copy().reshape(-1, 1)
yy = y.copy().reshape(-1, 1)

# fit() returns the estimator itself, so construction and fitting chain.
model = LinearRegression().fit(xx, yy)
# NOTE: this rebinds y_pred, replacing the values from the np.polyfit cell.
y_pred = model.predict(xx)

# Samples as dots, fitted line in red.
plt.scatter(xx, yy, s=10)
plt.plot(xx, y_pred, color='r')
plt.show()


In [None]:
## Fitting error of the linear model (root mean squared error)
from sklearn.metrics import mean_squared_error

# y_pred is whatever the most recently executed fit cell assigned; in file
# order that is the scikit-learn prediction.
mse = mean_squared_error(y, y_pred)
# RMSE is the square root of the mean squared residual.
rmse = np.sqrt(mse)
print(f'RMSE:{rmse!s}')

In [None]:
## More complex models (polynomial fits): degree 2

# Least-squares quadratic; coefficients come back highest power first.
mdlfit2 = np.polyfit(x, y, 2)
# Evaluate with np.polyval, as the higher-degree cells do, instead of
# spelling out every power by hand.
y_pred2 = np.polyval(mdlfit2, x)

print(mdlfit2)

# Draw the fitted curve on a dense grid: joining only the sample points
# with straight segments would misrepresent the parabola between them.
# y_pred2 stays on the sample points so the RMSE cell is unaffected.
x_dense = np.linspace(x.min(), x.max(), 200)

plt.scatter(x, y, s=10)
plt.plot(x_dense, np.polyval(mdlfit2, x_dense), color='r')
plt.show()


In [None]:
## Fitting error of the degree-2 model (root mean squared error)
# Mean squared residual between the targets and the quadratic's predictions.
mse = mean_squared_error(y, y_pred2)
rmse = np.sqrt(mse)
print(f'RMSE:{rmse!s}')

In [None]:
## More complex models (polynomial fits): degree 3

# Least-squares cubic; coefficients come back highest power first.
mdlfit3 = np.polyfit(x, y, 3)
# Predictions at the sample points (used by the RMSE cell).
y_pred3 = np.polyval(mdlfit3, x)

print(mdlfit3)

# Draw the fitted curve on a dense grid: joining only the sample points
# with straight segments would misrepresent the cubic between them.
x_dense = np.linspace(x.min(), x.max(), 200)

plt.scatter(x, y, s=10)
plt.plot(x_dense, np.polyval(mdlfit3, x_dense), color='r')
plt.show()


In [None]:
## Fitting error of the degree-3 model (root mean squared error)
# Mean squared residual between the targets and the cubic's predictions.
mse = mean_squared_error(y, y_pred3)
rmse = np.sqrt(mse)
print(f'RMSE:{rmse!s}')

In [None]:
## More complex models (polynomial fits): degree 8

# A degree-8 polynomial has 9 coefficients, almost one per sample when
# NUMS is 10, so the fit is free to chase the noise (overfitting).
mdlfit8 = np.polyfit(x, y, 8)
# Predictions at the sample points (used by the RMSE cell).
y_pred8 = np.polyval(mdlfit8, x)

print(mdlfit8)

# Draw the fitted curve on a dense grid. Joining only the sample points
# with straight segments would hide how the polynomial swings between
# them, which is exactly what this cell is meant to show.
x_dense = np.linspace(x.min(), x.max(), 200)

plt.scatter(x, y, s=10)
plt.plot(x_dense, np.polyval(mdlfit8, x_dense), color='r')
plt.show()


In [None]:
## Fitting error of the degree-8 model (root mean squared error)
# Training error only: it is measured on the same points used for the fit.
mse = mean_squared_error(y, y_pred8)
rmse = np.sqrt(mse)
print(f'RMSE:{rmse!s}')