In [151]:
# importing necessary modules

import numpy as np
from sklearn import linear_model, metrics
import matplotlib.pyplot as plt
import pandas as pd

# Question 1. a)

In [152]:

# Synthetic 1-D regression data: a line of slope 4 through the origin plus Gaussian noise

np.random.seed(4)  # fixed seed so a fresh kernel reproduces the same sample
# 250 inputs drawn uniformly from [0, 20), kept as a (250, 1) column
X = 20 * np.random.rand(250, 1)
# one noise term per sample, drawn from a normal with mean 0 and std 3
epsilon = np.random.normal(loc=0, scale=3, size=(250, 1))
# noisy targets, same (250, 1) shape as X
y = 4 * X + epsilon

In [153]:

# Hold-out split by position: the last 50 rows form the test set,
# every row before them is used for training.

X_train, X_test = X[:-50], X[-50:]
y_train, y_test = y[:-50], y[-50:]

In [154]:

# Fit an ordinary least-squares line to the training split

# .fit() returns the estimator itself, so `fit` is just a second name
# for the same fitted model object as `lin_reg`.
lin_reg = linear_model.LinearRegression().fit(X_train, y_train)
fit = lin_reg

In [155]:

# Predict the held-out targets and show them next to the true values

y_pred = lin_reg.predict(X_test)

# Each array is a single column; flatten to 1-D so it becomes one table column.
pd.DataFrame({
    'x_test': X_test.ravel(),
    'y_test': y_test.ravel(),
    'y_pred': y_pred.ravel(),
})

Unnamed: 0,x_test,y_test,y_pred
0,4.614919,20.183513,18.78362
1,1.048698,4.035844,4.497476
2,17.174417,68.404299,69.096481
3,6.784498,30.156443,27.47487
4,7.864935,32.822707,31.803056
5,5.40731,20.385817,21.957906
6,2.464558,10.760031,10.169357
7,9.138785,38.640417,36.906051
8,4.694341,22.378891,19.101782
9,11.263065,46.467241,45.415834


In [156]:

# Score the test-set predictions

rmse_test = np.sqrt(metrics.mean_squared_error(y_test, y_pred))  # error in the units of y
r2_test = metrics.r2_score(y_test, y_pred)  # 1.0 would be a perfect fit

print("\nRoot mean-squared error (RMSE) for test data: ", rmse_test)
print("\nCoefficient of Determination (R^2) for test data: ", r2_test)


Root mean-squared error (RMSE) for test data:  2.2548427173140775

Coefficient of Determination (R^2) for test data:  0.9918023481903316


In [157]:

# Visualise the fit: training points, test points and the model's predictions

# Explicit figure/axes pair: pink figure background, translucent grey plot area.
fig, ax = plt.subplots(facecolor="pink")
ax.set_facecolor((0.85, 0.85, 0.85, 0.8))

# Artist handles are kept but not used further in this cell.
a = ax.scatter(
    X_train, y_train,
    marker='.', color="yellow", alpha=0.8, s=50, label="Training data",
)
b = ax.scatter(
    X_test, y_test,
    marker="o", color="blue", s=40, label="Test data",
)
# Predictions for the test inputs, drawn as a connected line
c = ax.plot(
    X_test, y_pred,
    color="red", alpha=0.8, linewidth=2.4, label="Predicted output",
)

label_style = {"fontsize": 15, "color": "blue"}
ax.set_title("Linear Regression for 1D Data", fontsize=17)
ax.set_xlabel("X", **label_style)
ax.set_ylabel("y", **label_style)
ax.legend(facecolor="pink", shadow=True)
plt.show()


# Report the fitted line's parameters.
# coef_ and intercept_ are indexed here as 2-D and 1-D arrays respectively;
# with a single feature and a single target, element 0 is the only entry.
slope = lin_reg.coef_[0, 0]
intercept = lin_reg.intercept_[0]

print("Slope of the predicted line: ", slope)
print("Intercept of the predicted line: ", intercept)

<IPython.core.display.Javascript object>

Slope of the predicted line:  4.005961152955536
Intercept of the predicted line:  0.2964340404397774


# Question 1. b)

In [158]:

# Synthetic 4-feature regression data (200 samples)

np.random.seed(5)  # fixed seed so a fresh kernel reproduces the same sample

# Each feature is a (200, 1) column drawn from a different source.
x1 = np.linspace(0, 10, 200).reshape(-1, 1)           # feature 1: evenly spaced, ascending, 0..10
x2 = np.random.beta(a=1, b=2, size=(200, 1))          # feature 2: Beta(1, 2)
x3 = np.random.chisquare(df=5, size=(200, 1))         # feature 3: chi-square, 5 degrees of freedom
x4 = np.random.normal(loc=10, scale=2, size=(200, 1)) # feature 4: normal, mean 10, std 2

# Stack the columns side by side into the (200, 4) design matrix.
X = np.hstack([x1, x2, x3, x4])

# Noise term per sample. np.random.rand is uniform on [0, 1), so this noise
# is not zero-mean; a fitted intercept will pick up that offset.
epsilon = np.random.rand(200, 1)

# Targets: fixed linear combination of the four features plus the noise.
y = (
    2 * x1
    + 6 * x2
    + 5 * x3
    + 9 * x4
    + epsilon
)

In [159]:

# Hold-out split by position: the last 40 rows form the test set,
# every row before them is used for training.
#
# NOTE(review): feature 1 is an ascending linspace, so this tail split puts
# only the largest feature-1 values in the test set (outside the range seen
# in training). Consider a shuffled split if a representative test set is
# wanted - confirm whether the positional split is intentional.

X_train, X_test = X[:-40], X[-40:]
y_train, y_test = y[:-40], y[-40:]

In [160]:

# Fit an ordinary least-squares model on the 4-feature training split

# .fit() returns the estimator itself, so `fit` is just a second name
# for the same fitted model object as `lin_reg`.
lin_reg = linear_model.LinearRegression().fit(X_train, y_train)
fit = lin_reg

In [161]:

# Predict the held-out targets and show features, truth and prediction together

y_pred = lin_reg.predict(X_test)

# One table column per feature (columns 0..3 of X_test), followed by the
# true and predicted targets flattened from single-column arrays to 1-D.
pd.DataFrame({
    **{f'Feature-{col + 1}_test': X_test[:, col] for col in range(4)},
    'y_test': y_test.ravel(),
    'y_pred': y_pred.ravel(),
})

Unnamed: 0,Feature-1_test,Feature-2_test,Feature-3_test,Feature-4_test,y_test,y_pred
0,8.040201,0.596753,12.268604,10.802563,178.646575,178.717561
1,8.090452,0.42206,2.545307,8.769214,111.321891,110.881502
2,8.140704,0.455256,5.757572,7.955344,119.755625,119.886036
3,8.190955,0.367947,1.931722,11.043052,127.965069,128.195713
4,8.241206,0.001223,9.596013,9.837546,153.668142,153.519041
5,8.291457,0.156416,1.432434,9.185297,107.582927,107.892443
6,8.341709,0.155473,7.09023,14.166128,181.521963,181.150732
7,8.39196,0.234367,2.258821,10.037547,120.398069,120.369719
8,8.442211,0.057888,14.22518,7.670779,158.260659,157.844787
9,8.492462,0.573064,6.346224,10.84914,150.460198,150.322882


In [162]:
# Print the fitted intercept and the per-feature weights

print(" Optimal weights:\n", "-" * 16)
print("Constant term (p0) = ", lin_reg.intercept_[0])

# Row 0 of coef_ holds one weight per input column; number them from 1.
for idx, weight in zip(range(1, X.shape[1] + 1), lin_reg.coef_[0]):
    print(f"Weight corresponding to feature-{idx} (p{idx}) = {weight}")

 Optimal weights:
 ----------------
Constant term (p0) =  0.38838848871588993
Weight corresponding to feature-1 (p1) = 2.0026345342655216
Weight corresponding to feature-2 (p2) = 5.965508715640061
Weight corresponding to feature-3 (p3) = 4.994445032081478
Weight corresponding to feature-4 (p4) = 9.015710924315925


In [163]:

# Score the test-set predictions of the 4-feature model

test_mse = metrics.mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)  # back in the units of y
test_r2 = metrics.r2_score(y_test, y_pred)  # 1.0 would be a perfect fit

print("\nRoot mean-squared error (RMSE) for test data: ", test_rmse)
print("\nCoefficient of Determination (R^2) for test data: ", test_r2)


Root mean-squared error (RMSE) for test data:  0.29511746545830597

Coefficient of Determination (R^2) for test data:  0.9998897527073871
