# Polynomial Regression from Scratch without SKlearn

In [84]:
#importing the required packages
import numpy as np
import pandas as pd

In [85]:
df = pd.read_csv('C:/Users/Nagendra Swamy/Desktop/6000/HousingData.csv')

In [86]:
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [87]:
X = df['MEDV']
y = df.drop('MEDV', axis = 1)

In [88]:
#Separating the independent and dependent features
#Dependent feature
y = np.asarray(y.values.tolist()) 

# Independent Feature
X = np.asarray(X.values.tolist())

In [89]:
# Reshaping the independent feature
X = X.reshape(-1,1)
#Reshaping the Dependent features
y = y.reshape(6578,1) 

In [90]:
#Get the shapes of X and y
print("The shape of the independent fatures are ",X.shape)
print("The shape of the dependent fatures are ",y.shape)

The shape of the independent fatures are  (506, 1)
The shape of the dependent fatures are  (6578, 1)


In [91]:
# The method "poly_features" concatenates polynomials of independent feature to X
# This is similar to PolynomialFeatures class from sklearn.preprocessing
def poly_features(features, X):
  data = pd.DataFrame(np.zeros((X.shape[0],features)))
  for i in range(1,features+1):
    data.iloc[:,i-1] = (X**i).reshape(-1,1)
  X_poly = np.array(data.values.tolist())
  return X_poly

In [92]:
# The method "split_data" splits the given dataset into trainset and testset
# This is similar to the method "train_test_split" from "sklearn.model_selection"
def split_data(X,y,test_size=0.2,random_state=0):
    np.random.seed(random_state)                  #set the seed for reproducible results
    indices = np.random.permutation(len(X))       #shuffling the indices
    data_test_size = int(X.shape[0] * test_size)  #Get the test size

    #Separating the Independent and Dependent features into the Train and Test Set
    train_indices = indices[data_test_size:]
    test_indices = indices[:data_test_size]
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    return X_train, y_train, X_test, y_test

In [93]:
class polynomialRegression():

  def __init__(self):
    #No instance Variables required
    pass

  def fit(self,X,y,W):
    """
    Parameters:
    X (array) : Independent Features
    y (array) : Dependent Features/ Target Variable
    W (array) : Weights 

    Returns:
    loss (float) : Calculated Sqaured Error Loss for y and y_pred
    y_pred (array) : Predicted Target Variable
    """
    y_pred = sum(W * X)
    loss = ((y_pred-y)**2)/2    #Loss = Squared Error, we introduce 1/2 for ease in the calculation
    return loss, y_pred

  def updateWeights(self,X,y_pred,y_true,W,alpha,index):
    """
    Parameters:
    X (array) : Independent Features
    y_pred (array) : Predicted Target Variable
    y_true (array) : Dependent Features/ Target Variable
    W (array) : Weights
    alpha (float) : learning rate
    index (int) : Index to fetch the corresponding values of W, X and y 

    Returns:
    W (array) : Update Values of Weight
    """
    for i in range(X.shape[1]):
      #alpha = learning rate, rest of the RHS is derivative of loss function
      W[i] -= (alpha * (y_pred-y_true[index])*X[index][i]) 
    return W

  def train(self, X, y, epochs=10, alpha=0.001, random_state=0):
    """
    Parameters:
    X (array) : Independent Feature
    y (array) : Dependent Features/ Target Variable
    epochs (int) : Number of epochs for training, default value is 10
    alpha (float) : learning rate, default value is 0.001

    Returns:
    y_pred (array) : Predicted Target Variable
    loss (float) : Calculated Sqaured Error Loss for y and y_pred
    """

    num_rows = X.shape[0] #Number of Rows
    num_cols = X.shape[1] #Number of Columns 
    W = np.random.randn(1,num_cols) / np.sqrt(num_rows) #Weight Initialization

    #Calculating Loss and Updating Weights
    train_loss = []
    num_epochs = []
    train_indices = [i for i in range(X.shape[0])]
    for j in range(epochs):
      cost=0
      np.random.seed(random_state)
      np.random.shuffle(train_indices)
      for i in train_indices:
        loss, y_pred = self.forward(X[i],y[i],W[0])
        cost+=loss
        W[0] = self.updateWeights(X,y_pred,y,W[0],alpha,i)
      train_loss.append(cost)
      num_epochs.append(j)
    return W[0], train_loss, num_epochs

  def test(self, X_test, y_test, W_trained):
    """
    Parameters:
    X_test (array) : Independent Features from the Test Set
    y_test (array) : Dependent Features/ Target Variable from the Test Set
    W_trained (array) : Trained Weights
    test_indices (list) : Index to fetch the corresponding values of W_trained,
                          X_test and y_test 

    Returns:
    test_pred (list) : Predicted Target Variable
    test_loss (list) : Calculated Sqaured Error Loss for y and y_pred
    """
    test_pred = []
    test_loss = []
    test_indices = [i for i in range(X_test.shape[0])]
    for i in test_indices:
        loss, y_test_pred = self.forward(X_test[i], W_trained, y_test[i])
        test_pred.append(y_test_pred)
        test_loss.append(loss)
    return test_pred, test_loss
    

  def predict(self, W_trained, X_test):
    prediction = sum(W_trained * X_test)
    return prediction

In [94]:
# Reshaping the independent feature
X = X.reshape(-1,1)

In [95]:
X

array([[24. ],
       [21.6],
       [34.7],
       [33.4],
       [36.2],
       [28.7],
       [22.9],
       [27.1],
       [16.5],
       [18.9],
       [15. ],
       [18.9],
       [21.7],
       [20.4],
       [18.2],
       [19.9],
       [23.1],
       [17.5],
       [20.2],
       [18.2],
       [13.6],
       [19.6],
       [15.2],
       [14.5],
       [15.6],
       [13.9],
       [16.6],
       [14.8],
       [18.4],
       [21. ],
       [12.7],
       [14.5],
       [13.2],
       [13.1],
       [13.5],
       [18.9],
       [20. ],
       [21. ],
       [24.7],
       [30.8],
       [34.9],
       [26.6],
       [25.3],
       [24.7],
       [21.2],
       [19.3],
       [20. ],
       [16.6],
       [14.4],
       [19.4],
       [19.7],
       [20.5],
       [25. ],
       [23.4],
       [18.9],
       [35.4],
       [24.7],
       [31.6],
       [23.3],
       [19.6],
       [18.7],
       [16. ],
       [22.2],
       [25. ],
       [33. ],
       [23.5],
       [19

In [96]:
X = poly_features(2,X)

In [98]:
#Adding the feature X0 = 1, so we have the equation: y =  W0 + (W1 * X1) + (W2 * (X1**2))
X = np.concatenate((X,np.ones((506,1))), axis = 1)

In [99]:
X

array([[2.40000e+01, 5.76000e+02, 1.00000e+00],
       [2.16000e+01, 4.66560e+02, 1.00000e+00],
       [3.47000e+01, 1.20409e+03, 1.00000e+00],
       ...,
       [2.39000e+01, 5.71210e+02, 1.00000e+00],
       [2.20000e+01, 4.84000e+02, 1.00000e+00],
       [1.19000e+01, 1.41610e+02, 1.00000e+00]])

In [100]:
y

array([[6.320e-03],
       [1.800e+01],
       [2.310e+00],
       ...,
       [2.100e+01],
       [3.969e+02],
       [7.880e+00]])

In [101]:
#Splitting the dataset
X_train, y_train, X_test, y_test = split_data(X,y,test_size=0.2,random_state=7844)

In [102]:
# model fitting
model=polynomialRegression()
model.fit(X_train, y_train,1)

(array([[3.93926644e+07, 2.92669641e+10, 6.72916005e+03],
        [4.19444465e+07, 2.93354672e+10, 7.95957121e+04],
        [4.19347567e+07, 2.93352109e+10, 7.91741424e+04],
        ...,
        [           nan,            nan,            nan],
        [4.19628766e+07, 2.93359545e+10, 8.04005000e+04],
        [4.19628766e+07, 2.93359545e+10, 8.04005000e+04]]),
 array([  9165.1 , 242226.85,    405.  ]))

In [103]:
predictions = model.predict(1, X_test)

In [127]:
from sklearn.metrics import r2_score
r2score = r2_score(y_test, predictions)

ValueError: Found input variables with inconsistent numbers of samples: [101, 3]

In [116]:
import time
start_time = time.time()
end_time = time.time()
elapsed_time = end_time - start_time

In [118]:
N = X_train.shape[0]  # Number of samples in the training set

d = X_train.shape[1]  # Number of features

time_complexity = elapsed_time / (N**2 * d)

In [124]:
print("Time complexity of fitting a polynomial regression model on the Boston Housing dataset is {:.2f}".format(time_complexity))

Time complexity of fitting a polynomial regression model on the Boston Housing dataset is 0.00
