# Linear Regression

Implementing linear regression from scratch using `NumPy`.

In [72]:
import numpy as np
import pandas as pd

In [73]:
class LinearRegression:
    """
    A class which implements linear regression model with gradient descent.
    """
    
    def __init__(self, learning_rate=0.01, iterations=1000):
        """
        Initializes the model.
        """
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.weights, self.bias = None, None
        self.loss = []
    
    @staticmethod
    def _mse(y, y_pred):
        """
        Private method to evaluate mean squared error at each iteration.
        
        :param: y - array, true values
        :param: y_pred - array, predicted values
        :return: float
        """
        return np.mean((y - y_pred)**2)
    
    def fit(self, X, y):
        """
        Fits the model to the data by calculating the weights and bias of the linear regression model.
        
        :param: X - array, features
        :param: y - array, target; true values
        :return: None
        """
        # 1. Initialize weights and bias to random values
        self.weights = np.random.rand(X.shape[1])
        self.bias = np.random.rand()
        
        # 2. Perform gradient descent
        for _ in range(self.iterations):
            # 2.1 Calculate the linear equation and the loss
            y_pred = self.predict(X)
            self.loss.append(self._mse(y, y_pred))
            
            # 2.2 Calculate the derivative of the loss function
            partial_w = (1 / X.shape[0]) * (2 * np.dot(X.T, (y_pred - y)))
            partial_d = (1 / X.shape[0]) * (2 * np.sum(y_pred - y))
            
            # 2.3 Update the weights and bias
            self.weights -= self.learning_rate * partial_w
            self.bias -= self.learning_rate * partial_d
    
    def predict(self, X):
        """
        Predicts the target values for the given features.
        
        :param: X - array, features
        :return: array, predicted values
        """
        return np.dot(X, self.weights) + self.bias

In [74]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

def train(X, y):
    """
    Splits the data into training and testing sets, fits the model to the training data and computes the training and testing correlation score.
    
    :param: X - array, features
    :param: y - array, target; true values
    :return: tuple(float, float), training and testing correlation score
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    return r2_score(y_train, y_pred_train), r2_score(y_test, y_pred)


### Q 1: Consider this Advertising data set , implement linear regression for finding out avereage training error and testinng error.
      
       (A) By selecting Single feature 
       (B) By selecting Two Feature 
       (C) By Selecting Three Feature
and find out in which case training and testing error is minimum.

In [75]:
df = pd.read_excel('./data/Advertising.csv', index_col=0)
df

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9
...,...,...,...,...
196,38.2,3.7,13.8,7.6
197,94.2,4.9,8.1,9.7
198,177.0,9.3,6.4,12.8
199,283.6,42.0,66.2,25.5


In [76]:
from sklearn.preprocessing import MinMaxScaler

X, y = df.values[:, :-1], df.values[:, -1]

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [77]:
for i in range(3):
    train_error, test_error = train(X[:, 0:i+1], y)
    print(f"Feature(s): {i+1} -> Train r2 score: {train_error}, test r2 score: {test_error}")

Feature(s): 1 -> Train r2 score: 0.5789822422349453, test r2 score: 0.6506105370100064
Feature(s): 2 -> Train r2 score: 0.8679407974495361, test r2 score: 0.8586328379868469
Feature(s): 3 -> Train r2 score: 0.8644103184269314, test r2 score: 0.847398985157306


### Q2. Consider this 50_Startups data set , implement linear regression for finding out avereage training error and testinng error.
      
       (A) By selecting Single feature 
       (B) By selecting Two Feature 
       (C) By Selecting Three Feature
and find out in which case training and testing error is minimum.
and also find out model accuracy by taking at least 10 new data set from user.

In [78]:
df = pd.read_csv('./data/50_Startups.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [79]:
x = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

In [80]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(x)

In [81]:
for i in range(4):
    train_error, test_error = train(x[:, 0:i+1], y)
    print(f"Feature(s): {i+1} -> Train r2 score: {train_error}, test r2 score: {test_error}")

Feature(s): 1 -> Train r2 score: 0.04091339431196506, test r2 score: -0.5800030717873517
Feature(s): 2 -> Train r2 score: 0.0409893424738933, test r2 score: -0.5916990774330162
Feature(s): 3 -> Train r2 score: 0.04098934255840314, test r2 score: -0.5917165902672936
Feature(s): 4 -> Train r2 score: 0.9473764571110128, test r2 score: 0.9220234023043321
