# Supervised Learning- Linear Regression

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Overview
Linear regression is a machine learning, classification algorithm that is used to predict label for continuous data.
<br>
The algortihm consists of sketching a line of best fit in the data, and predictions are made on its basis. This line is sketched by the equation
$$y = mx + c$$
where, for a point, $y$ is the y-coordinate of the point, $x$ is the x-coordinate, $m$ is the slope of the line at that point, and $c$ is the y-intercept of the line. Since, $y$ is the dependent variable, and $x$ is the independent variable, our goal effectively becomes to calculate the value of $m$ and $c$.
<br>
The cost function for our purposes is
$$
f(y) = {1 \over 2n} \sum_{i=1}^n (\hat{y_i} - (mx_i + c))^2
$$
The derivative of this function with respect to $m$ gives us
$$
{dy \over dm} = {-2 \over n} \sum_{i=1}^n (x_i(y_i - (mx_i + c)))
$$
The derivative of this function with respect to $c$ gives us
$$
{dy \over dc} = {2 \over n} \sum_{i=1}^n (y_i(mx_i + c))
$$
This combined give us the descent gradient, which is used to see when our cost function has reached the lowest value.
<br>
But for this to happen, we need a number of iterations, and a learning rate, which will be 1000 and 0.001 respectively.

In [67]:
class LinearRegression:

    def __init__(self, lr = 0.001, n_iters=1000):
        self.lr = lr
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iters):
            y_pred = np.dot(X, self.weights) + self.bias

            dw = (-2/n_samples) * np.dot(X.T, (y_pred-y))
            db = (2/n_samples) * np.sum(y_pred-y)
            try:
                cost = (1/n_samples) * sum([val**2 for val in (y - y_pred)])
            except OverflowError:
                pass

            self.weights = self.weights - self.lr * dw
            self.bias = self.bias - self.lr * db
            print(f"m = {np.sum(self.weights)}\nc = {self.bias}\ncost = {cost}")

    def predict(self, X):
        y_pred = np.dot(X, self.weights) + self.bias
        self.predictions = y_pred
        return y_pred
    
    def error(self, y_test):
        """Returns the Mean Squared Error between the label and the predictions"""
        # return np.mean((y_test - self.predictions)**2)
        total = 0
        for i, pred in enumerate(self.predictions):
            total += ((y_test[i] - pred)**2)
        total /= len(self.predictions)
        return total

## Medical Price Exercise

In [3]:
df = pd.read_csv('./Medical Price Dataset.csv')
print(df.columns)
print(df.describe())
df.head()

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')
               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max      64.000000    53.130000     5.000000  63770.428010


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Dropping Categorical Columns

In [4]:
df.drop(['sex', 'smoker', 'region'], axis=1, inplace=True)

In [5]:
df

Unnamed: 0,age,bmi,children,charges
0,19,27.900,0,16884.92400
1,18,33.770,1,1725.55230
2,28,33.000,3,4449.46200
3,33,22.705,0,21984.47061
4,32,28.880,0,3866.85520
...,...,...,...,...
1333,50,30.970,3,10600.54830
1334,18,31.920,0,2205.98080
1335,18,36.850,0,1629.83350
1336,21,25.800,0,2007.94500


### Training and Testing

In [22]:
y = df['charges']
X = df.drop(['charges'], axis=1)

#### Splitting into train and test split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state = 0)

In [68]:
lin_reg = LinearRegression(lr=0.001, n_iters=50)
lin_reg.fit(X_train, y_train)
lin_reg.predict(X_test);

m = -1997.6875126271243
c = 26.320657074412278
cost = 317759920.1907936
m = -14461.230176193372
c = 192.5127291953005
cost = 7484885535.309011
m = -92257.89708794827
c = 1231.835824372884
cost = 286781723381.0295
m = -577898.1759061367
c = 7721.681040359379
cost = 11170748070921.045
m = -3609513.7624146147
c = 48236.54227704957
cost = 435309903524658.94
m = -22534454.324109565
c = 301153.5297812838
cost = 1.6963661564805788e+16
m = -140673940.99747428
c = 1879996.9728749662
cost = 6.610598387979186e+17
m = -878163136.922162
c = 11735975.103247227
cost = 2.5760954494254445e+19
m = -5481961031.43693
c = 73262208.88407895
cost = 1.0038830641788327e+21
m = -34221304211.209347
c = 457341526.8020108
cost = 3.912049170284602e+22
m = -213627495706.95203
c = 2854967919.1993256
cost = 1.5244931662678185e+24
m = -1333575900493.967
c = 17822220896.03837
cost = 5.940823626785372e+25
m = -8324886618034.408
c = 111255735961.15067
cost = 2.3150897718337883e+27
m = -51968348531421.25
c = 694517190205.6

In [69]:
lin_reg.predictions

array([-9.40310049e+43, -8.69246187e+43, -9.86680679e+43, -1.13027348e+44,
       -8.13965987e+43, -6.29696634e+43, -4.73158343e+43, -9.86003942e+43,
       -7.63761730e+43, -6.83089708e+43, -6.08522863e+43, -8.78178725e+43,
       -8.23534393e+43, -6.44414309e+43, -6.07519905e+43, -8.96605593e+43,
       -9.86438119e+43, -6.32407935e+43, -7.54570961e+43, -5.39255976e+43,
       -8.77548434e+43, -1.02506667e+44, -9.46279544e+43, -8.54551076e+43,
       -5.93365339e+43, -7.53593403e+43, -5.15601505e+43, -8.15118922e+43,
       -6.32427154e+43, -8.54741414e+43, -7.74909239e+43, -1.13102226e+44,
       -1.13379476e+44, -1.04421317e+44, -4.95079071e+43, -6.40194431e+43,
       -9.39008597e+43, -6.94214048e+43, -8.12691963e+43, -5.63652338e+43,
       -6.40844143e+43, -6.27623632e+43, -7.39990593e+43, -1.07964250e+44,
       -5.71443497e+43, -5.71187900e+43, -8.93526894e+43, -7.77424426e+43,
       -5.84685861e+43, -9.45601692e+43, -6.16245118e+43, -5.01872943e+43,
       -8.69129012e+43, -

In [66]:
y_test

578      9724.53000
610      8547.69130
569     45702.02235
1034    12950.07120
198      9644.25250
           ...     
435     13919.82290
1144     9630.39700
390     10736.87075
483      9880.06800
503     32548.34050
Name: charges, Length: 67, dtype: float64