# Linear Regression

## Importing packages

In [2]:
# Packages needed

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# from sklearn.metrics import r2_score

## Data importing

In [3]:
# Read the insurance CSV file into a pandas DataFrame

insurance_data = pd.read_csv("insurance.csv")

# Display the loaded frame as the cell output
insurance_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


## Get a closer look at the dataset

In [4]:
# Summarize the columns: names, non-null counts and dtypes
insurance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


A linear regression model accepts numerical input only, and the dataset has 3 non-numerical features (`sex`, `smoker` and `region`).

## Preprocessing

Encode the 3 non-numerical features as numbers.

1. Sex

In [5]:
# List the distinct values present in the sex column

insurance_data["sex"].unique()

array(['female', 'male'], dtype=object)

In [6]:
# The sex column holds exactly two values, so encode it with integers:
#   'male' -> 1, 'female' -> 2
# Series.map yields NaN for values missing from the dict, so run this cell
# only once on the freshly loaded data.
sex_codes = {'male': 1, 'female': 2}
insurance_data["sex"] = insurance_data["sex"].map(sex_codes)

insurance_data["sex"]

0       2
1       1
2       1
3       1
4       1
       ..
1333    1
1334    2
1335    2
1336    2
1337    2
Name: sex, Length: 1338, dtype: int64

2. Smoker

In [7]:
# List the distinct values present in the smoker column

insurance_data["smoker"].unique()

array(['yes', 'no'], dtype=object)

In [8]:
# Encode the smoker flag with integers: 'yes' -> 1, 'no' -> 0

smoker_codes = {'yes': 1, 'no': 0}
insurance_data["smoker"] = insurance_data["smoker"].map(smoker_codes)

insurance_data["smoker"]

0       1
1       0
2       0
3       0
4       0
       ..
1333    0
1334    0
1335    0
1336    0
1337    1
Name: smoker, Length: 1338, dtype: int64

3. Region

In [9]:
# List the distinct values present in the region column

insurance_data["region"].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [10]:
# Encode the four regions with integers:
#   'southwest' -> 1, 'southeast' -> 2, 'northwest' -> 3, 'northeast' -> 4
# NOTE(review): region is a nominal category, so these codes impose an
# arbitrary ordering on a linear model; one-hot encoding may be a better fit.

region_codes = {
    'southwest': 1,
    'southeast': 2,
    'northwest': 3,
    'northeast': 4,
}
insurance_data["region"] = insurance_data["region"].map(region_codes)

insurance_data["region"]

0       1
1       2
2       2
3       3
4       3
       ..
1333    3
1334    4
1335    2
1336    1
1337    3
Name: region, Length: 1338, dtype: int64

## Relook at the data

In [11]:
# Re-check the dtypes after encoding the categorical columns
insurance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   region    1338 non-null   int64  
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(5)
memory usage: 73.3 KB


## Features and Label

In [12]:
# Features: every column except the target column "charges"

X = insurance_data.drop("charges", axis=1)

X 

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,2,27.900,0,1,1
1,18,1,33.770,1,0,2
2,28,1,33.000,3,0,2
3,33,1,22.705,0,0,3
4,32,1,28.880,0,0,3
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,3
1334,18,2,31.920,0,0,4
1335,18,2,36.850,0,0,2
1336,21,2,25.800,0,0,1


In [13]:
# Labels: the "charges" column is the value the models predict

y = insurance_data["charges"]

y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [14]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## SKLEARN

In [15]:
# Fit scikit-learn's linear regression model on the training set

lin_reg_sktl = LinearRegression().fit(X_train , y_train)

In [16]:
# Predict the charges for the held-out test features

predict_sktl = lin_reg_sktl.predict(X_test)

In [21]:
# Evaluate the scikit-learn model on the test set (reported below as its r2 score)
score_sktl = lin_reg_sktl.score(X_test,y_test)
score_sktl

0.799874714544996

## From Scratch

In [22]:
class LinearRegression_scratch:
    """
    Linear regression model trained with batch gradient descent.

    The model predicts ``X.dot(W) + b`` and, on every iteration, moves the
    weights ``W`` and the bias ``b`` against the gradient of the (halved)
    mean squared error over the whole training set.

    Attributes set by ``fit``:
    - m, n : number of training samples and number of features.
    - W    : weight vector of shape (n,).
    - b    : bias term.
    - X, Y : training features and targets as float numpy arrays.
    """

    def __init__(self, learning_rate, iterations):
        """
        Initialize the LinearRegression object.

        Parameters:
        - learning_rate (float): The step size for gradient descent optimization.
        - iterations (int): The number of gradient descent iterations.
        """
        self.learning_rate = learning_rate
        self.iterations = iterations

    def fit(self, X, Y):
        """
        Fit the linear regression model.

        Parameters:
        - X : The input features of the training data, shape (m, n).
        - Y : The label values of the training data, m values
              (a flat sequence or a single column).

        Returns:
        - self : The LinearRegression object after fitting.

        Raises:
        - ValueError : If X is not 2-dimensional, X is empty, or X and Y
                       do not contain the same number of samples.
        """
        # Work on plain float arrays: pandas objects align on index labels,
        # so an X and Y with different indexes would otherwise fail or
        # produce NaN instead of being paired row by row.
        features = np.asarray(X, dtype=float)
        # Flatten so a column-vector target cannot broadcast the residual
        # into an (m, m) matrix.
        targets = np.asarray(Y, dtype=float).ravel()

        if features.ndim != 2:
            raise ValueError("X must be 2-dimensional (samples, features)")
        if features.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if features.shape[0] != targets.shape[0]:
            raise ValueError("X and Y must contain the same number of samples")

        self.m, self.n = features.shape  # number of samples (m) and features (n)
        self.W = np.zeros(self.n)        # weights start at 0
        self.b = 0.0                     # bias starts at 0
        self.X = features                # training set
        self.Y = targets                 # target values

        # Gradient descent
        for _ in range(self.iterations):
            self.update_weights()

        return self

    def update_weights(self):
        """
        Perform one gradient descent step on the weights and the bias.

        Returns:
        - self : The LinearRegression object after the update.
        """
        # The residual is shared by both gradients, so compute it once.
        residual = self.predict(self.X) - self.Y

        # Gradients of the halved mean squared error
        dW = self.X.T.dot(residual) / self.m
        db = np.sum(residual) / self.m

        # Step against the gradient
        self.W = self.W - self.learning_rate * dW
        self.b = self.b - self.learning_rate * db

        return self

    def predict(self, X):
        """
        Predict the target values for the input features.

        Parameters:
        - X : The input features. Objects with a ``dot`` method (numpy
              arrays, pandas DataFrames) are used as they are, so a
              DataFrame yields a Series; anything else is converted to a
              float numpy array first.

        Returns:
        - y_pred : The predicted target values.
        """
        if not hasattr(X, "dot"):
            X = np.asarray(X, dtype=float)
        y_pred = X.dot(self.W) + self.b
        return y_pred

    def score(self, y_test, y_pred):
        """
        Calculate the R^2 score of a set of predictions.

        Values are compared position by position (index labels of pandas
        inputs are ignored).

        Parameters:
        - y_test : The true target values.
        - y_pred : The predicted target values.

        Returns:
        - r2_score (float): The R^2 score. When the true values are
          constant the ratio is undefined; 1.0 is returned for perfect
          predictions and 0.0 otherwise.

        Raises:
        - ValueError : If the inputs are empty or differ in length.
        """
        y_true = np.asarray(y_test, dtype=float).ravel()
        y_hat = np.asarray(y_pred, dtype=float).ravel()

        if y_true.size == 0:
            raise ValueError("y_test must contain at least one value")
        if y_true.shape != y_hat.shape:
            raise ValueError("y_test and y_pred must have the same length")

        ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
        ss_residual = np.sum((y_true - y_hat) ** 2)

        # Constant targets make ss_total zero; avoid dividing by it.
        if ss_total == 0:
            return 1.0 if ss_residual == 0 else 0.0

        r2_score = 1 - (ss_residual / ss_total)
        return r2_score



In [23]:
# Create the from-scratch linear regression object with its
# gradient descent settings (step size and number of iterations)

lin_reg_scratch = LinearRegression_scratch( learning_rate=0.0001, iterations= 1000)

In [24]:
# Fit the from-scratch model on the training set

lin_reg_scratch.fit(X_train,y_train)


<__main__.LinearRegression_scratch at 0x7f8765fb0c10>

In [25]:
# Predict the charges for the held-out test features

predict_scratch = lin_reg_scratch.predict(X_test)


In [26]:
# Evaluate the from-scratch model: R^2 of its test-set predictions.
# Note the argument order: true values first, then predictions.

score_scratch = lin_reg_scratch.score(y_test , predict_scratch)
score_scratch

0.16524711358013688

## Comparing the results

In [27]:
# Print both R^2 scores side by side for comparison.
# NOTE(review): the labels contain typos ("sckit learn", "mode;"); correcting
# them means re-running this cell so the saved output matches.
print("The Evaluation scores :" )
print("The sckit learn model r2 score =" , score_sktl)
print("The implemented mode; r2 score =" , score_scratch)

The Evaluation scores :
The sckit learn model r2 score = 0.799874714544996
The implemented mode; r2 score = 0.16524711358013688


The implemented model performs much worse than the scikit-learn model.

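The performance can be improved through different data preprocessing techniques and transforms. In particular, the features are on very different scales and are fed to gradient descent unscaled, so with a learning rate of 0.0001 and only 1000 iterations the weights have most likely not converged; standardizing the features (or training for longer) should bring the score closer to the scikit-learn result.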