# **Implementing Linear Regression from scratch and comparing with sklearn Linear regression**

Dataset used : https://www.kaggle.com/datasets/parvmodi/cgpa-vs-package-in-lpa?resource=download

In [4]:
import pandas as pd

In [5]:
df = pd.read_csv("/content/placement.csv")
df.head()

Unnamed: 0,cgpa,package
0,6.89,3.26
1,5.12,1.98
2,7.82,3.25
3,7.42,3.67
4,6.94,3.57


In [52]:
df.describe()


Unnamed: 0,cgpa,package
count,200.0,200.0
mean,6.9905,2.99605
std,1.069409,0.691644
min,4.26,1.37
25%,6.19,2.4875
50%,6.965,2.995
75%,7.7375,3.4925
max,9.58,4.62


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cgpa     200 non-null    float64
 1   package  200 non-null    float64
dtypes: float64(2)
memory usage: 3.3 KB


## Implementing Linear Regression Using ScikitLearn

In [54]:
x = df.iloc[: ,0].values
y = df.iloc[: ,1].values
x,y

(array([6.89, 5.12, 7.82, 7.42, 6.94, 7.89, 6.73, 6.75, 6.09, 8.31, 5.32,
        6.61, 8.94, 6.93, 7.73, 7.25, 6.84, 5.38, 6.94, 7.48, 7.28, 6.85,
        6.14, 6.19, 6.53, 7.28, 8.31, 5.42, 5.94, 7.15, 7.36, 8.1 , 6.96,
        6.35, 7.34, 6.87, 5.99, 5.9 , 8.62, 7.43, 9.38, 6.89, 5.95, 7.66,
        5.09, 7.87, 6.07, 5.84, 8.63, 8.87, 9.58, 9.26, 8.37, 6.47, 6.86,
        8.2 , 5.84, 6.6 , 6.92, 7.56, 5.61, 5.48, 6.34, 9.16, 7.36, 7.6 ,
        5.11, 6.51, 7.56, 7.3 , 5.79, 7.47, 7.78, 8.44, 6.85, 6.97, 6.94,
        8.99, 6.59, 7.18, 7.63, 6.1 , 5.58, 8.44, 4.26, 4.79, 7.61, 8.09,
        4.73, 6.42, 7.11, 6.22, 7.9 , 6.79, 5.83, 6.63, 7.11, 5.98, 7.69,
        6.61, 7.95, 6.71, 5.13, 7.05, 7.62, 6.66, 6.13, 6.33, 7.76, 7.77,
        8.18, 5.42, 8.58, 6.94, 5.84, 8.35, 9.04, 7.12, 7.4 , 7.39, 5.23,
        6.5 , 5.12, 5.1 , 6.06, 7.33, 5.91, 6.78, 7.93, 7.29, 6.68, 6.37,
        5.84, 6.05, 7.2 , 6.1 , 5.64, 7.14, 7.91, 7.19, 7.91, 6.76, 6.93,
        4.85, 6.17, 5.84, 6.07, 5.66, 

In [55]:
#train test split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x, y ,test_size=0.2,random_state=2)

In [56]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [57]:
print(x_train.shape)
print("-----------")
print(x_train.reshape(-1, 1).shape)
print(y_train.shape)

(160,)
-----------
(160, 1)
(160,)


In [58]:
lr.fit(x_train.reshape(-1, 1) , y_train )

In [59]:
lr.predict(x_test.reshape(-1, 1))

array([3.89111601, 3.09324469, 2.38464568, 2.57434935, 1.6537286 ,
       1.77647803, 2.07219258, 2.93143862, 3.76278706, 2.93701814,
       4.09197872, 3.51170867, 2.97049525, 2.40138424, 3.18809652,
       3.46707251, 1.94386362, 3.24389172, 2.97607477, 3.41685683,
       2.55761079, 3.16577844, 2.85890486, 3.12114229, 3.68467378,
       2.8700639 , 3.49497011, 3.34432308, 3.91901361, 1.96060218,
       3.65119666, 3.2104146 , 3.74046898, 2.7863711 , 2.78079158,
       3.27178932, 3.52844723, 2.61340599, 2.65804215, 2.71383735])

In [60]:
print(y_test)

[4.1  3.49 2.08 2.33 1.94 1.48 1.86 3.09 4.21 2.87 3.65 4.   2.89 2.6
 2.99 3.25 1.86 3.67 2.37 3.42 2.48 3.65 2.6  2.83 4.08 2.56 3.58 3.81
 4.09 2.01 3.63 2.92 3.51 1.94 2.21 3.34 3.34 3.23 2.01 2.61]


In [70]:
lr.predict([[10]])

array([4.68340781])

## From scratch

In [62]:
class custom_lr :
  def __init__(self):
    self.m=None
    self.b=None

  def fit(self , x_train , y_train):
    numerator = 0
    denominator = 0
    for i in range (x_train.shape[0]):
      numerator = numerator + ((x_train[i]- x_train.mean() )* (y_train[i] - y_train.mean()))
      denominator = denominator + ((x_train[i] - x_train.mean())**2)
    self.m = numerator /  denominator
    self.b = y_train.mean() -  self.m * x_train.mean()
    print(f"m = {self.m}" )
    print(f"b = {self.b}" )

  def predict(self,x_test):
    # y_predicted = mx+b
    return self.m * x_test + self.b


In [63]:
c_lr = custom_lr()

In [64]:
c_lr.fit(x_train , y_train)

m = 0.5579519734250721
b = -0.8961119222429152


In [65]:
c_lr.predict(x_test)

array([3.89111601, 3.09324469, 2.38464568, 2.57434935, 1.6537286 ,
       1.77647803, 2.07219258, 2.93143862, 3.76278706, 2.93701814,
       4.09197872, 3.51170867, 2.97049525, 2.40138424, 3.18809652,
       3.46707251, 1.94386362, 3.24389172, 2.97607477, 3.41685683,
       2.55761079, 3.16577844, 2.85890486, 3.12114229, 3.68467378,
       2.8700639 , 3.49497011, 3.34432308, 3.91901361, 1.96060218,
       3.65119666, 3.2104146 , 3.74046898, 2.7863711 , 2.78079158,
       3.27178932, 3.52844723, 2.61340599, 2.65804215, 2.71383735])

In [68]:
c_lr.predict(10)

np.float64(4.683407812007806)

##  comparing custom and sklearn LR

custom LR coff :

m = 0.5579519734250721  
b = -0.8961119222429152

Sklearn lr constants

In [66]:
lr.coef_    #m

array([0.55795197])

In [67]:
lr.intercept_ #b

np.float64(-0.8961119222429144)