In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv("heart.data.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,biking,smoking,heart.disease
0,1,30.801246,10.896608,11.769423
1,2,65.129215,2.219563,2.854081
2,3,1.959665,17.588331,17.177803
3,4,44.800196,2.802559,6.816647
4,5,69.428454,15.974505,4.062224


In [4]:
df.tail()

Unnamed: 0.1,Unnamed: 0,biking,smoking,heart.disease
493,494,47.66044,27.562464,11.294392
494,495,45.097203,21.38562,9.616762
495,496,8.279743,6.42372,13.495168
496,497,42.345863,20.741328,10.115865
497,498,30.774254,23.610175,11.843556


In [5]:
df.drop(labels='Unnamed: 0',axis=1,inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   biking         498 non-null    float64
 1   smoking        498 non-null    float64
 2   heart.disease  498 non-null    float64
dtypes: float64(3)
memory usage: 11.8 KB


In [7]:
df.describe()

Unnamed: 0,biking,smoking,heart.disease
count,498.0,498.0,498.0
mean,37.788411,15.435034,10.174538
std,21.481519,8.289959,4.571874
min,1.119154,0.52585,0.551898
25%,20.204598,8.279776,6.513683
50%,35.824459,15.814614,10.385255
75%,57.852786,22.568925,13.724024
max,74.907111,29.946743,20.453496


In [8]:
df.corr()

Unnamed: 0,biking,smoking,heart.disease
biking,1.0,0.015136,-0.935455
smoking,0.015136,1.0,0.309131
heart.disease,-0.935455,0.309131,1.0


In [9]:
mean_of_heart_disease = df['heart.disease'].mean()
mean_of_biking = df['biking'].mean()
mean_of_smoking = df['smoking'].mean()

In [10]:
# for calculating multilinear regression with two independent variables 
# y = B0 + B1*X1 + B2*X2 + E
# where,
# y = predicted value
# B0 = slope
# B1 = regression coefficient for first variable
# B2 = regression coefficient for second variable
# X1 = the value from the data of first variable
# X2 = the value from the data of second variable
# E = model error (how much variation there is in our estimate of y)

In [11]:
# Calculating regression coefficient for biking and smoking i.e B1 and B2

In [12]:
# B1 = (sum of (X2**2))*(sum of (X1 * y))-(sum of (X1*X2))*(sum of (X2*y)) / (sum of (X1**2))*(sum of (X2**2))- (sum of X1*X2)**2
# B2 = (sum of (X1**2))*(sum of (X2 * y))-(sum of (X1*X2))*(sum of (X1*y)) / (sum of (X1**2))*(sum of (X2**2))- (sum of X1*X2)**2

In [13]:
# B0 = mean_of_y - B1*mean_X1 - B2*mean_X2

In [14]:
# let's start with B1
X1_biking = np.array(df['biking'])
X2_smoking = np.array(df['smoking'])
y_heart_disease = np.array(df['heart.disease'])

In [15]:
numerator = (sum(X2_smoking**2)*sum(X1_biking*y_heart_disease)) - (sum(X1_biking*X2_smoking)*sum(X2_smoking*y_heart_disease))
denominator = (sum(X1_biking**2)*sum(X2_smoking**2)) - (sum(X1_biking*X2_smoking))**2 

In [16]:
B1 = numerator/denominator

In [17]:
# let's find out B2
numerator = (sum(X1_biking**2))*(sum(X2_smoking * y_heart_disease))-(sum(X1_biking*X2_smoking))*(sum(X1_biking*y_heart_disease))
denominator = (sum(X1_biking**2))*(sum(X2_smoking**2))-(sum(X1_biking*X2_smoking))**2
B2 = numerator/denominator

In [18]:
B1

-0.03827375916138547

In [19]:
B2

0.6230370992658973

In [20]:
# let's find out B0 i.e the slope
B0 = mean_of_heart_disease - B1*mean_of_biking - B2*mean_of_smoking

In [21]:
pred_y = B0 + B1*X1_biking + B2*X2_smoking

In [24]:
pred_y

array([ 7.61435513,  0.89437385, 12.88742223,  2.03566979,  9.29966463,
       18.19766887,  5.77192288,  9.81781288,  6.95950226, 15.15766879,
        9.01429853, 15.60074965,  7.00563558,  5.14329023,  5.33264537,
        8.98258639, 10.1357705 ,  4.29288242,  8.37430366, 13.8666643 ,
       10.16833609, 17.74646543, 17.84257161, 15.90056752, 11.42413549,
       17.22759105,  2.16922961,  1.60419294,  6.61662704,  4.67832676,
        2.78553872, 16.9740127 , 11.46082549,  9.6098033 ,  7.66812178,
       11.72228306, 14.30069409, 16.17677351, 14.55930472,  3.18903459,
       14.60962241,  2.19234162,  3.81763112,  3.69618927,  9.66038098,
        9.29747749, 13.33000113,  5.98283877,  8.68595411, 12.99677746,
        0.50404274,  8.55017966, 10.14041669, 16.46861693, 14.71042976,
        1.62443977, 15.4977896 ,  3.68194818,  7.79180704,  5.83113704,
        0.98945905,  5.76207781,  6.48490618,  5.02709297,  5.62611446,
       14.1420078 , 10.02691638,  9.44940535, 11.21243604, 16.45