In [9]:
#Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
data = pd.read_csv("BMI.csv")
data.head()

Unnamed: 0,Gender,Height,Weight,Final_Index
0,Male,174,96,4
1,Male,189,87,2
2,Female,185,110,4
3,Female,195,104,3
4,Male,149,61,3


In [11]:
from statsmodels.stats.outliers_influence 
import variance_inflation_factor

# creating dummies for gender
data['Gender'] = data['Gender'].map({'Male':0, 'Female':1})

# the independent variables set
X = data[['Gender', 'Height', 'Weight']]

# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
print(vif_data)

  feature        VIF
0  Gender   2.028864
1  Height  11.623103
2  Weight  10.688377


In [12]:
data['Height_m'] = data['Height'] / 100  # Convert cm to meters
data['BMI'] = data['Weight'] / (data['Height_m'] ** 2)

In [13]:
data.head()

Unnamed: 0,Gender,Height,Weight,Final_Index,Height_m,BMI
0,0,174,96,4,1.74,31.708284
1,0,189,87,2,1.89,24.355421
2,1,185,110,4,1.85,32.140248
3,1,195,104,3,1.95,27.350427
4,0,149,61,3,1.49,27.47624


In [14]:
data.drop(columns=['Height', 'Weight','Height_m'], inplace=True)

In [15]:
data.head()

Unnamed: 0,Gender,Final_Index,BMI
0,0,4,31.708284
1,0,2,24.355421
2,1,4,32.140248
3,1,3,27.350427
4,0,3,27.47624


In [16]:
# the independent variables set
X = data[['Gender', 'BMI']]

# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
print(vif_data)

  feature      VIF
0  Gender  1.78564
1     BMI  1.78564


In [17]:
x = data[['Gender', 'BMI']]
y = data['Final_Index']

In [18]:
x.head()

Unnamed: 0,Gender,BMI
0,0,31.708284
1,0,24.355421
2,1,32.140248
3,1,27.350427
4,0,27.47624


In [19]:
y.head()

0    4
1    2
2    4
3    3
4    3
Name: Final_Index, dtype: int64

In [20]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

In [21]:
from sklearn.linear_model import LinearRegression
mlr = LinearRegression()
mlr.fit(x_train, y_train)

In [22]:
#Prediction of test set
y_pred_mlr= mlr.predict(x_test)
#Predicted values
print("Prediction for test set: {}".format(y_pred_mlr))

Prediction for test set: [3.95639371 2.52757798 3.23646063 3.24333707 3.74560417 6.21577577
 4.39408786 5.61548125 3.10048045 5.68660694 3.58071751 4.60630458
 2.22673231 4.28748281 4.64658897 2.78075607 6.33161748 4.98600748
 2.9463922  5.44469029 2.36389516 6.41450424 2.03677578 3.46446116
 3.82409473 5.45488193 6.36667656 6.62894234 4.34725887 2.43130724
 3.90309304 5.19982435 3.8518054  3.41165148 3.24281473 2.27956206
 4.63176077 2.3509873  3.10911258 2.51410134 4.34898947 2.21082533
 3.72827665 3.06477226 4.88211612 3.22106606 3.26491077 3.20009744
 3.42992509 3.39976661 3.99974606 3.71734213 2.19863382 2.03914226
 6.32669642 2.66664183 2.75362977 6.55687839 7.08860952 3.86687454
 4.18730409 5.38272064 3.98217744 3.96845512 1.87459593 2.5129583
 3.09479935 3.44761368 2.32365286 2.33534902 2.19037729 3.95033709
 2.44252047 3.15287782 6.30056492 4.93407942 5.05044733 3.58695819
 2.67485325 4.91569809 4.45402181 2.97972411 2.51034806 5.1973576
 3.39282006 4.66101446 2.90315371 4.073

In [23]:
#Actual value and the predicted value
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})
mlr_diff.head()

Unnamed: 0,Actual value,Predicted value
361,4,3.956394
73,2,2.527578
374,4,3.236461
155,4,3.243337
104,4,3.745604


In [24]:
#Model Evaluation
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))
print('R squared: {:.2f}'.format(mlr.score(x,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

R squared: 72.72
Mean Absolute Error: 0.5795644457902391
Mean Square Error: 0.5265655952746275
Root Mean Square Error: 0.725648396452874
