In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('weight-height - weight-height.csv')
df

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.042470
4,Male,69.881796,206.349801
...,...,...,...
8550,Female,60.483946,110.565497
8551,Female,63.423372,129.921671
8552,Female,65.584057,155.942671
8553,Female,67.429971,151.678405


In [3]:
df.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [4]:
df.tail()

Unnamed: 0,Gender,Height,Weight
8550,Female,60.483946,110.565497
8551,Female,63.423372,129.921671
8552,Female,65.584057,155.942671
8553,Female,67.429971,151.678405
8554,Female,60.921791,131.253738


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8555 entries, 0 to 8554
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Gender  8555 non-null   object 
 1   Height  8555 non-null   float64
 2   Weight  8555 non-null   float64
dtypes: float64(2), object(1)
memory usage: 200.6+ KB


In [6]:
df.describe()

Unnamed: 0,Height,Weight
count,8555.0,8555.0
mean,66.809925,165.632735
std,3.851454,32.043922
min,54.616858,65.78
25%,63.957684,139.876803
50%,66.985923,168.521567
75%,69.604427,190.666305
max,80.45,269.989698


In [7]:
df.isnull().sum()

Gender    0
Height    0
Weight    0
dtype: int64

In [8]:
df.shape

(8555, 3)

In [9]:
df = pd.get_dummies(df, columns=['Gender'], drop_first=True)   # Convert 'Gender' to numeric using one-hot encoding

In [10]:
x = df[['Height', 'Gender_Male']] 
y = df['Weight'] 

In [11]:
x.head()

Unnamed: 0,Height,Gender_Male
0,73.847017,True
1,68.781904,True
2,74.110105,True
3,71.730978,True
4,69.881796,True


In [12]:
y.head()

0    241.893563
1    162.310473
2    212.740856
3    220.042470
4    206.349801
Name: Weight, dtype: float64

Linear Regression Model

In [13]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=42)

In [14]:
xtrain.shape

(5988, 2)

In [15]:
ytrain.shape

(5988,)

In [16]:
ytrain.shape

(5988,)

In [17]:
reg = LinearRegression()

In [18]:
reg.fit(xtrain, ytrain)

In [19]:
ytrain_pred = reg.predict(xtrain)

In [20]:
ytrain_pred

array([178.35427565, 202.22486558, 129.22191775, ..., 139.34406368,
       187.17224616, 158.78183812])

In [21]:
mse = mean_squared_error(ytrain, ytrain_pred)
print(f"MSE: {mse}")

MSE: 105.23806184862872


Evaluate Linear Regression Model

In [22]:
train_score = reg.score(xtrain, ytrain)   # train accuracy.

In [23]:
print(f"train_accuracy: {train_score}")

train_accuracy: 0.8973793060969246


In [24]:
test_score = reg.score(xtest, ytest)     # test accuracy.

In [25]:
print(f"test_accuracy: {test_score}")

test_accuracy: 0.905911242442266


K-NN for Regression

In [26]:
knn = KNeighborsRegressor(n_neighbors=3)

In [27]:
knn.fit(x,y)

In [28]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=42)

In [29]:
train_score = reg.score(xtrain, ytrain)

In [30]:
print(f"Training Accuracy: {train_score}")

Training Accuracy: 0.8973793060969246


In [31]:
test_score = reg.score(xtest, ytest)

In [32]:
print(f"Testing Accuracy: {test_score}")

Testing Accuracy: 0.905911242442266


In [33]:
pred_weight = knn.predict(xtest) 

In [34]:
pred_weight

array([144.58150493, 179.2830548 , 187.4331135 , ...,  92.71668391,
       185.09988813, 139.24252513])

In [35]:
mse_knn = mean_squared_error(ytest, pred_weight)
print(f"MSE for K-NN: {mse_knn}")

MSE for K-NN: 67.5371533691142


Compare K-NN & Linear Regression Model

In [36]:
print("\n1. Linear Regression Models:")
print(f"Training Accuracy: {train_score}")
print(f"Testing Accuracy: {test_score}")
print(f"MSE: {mse}\n")

print("\n2. K-NN Regressorn Models:")
print(f"Training Accuracy: {train_score}")
print(f"Testing Accuracy: {test_score}")
print(f"MSE: {mse_knn}")


1. Linear Regression Models:
Training Accuracy: 0.8973793060969246
Testing Accuracy: 0.905911242442266
MSE: 105.23806184862872


2. K-NN Regressorn Models:
Training Accuracy: 0.8973793060969246
Testing Accuracy: 0.905911242442266
MSE: 67.5371533691142
