# import dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('weight-height.csv')

In [3]:
df.head(10)

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801
5,Male,67.253016,152.212156
6,Male,68.785081,183.927889
7,Male,68.348516,167.971111
8,Male,67.01895,175.92944
9,Male,63.456494,156.399676


In [4]:
df.shape

(8555, 3)

In [5]:
df.describe()

Unnamed: 0,Height,Weight
count,8555.0,8555.0
mean,66.809925,165.632735
std,3.851454,32.043922
min,54.616858,65.78
25%,63.957684,139.876803
50%,66.985923,168.521567
75%,69.604427,190.666305
max,80.45,269.989698


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8555 entries, 0 to 8554
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Gender  8555 non-null   object 
 1   Height  8555 non-null   float64
 2   Weight  8555 non-null   float64
dtypes: float64(2), object(1)
memory usage: 200.6+ KB


In [7]:
df.isnull().sum()

Gender    0
Height    0
Weight    0
dtype: int64

In [8]:
df.corr()

Unnamed: 0,Height,Weight
Height,1.0,0.922975
Weight,0.922975,1.0


# Scaling with Normalization


In [9]:
df1 = df.copy()

In [10]:
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [11]:
from sklearn.preprocessing import MinMaxScaler
norm_scaler = MinMaxScaler()

In [12]:
df1.columns

Index(['Gender', 'Height', 'Weight'], dtype='object')

# Fit and Transform


In [13]:
height = norm_scaler.fit(df1[['Height']])


In [14]:
height

In [15]:
df.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


# Lable Encoding


In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
label_encoder = LabelEncoder()

In [18]:
label_encoder

In [19]:
df1.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [20]:
df1.Gender = label_encoder.fit_transform(df1.Gender)

In [21]:
df1.head()

Unnamed: 0,Gender,Height,Weight
0,1,73.847017,241.893563
1,1,68.781904,162.310473
2,1,74.110105,212.740856
3,1,71.730978,220.04247
4,1,69.881796,206.349801


# Seperate x(Gender, Height) and y (y=Weight)


In [22]:
x = df1.drop('Weight', axis = 'columns')
x

Unnamed: 0,Gender,Height
0,1,73.847017
1,1,68.781904
2,1,74.110105
3,1,71.730978
4,1,69.881796
...,...,...
8550,0,60.483946
8551,0,63.423372
8552,0,65.584057
8553,0,67.429971


In [23]:
y = df1['Weight']
y

0       241.893563
1       162.310473
2       212.740856
3       220.042470
4       206.349801
           ...    
8550    110.565497
8551    129.921671
8552    155.942671
8553    151.678405
8554    131.253738
Name: Weight, Length: 8555, dtype: float64

# Split the dataset into Training and testing


In [24]:
x_train, x_test, y_train, y_test = tts(x,y,test_size=.30,random_state=23)

In [25]:
x_train.head()

Unnamed: 0,Gender,Height
4478,1,65.566101
7317,0,65.946674
712,1,70.213629
7690,0,65.939828
477,1,67.459715


In [26]:
x_train.shape

(5988, 2)

In [27]:
x_test.shape

(2567, 2)

# Applying Linear Regressing


In [28]:
lin_reg = LinearRegression()
lin_reg

In [29]:
lin_reg.fit(x_train, y_train)

In [30]:
lin_pred = lin_reg.predict(x_test)
lin_pred

array([115.75159375, 120.81605161, 198.24315645, ..., 151.22122947,
       187.48938935, 141.87610393])

In [31]:
lin_reg.predict([[0, .74]])

array([-242.82970253])

In [32]:
lin_reg.predict([[1, .74]])

array([-223.40010869])

# Evaluate Model (Accuracy, MSE, MAE)


In [33]:
accuracy = r2_score(y_test, lin_pred)
accuracy

0.8948968858708215

In [34]:
mse = mean_squared_error(y_test, lin_pred)
mse

106.33493434941664

In [35]:
mae = mean_absolute_error(y_test, lin_pred)
mae

8.059548582120026

# Applying KNN Regressor:


In [36]:
from sklearn.neighbors import KNeighborsRegressor


In [37]:
neigh = KNeighborsRegressor(n_neighbors=5)
neigh.fit(x_train, y_train)

In [38]:
knn_pred = neigh.predict(x_test)
knn_pred

array([113.04417562, 111.0063328 , 197.97238004, ..., 149.11994142,
       185.0310776 , 140.98552804])

In [39]:
neigh.predict([[0,.74]])


array([88.88340723])

In [40]:
neigh.predict([[1,.74]])


array([88.88340723])

# Evaluate Model (Accuracy, MSE, MAE)


In [41]:
accuracy = r2_score(y_test, knn_pred)
accuracy

0.8724050852671364

In [42]:
mse = mean_squared_error(y_test, knn_pred)
mse

129.09034136479315

In [77]:
mae = mean_absolute_error(y_test, knn_pred)
mae

8.891601248747953