In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
obesity_data = pd.read_csv("obesity_data.csv")

In [4]:
obesity_data.head()

Unnamed: 0,Age,Gender,Height,Weight,BMI,PhysicalActivityLevel,ObesityCategory
0,56,1,173.575262,71.982051,23.891783,4,Normal weight
1,69,1,164.127306,89.959256,33.395209,2,Obese
2,46,0,168.072202,72.930629,25.817737,4,Overweight
3,32,1,168.459633,84.886912,29.912247,3,Overweight
4,60,1,183.568568,69.038945,20.487903,3,Normal weight


In [7]:
obesity_data.isnull().sum()

Age                      0
Gender                   0
Height                   0
Weight                   0
BMI                      0
PhysicalActivityLevel    0
ObesityCategory          0
dtype: int64

In [8]:
obesity_mapping = {
    'Normal weight': 0,
    'Overweight': 1,
    'Obese': 2
}

obesity_data['ObesityCategory'] = obesity_data['ObesityCategory'].map(obesity_mapping)

In [15]:
obesity_data['ObesityCategory'].fillna(-1, inplace=True)

In [16]:
obesity_data.head()

Unnamed: 0,Age,Gender,Height,Weight,BMI,PhysicalActivityLevel,ObesityCategory
0,56,1,173.575262,71.982051,23.891783,4,0.0
1,69,1,164.127306,89.959256,33.395209,2,2.0
2,46,0,168.072202,72.930629,25.817737,4,1.0
3,32,1,168.459633,84.886912,29.912247,3,1.0
4,60,1,183.568568,69.038945,20.487903,3,0.0


In [17]:
obesity_data.describe()

Unnamed: 0,Age,Gender,Height,Weight,BMI,PhysicalActivityLevel,ObesityCategory
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,49.857,0.523,170.052417,71.205769,24.888317,2.534,0.534
std,18.114267,0.499721,10.309971,15.509849,6.193912,1.116284,0.957999
min,18.0,0.0,136.115719,26.06573,8.470572,1.0,-1.0
25%,35.0,0.0,163.514205,61.129629,20.918068,2.0,0.0
50%,50.0,1.0,169.801665,71.929072,24.698647,3.0,0.0
75%,66.0,1.0,177.353596,81.133746,28.732132,4.0,1.0
max,79.0,1.0,201.41967,118.907366,50.791898,4.0,2.0


In [18]:
obesity_data['ObesityCategory'].value_counts()

ObesityCategory
 0.0    371
 1.0    295
 2.0    191
-1.0    143
Name: count, dtype: int64

In [19]:
X = obesity_data.drop(columns='ObesityCategory', axis=1)
Y = obesity_data['ObesityCategory']

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [21]:
print(X.shape, X_train.shape, X_test.shape)

(1000, 6) (800, 6) (200, 6)


In [22]:
model = LogisticRegression()

In [23]:
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [25]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9725


In [26]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [27]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.975


In [28]:
#saving the model

In [29]:
import pickle

In [30]:
filename = 'obesity_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [32]:
loaded_model = pickle.load(open('obesity_model.sav', 'rb'))

In [33]:
for column in X.columns:
  print(column) 

Age
Gender
Height
Weight
BMI
PhysicalActivityLevel
