# Import Library

In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

## Read File

In [58]:
data = pd.read_csv('data.csv')
data

Unnamed: 0,ID,Age,Gender,Height,Weight,BMI,Label
0,1,25.0,,175.0,80,25.3,Normal Weight
1,2,30.0,Female,160.0,60,22.5,Normal Weight
2,3,35.0,Male,180.0,90,27.3,Overweight
3,4,40.0,Female,150.0,50,20.0,Underweight
4,5,45.0,Male,190.0,100,31.2,Obese
...,...,...,...,...,...,...,...
105,108,21.0,Male,180.0,15,5.6,Underweight
106,109,26.0,Female,150.0,15,5.6,Underweight
107,110,31.0,Male,190.0,20,8.3,Underweight
108,111,30.0,,175.0,80,,Normal Weight


## Show DataFrame

In [59]:
data.describe()

Unnamed: 0,ID,Age,Height,Weight,BMI
count,110.0,109.0,109.0,110.0,109.0
mean,57.054545,46.174312,166.651376,59.681818,20.498165
std,32.487919,24.598571,27.756006,28.657078,7.567315
min,1.0,11.0,120.0,10.0,3.9
25%,29.25,27.0,140.0,35.0,16.7
50%,57.5,42.0,175.0,57.5,21.2
75%,84.75,58.0,190.0,83.75,26.1
max,112.0,112.0,210.0,120.0,37.2


# PreProcessing

## Clean Missing Value
(Dropna)

In [64]:
data = data.dropna()
data.head()

Unnamed: 0,ID,Age,Gender,Height,Weight,BMI,Label
1,2,30.0,Female,160.0,60,22.5,Normal Weight
2,3,35.0,Male,180.0,90,27.3,Overweight
3,4,40.0,Female,150.0,50,20.0,Underweight
4,5,45.0,Male,190.0,100,31.2,Obese
5,6,50.0,Female,140.0,40,16.7,Underweight


In [74]:
data = data.drop('ID', axis = 1)
data
# as it has no importance at all

Unnamed: 0,Age,Gender,Height,Weight,BMI,Label
1,30.0,Female,160.0,"(55, 85]",22.5,Normal Weight
2,35.0,Male,180.0,"(85, 125]",27.3,Overweight
3,40.0,Female,150.0,"(35, 55]",20.0,Underweight
4,45.0,Male,190.0,"(85, 125]",31.2,Obese
5,50.0,Female,140.0,"(35, 55]",16.7,Underweight
...,...,...,...,...,...,...
103,11.0,Male,175.0,"(5, 35]",3.9,Underweight
104,16.0,Female,160.0,"(5, 35]",3.9,Underweight
105,21.0,Male,180.0,"(5, 35]",5.6,Underweight
106,26.0,Female,150.0,"(5, 35]",5.6,Underweight


In [75]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106 entries, 1 to 107
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   Age     106 non-null    float64 
 1   Gender  106 non-null    object  
 2   Height  106 non-null    float64 
 3   Weight  106 non-null    category
 4   BMI     106 non-null    float64 
 5   Label   106 non-null    object  
dtypes: category(1), float64(3), object(2)
memory usage: 9.4+ KB


## Binnig
Weight===>
    Label=(Small Weight, Medium Weight, High Weight, Very Hight Weight)
    
   bins=(5 ===> 125) 
    )

In [None]:
bins_Weight = [5, 35, 55, 85, 125]

In [None]:
categories_Weight = pd.cut(data['Weight'], bins_Weight)

In [66]:
data['Weight'] = categories_Weight
data['Weight']

1       (55, 85]
2      (85, 125]
3       (35, 55]
4      (85, 125]
5       (35, 55]
         ...    
103      (5, 35]
104      (5, 35]
105      (5, 35]
106      (5, 35]
107      (5, 35]
Name: Weight, Length: 106, dtype: category
Categories (4, interval[int64, right]): [(5, 35] < (35, 55] < (55, 85] < (85, 125]]

In [None]:
# I will bin the height as well 

In [174]:
bins_Height = [115, 140, 175, 190, 210]

In [175]:
categories_Height = pd.cut(data['Height'], bins_Height)

In [176]:
data['Height'] = categories_Height
data['Height']

1      (140, 175]
2      (175, 190]
3      (140, 175]
4      (175, 190]
5      (115, 140]
          ...    
103    (140, 175]
104    (140, 175]
105    (175, 190]
106    (140, 175]
107    (175, 190]
Name: Height, Length: 106, dtype: category
Categories (4, interval[int64, right]): [(115, 140] < (140, 175] < (175, 190] < (190, 210]]

## Encoding
(Gender- Label- Weight_binnig)

In [177]:
le = LabelEncoder()
data['Gender'] = le.fit_transform(data['Gender'])
data['Weight'] = le.fit_transform(data['Weight'])
data['Label'] = le.fit_transform(data['Label'])
data['Height'] = le.fit_transform(data['Height'])
data

Unnamed: 0,Age,Gender,Height,Weight,BMI,Label
1,30.0,0,1,2,22.5,0
2,35.0,1,2,3,27.3,2
3,40.0,0,1,1,20.0,3
4,45.0,1,2,3,31.2,1
5,50.0,0,0,1,16.7,3
...,...,...,...,...,...,...
103,11.0,1,1,0,3.9,3
104,16.0,0,1,0,3.9,3
105,21.0,1,2,0,5.6,3
106,26.0,0,1,0,5.6,3


# Modeling 

## Split data

In [257]:
# X= All Feature except "Label"
X = data.drop('Label', axis = 1)
# y= feature "Label"
y = data['Label']

## Train Model

In [466]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
log_reg = LogisticRegression(penalty = 'l2', C = 1, max_iter = 10000)
log_reg.fit(X_train, y_train)

## y_Prediction

In [467]:
y_pred = log_reg.predict(X_test)

## Accuracy

In [468]:
acc = accuracy_score(y_test, y_pred)
acc

0.9090909090909091

# Smotting

In [469]:
smote = SMOTE()

In [470]:
X_smote, y_smote = smote.fit_resample(X, y)

In [491]:
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_smote, y_smote, test_size=0.3, random_state=46)

In [492]:
log_reg_smote = LogisticRegression(penalty ='l2', C=1, max_iter=100000)
log_reg_smote.fit(X_train_smote, y_train_smote)

In [493]:
y_pred_smote = log_reg_smote.predict(X_test_smote)

In [494]:
acc_smote = accuracy_score(y_test_smote, y_pred_smote)
acc_smote

1.0

# Small Feedback

In [None]:
# It was a problem to teach us Perceptron while the dataset is multiclass, the accuracy with perceptron was 45.4%, you should
# have said this task be answered in Log_Regression 