# IMPORTING NECESSARY LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# LOADING THE DATASET

In [2]:
# Location of the raw CSV, kept in one place so it is easy to repoint.
DATA_PATH = '../input/obesity-classification-dataset/Obesity Classification.csv'

# Read the CSV into a DataFrame and preview the first five rows.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,ID,Age,Gender,Height,Weight,BMI,Label
0,1,25,Male,175,80,25.3,Normal Weight
1,2,30,Female,160,60,22.5,Normal Weight
2,3,35,Male,180,90,27.3,Overweight
3,4,40,Female,150,50,20.0,Underweight
4,5,45,Male,190,100,31.2,Obese


# DATA PREPROCESSING

In [3]:
# (rows, columns) of the loaded DataFrame.
data.shape

(108, 7)

In [4]:
# Per-column dtype and non-null count, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      108 non-null    int64  
 1   Age     108 non-null    int64  
 2   Gender  108 non-null    object 
 3   Height  108 non-null    int64  
 4   Weight  108 non-null    int64  
 5   BMI     108 non-null    float64
 6   Label   108 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 6.0+ KB


# DATA CLEANING

### DROPPING UNWANTED COLUMNS

In [5]:
# Remove the ID and Gender columns so they are not carried into the feature set.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the columns are already gone is a no-op
# rather than a KeyError.
data = data.drop(columns=["ID", "Gender"], errors="ignore")
data.head()

Unnamed: 0,Age,Height,Weight,BMI,Label
0,25,175,80,25.3,Normal Weight
1,30,160,60,22.5,Normal Weight
2,35,180,90,27.3,Overweight
3,40,150,50,20.0,Underweight
4,45,190,100,31.2,Obese


### CHECK FOR NULL VALUES

In [6]:
# Number of missing values in each column.
data.isna().sum()

Age       0
Height    0
Weight    0
BMI       0
Label     0
dtype: int64

### CHECK FOR DUPLICATES

In [7]:
# Count of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

0

### CHECK FOR SKEWNESS

In [8]:
# Skewness of each numeric column. "Label" still holds strings at this point,
# so restrict the computation to numeric columns explicitly: older pandas only
# warns about the implicit drop, while pandas >= 2.0 raises a TypeError.
data.skew(numeric_only=True)

  """Entry point for launching an IPython kernel.


Age       0.821815
Height   -0.112888
Weight    0.196069
BMI      -0.283914
dtype: float64

### FEATURE ENCODING

In [10]:
# Replace the string class names in "Label" with integer codes.
# Comparing the previews before and after this cell, the codes are:
# Normal Weight -> 0, Obese -> 1, Overweight -> 2, Underweight -> 3.
# `le` is kept so the codes can be mapped back with le.inverse_transform.
le=LabelEncoder()
data["Label"]= le.fit_transform(data["Label"])
data.head()

Unnamed: 0,Age,Height,Weight,BMI,Label
0,25,175,80,25.3,0
1,30,160,60,22.5,0
2,35,180,90,27.3,2
3,40,150,50,20.0,3
4,45,190,100,31.2,1


# SPLITTING THE DATA

In [11]:
# Target vector: the encoded class label.
y = data["Label"]
# Feature matrix: every remaining column.
x = data.drop(columns=["Label"])

In [12]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split,
# and therefore every metric computed from it, reproducible across
# Restart & Run All; without it each run shuffles differently.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, train_size=0.80, random_state=42
)

# FEATURE SCALING

In [13]:
# Standardize the features. The scaler's mean/std are learned from the training
# split only and then re-used on the test split, so no test-set statistics leak
# into preprocessing and both splits share one scale.
# (Previously x_train held the fitted scaler object itself, and the scaler was
# re-fitted on the test data.)
# NOTE: the model cells further down are fitted on X_train / X_test (unscaled);
# pass x_train / x_test instead if the scaled features should be used.
sc = StandardScaler()
x_train = sc.fit_transform(X_train)
x_test = sc.transform(X_test)

# BUILDING THE MODEL 

In [14]:
class Naivebayes:
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # Calculate mean, variance, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        y_pred = [self._predict(x) for x in X.values]
        return np.array(y_pred)

    def _predict(self, x):
        posteriors = []

        # Calculate posterior probability for each class
        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            posterior = np.sum(np.log(self._pdf(idx, x)))
            posterior = prior + posterior
            posteriors.append(posterior)
            
        # Return class with the highest posterior probability
        return self._classes[np.argmax(posteriors)]
        
    def _pdf(self, class_idx, x):
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(- (x - mean) ** 2 / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

# IMPLEMENTING THE MODEL

In [15]:
# Fit the hand-written classifier on the training split, then predict a label
# for every row of the held-out test split.
naive = Naivebayes()
naive.fit(X_train,y_train)
y_pred = naive.predict(X_test)

In [16]:
# Predicted (integer-encoded) labels for the test rows.
y_pred

array([3, 2, 3, 3, 0, 1, 3, 3, 0, 1, 0, 2, 1, 0, 2, 0, 3, 1, 3, 3, 0, 3])

### CHECK THE ACCURACY

In [17]:
# Fraction of test rows the hand-written model labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9090909090909091

# IMPLEMENTING THE MODEL USING THE SCIKIT-LEARN LIBRARY

In [18]:
# Reference implementation: train scikit-learn's Gaussian Naive Bayes on the
# same training split and predict the same test rows for comparison.
nb = GaussianNB().fit(X_train, y_train)
y_hat = nb.predict(X_test)

In [19]:
# Fraction of test rows the scikit-learn model labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_hat)

0.9090909090909091