In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning from here on so library notices do not clutter cell output.
# NOTE(review): this also hides deprecation and data-quality warnings from
# pandas / scikit-learn — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Location of the gender-classification CSV. The original local path is kept as
# the default so existing runs behave exactly as before; set the
# GENDER_DATASET_CSV environment variable to point at the file on another machine.
DATA_PATH = os.environ.get(
    "GENDER_DATASET_CSV",
    r"C:\Users\PC\Downloads\gender_classification_v7.csv",
)

# Load the raw dataset and display it (pandas truncates long frames on display).
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female
...,...,...,...,...,...,...,...,...
4996,1,13.6,5.1,0,0,0,0,Female
4997,1,11.9,5.4,0,0,0,0,Female
4998,1,12.9,5.7,0,0,0,0,Female
4999,1,13.2,6.2,0,0,0,0,Female


In [3]:
# Inspect column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   long_hair                  5001 non-null   int64  
 1   forehead_width_cm          5001 non-null   float64
 2   forehead_height_cm         5001 non-null   float64
 3   nose_wide                  5001 non-null   int64  
 4   nose_long                  5001 non-null   int64  
 5   lips_thin                  5001 non-null   int64  
 6   distance_nose_to_lip_long  5001 non-null   int64  
 7   gender                     5001 non-null   object 
dtypes: float64(2), int64(5), object(1)
memory usage: 312.7+ KB


In [4]:
# Count the missing values in each column.
df.isna().sum()

long_hair                    0
forehead_width_cm            0
forehead_height_cm           0
nose_wide                    0
nose_long                    0
lips_thin                    0
distance_nose_to_lip_long    0
gender                       0
dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder

In [6]:
# Turn the text labels of the target column into integer codes and write them
# back over the same column. In the displayed rows Female becomes 0 and Male 1.
Encoder = LabelEncoder()
gender_codes = Encoder.fit_transform(df["gender"])
df["gender"] = gender_codes

In [7]:
# Display the target column after label encoding.
df["gender"]

0       1
1       0
2       1
3       1
4       0
       ..
4996    0
4997    0
4998    0
4999    0
5000    1
Name: gender, Length: 5001, dtype: int64

In [8]:
# Separate the target vector from the feature matrix
# (the features are every column except the target).
y = df["gender"]
X = df.drop("gender", axis=1)

In [9]:
# Display the feature matrix (all columns except `gender`).
X

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long
0,1,11.8,6.1,1,0,1,1
1,0,14.0,5.4,0,0,1,0
2,0,11.8,6.3,1,1,1,1
3,0,14.4,6.1,0,1,1,1
4,1,13.5,5.9,0,0,0,0
...,...,...,...,...,...,...,...
4996,1,13.6,5.1,0,0,0,0
4997,1,11.9,5.4,0,0,0,0
4998,1,12.9,5.7,0,0,0,0
4999,1,13.2,6.2,0,0,0,0


In [10]:
# Display the target vector (the encoded `gender` column).
y

0       1
1       0
2       1
3       1
4       0
       ..
4996    0
4997    0
4998    0
4999    0
5000    1
Name: gender, Length: 5001, dtype: int64

In [11]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [12]:
# Report the (rows, columns) shape of each partition, one per line.
print(
    f"Number of Training Dataset : {X_train.shape}",
    f"Number of Test Dataset : {X_test.shape}",
    sep="\n",
)

Number of Training Dataset : (4000, 7)
Number of Test Dataset : (1001, 7)


In [13]:
# Standardise the features. `scaled` is the scaler object itself: it is fitted
# on the training partition only, and the same fitted scaler is then applied to
# the test partition so no test statistics influence the transformation.
scaled = StandardScaler()
X_train_scaled, X_test_scaled = (
    scaled.fit_transform(X_train),  # evaluated first: fits, then transforms
    scaled.transform(X_test),       # evaluated second: reuses the fitted scaler
)

In [14]:
from sklearn.svm import SVC

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score

In [15]:
# Linear-kernel support vector classifier trained on the standardised features.
# `fit` returns the estimator, so the fitted model is bound directly and then
# shown as the cell output.
SVM_Model = SVC(C=1.0, kernel='linear').fit(X_train_scaled, y_train)
SVM_Model


In [16]:
# Predict labels for the held-out partition and score them against the truth.
SVM_prediction = SVM_Model.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_true=y_test, y_pred=SVM_prediction)

In [17]:
# Show the SVM's accuracy on the test partition.
svm_accuracy

0.961038961038961

In [19]:
# Gaussian Naive Bayes trained on the standardised features. `fit` returns the
# estimator, so the fitted model is bound directly and shown as the cell output.
GaussianNB_Model = GaussianNB().fit(X_train_scaled, y_train)
GaussianNB_Model

In [20]:
# Predict labels for the held-out partition and score them against the truth.
GaussianNB_prediction = GaussianNB_Model.predict(X_test_scaled)
GaussianNB_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=GaussianNB_prediction,
)

In [21]:
# Show the Gaussian Naive Bayes accuracy on the test partition.
GaussianNB_accuracy

0.9690309690309691

In [22]:
# Bernoulli Naive Bayes trained on the standardised features.
# NOTE(review): BernoulliNB binarises its inputs (scikit-learn's default
# threshold is 0.0), so on standardised data each feature is reduced to
# "above / below its training mean" — confirm that is intended for the two
# continuous forehead measurements.
BernoulliNB_Model = BernoulliNB().fit(X_train_scaled, y_train)
BernoulliNB_Model

In [23]:
# Predict labels for the held-out partition and score them against the truth.
BernoulliNB_prediction = BernoulliNB_Model.predict(X_test_scaled)
BernoulliNB_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=BernoulliNB_prediction,
)

In [24]:
# Show the Bernoulli Naive Bayes accuracy on the test partition.
BernoulliNB_accuracy

0.955044955044955

# Multinomial Naive Bayes — Handling Negative Feature Values

## Overview
`Multinomial Naive Bayes (MultinomialNB)` is a probabilistic classification algorithm designed specifically for **discrete, non-negative feature values** such as counts or frequencies.  
A common mistake is to train it on features that have passed through a **scaling step that introduces negative values** — for example `StandardScaler`, which centres each feature on zero — which raises a runtime error during model training.

This document explains:
- Why the error occurs
- The mathematical assumptions behind `MultinomialNB`
- Correct and incorrect preprocessing techniques
- Best practices for professional machine learning pipelines

---

## Error Description

### Raised Exception
```text
ValueError: Negative values in data passed to MultinomialNB (input X)
```


In [25]:
# MultinomialNB only accepts non-negative features. Fitting it on X_train_scaled
# fails with "ValueError: Negative values in data passed to MultinomialNB (input X)."
# because StandardScaler centres every column on zero. The unscaled training
# features (0/1 flags and centimetre measurements) are used instead; the
# assertion makes the non-negativity requirement explicit and fails early with a
# clear message if the data ever violates it.
assert (X_train >= 0).all().all(), "MultinomialNB needs non-negative features"

# Any later prediction with this model must use X_test (not X_test_scaled) so the
# inputs match the feature space the model was trained on.
MultinomialNB_Model = MultinomialNB()
MultinomialNB_Model.fit(X_train, y_train)

In [31]:
# Collect the test accuracies into one comparison table:
# one row per model, a single 'Accuracy Score' column.
results_df = pd.DataFrame(
    data=[svm_accuracy, GaussianNB_accuracy, BernoulliNB_accuracy],
    index=['SVM Model', 'GaussianNB Model', 'BernoulliNB Model'],
    columns=['Accuracy Score'],
)

In [32]:
# Display the accuracy comparison table.
results_df

Unnamed: 0,Accuracy Score
SVM Model,0.961039
GaussianNB Model,0.969031
BernoulliNB Model,0.955045
