<a href="https://colab.research.google.com/github/Mayank0875/Breed-Classifier/blob/main/Breed_Classifier_using_Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

from scipy.special import erfinv
from scipy.stats import norm, binom

import matplotlib.pyplot as plt
import seaborn as sns

# *Data Gernator using distribution*

In [2]:
def uniform_gernator(a, b, num_values):
  array = np.random.uniform(a, b, num_values)
  return array

In [3]:
def normal_gernator(mu, sigma, num_values):
  y = uniform_gernator(0, 1, num_values)

  x = mu + sigma * np.sqrt(2) * erfinv(2*y - 1)
  return x

In [4]:
def binomial_gernator(n, p, num_values):
  y = uniform_gernator(0, 1, num_values)

  x = binom.ppf(y, n, p)
  return x

# *Gernating Data*

In [5]:
features = ['height', 'weight', 'bark_days', 'ear_head_ratio']

In [6]:
from dataclasses import dataclass


@dataclass
class gaussian_params:
  mu: float
  sigma: float

  def __repr__(self):
    return f'gaussian_parms mu: {self.mu} and sigma: {self.sigma}'

@dataclass
class binomial_params:
  n: int
  p: float

  def __repr__(self):
    return f'binomial_parms n: {self.n} and p: {self.p}'

@dataclass
class uniform_params:
  a: float
  b: float

  def __repr__(self):
    return f'uniform_parms a: {self.a} and b: {self.b}'

In [7]:
breed_params = {
    0: {
        "height": gaussian_params(mu=35, sigma=1.5),
        "weight": gaussian_params(mu=20, sigma=1),
        "bark_days": binomial_params(n=30, p=0.8),
        "ear_head_ratio": uniform_params(a=0.6, b=0.1)
    },

    1: {
        "height": gaussian_params(mu=30, sigma=2),
        "weight": gaussian_params(mu=25, sigma=5),
        "bark_days": binomial_params(n=30, p=0.5),
        "ear_head_ratio": uniform_params(a=0.2, b=0.5)
    },

    2: {
        "height": gaussian_params(mu=40, sigma=3.5),
        "weight": gaussian_params(mu=32, sigma=3),
        "bark_days": binomial_params(n=30, p=0.3),
        "ear_head_ratio": uniform_params(a=0.1, b=0.3)
    }

}

In [8]:
def gernate_data_from_breed(breed, features, params, num_values):
  df = pd.DataFrame()

  for feature in features:
    match feature:
      case 'height' | 'weight':
        df[feature] = normal_gernator(mu=params[breed][feature].mu, sigma=params[breed][feature].sigma, num_values=num_values)
      case 'bark_days':
        df[feature] = binomial_gernator(n=params[breed][feature].n, p=params[breed][feature].p, num_values=num_values)
      case 'ear_head_ratio':
        df[feature] = uniform_gernator(a=params[breed][feature].a, b=params[breed][feature].b, num_values=num_values)
  df['breed'] = breed
  return df

In [9]:
df_0 = gernate_data_from_breed(0, features, breed_params, 931)
df_1 = gernate_data_from_breed(1, features, breed_params, 1000)
df_2 = gernate_data_from_breed(2, features, breed_params, 1129)

In [10]:
df = pd.concat([df_0, df_1, df_2]).reset_index(drop=True)

In [11]:
df.sample(frac = 1)

Unnamed: 0,height,weight,bark_days,ear_head_ratio,breed
104,36.531079,18.797919,21.0,0.361325,0
2571,39.709758,36.035831,6.0,0.114298,2
740,35.720512,20.385218,22.0,0.508097,0
1353,28.179801,20.929590,13.0,0.398313,1
46,35.982859,20.830890,23.0,0.407183,0
...,...,...,...,...,...
1292,29.409362,25.249165,16.0,0.267811,1
1018,31.205767,31.543100,12.0,0.444061,1
1638,31.474033,24.696042,17.0,0.453162,1
2853,42.316235,31.274056,12.0,0.233978,2


In [12]:
split = int(len(df) * 0.7)
X_train = df[ : split]
y_train = df[split : ]

In [13]:
X_train.shape, y_train.shape

((2142, 5), (918, 5))

# *PDF For distribution*

In [14]:
def uniform_pdf(x, a, b):
  return 0 if(x < a or x > b) else 1 / (b - a)

In [15]:
def normal_pdf(x, mu, sigma):
  coff = 1.0 / (np.sqrt(2 * np.pi))
  expo = -0.5 * ((x - mu) / (2 * sigma))**2
  pdf = coff * np.exp(expo)
  return pdf


In [16]:
def binomial_pdf(x, n, p):
  return binom.pmf(x, n, p)

# *Estimating Parameters on Training Data*

In [17]:
def estimate_params(X_train, feature):
  probs_dict = {}
  params_dict = {}

  for breed in X_train['breed'].unique():
    probs_dict[breed] = len(X_train[X_train['breed'] == breed]) / len(X_train)

    inner_dict = {}
    for value in feature:
      match value:
        case 'height' | 'weight':
          mu = X_train[X_train['breed'] == breed][value].mean()
          sigma = X_train[X_train['breed'] == breed][value].std()
          params = gaussian_params(mu, sigma)
        case 'bark_days':
          n = X_train[X_train['breed'] == breed][value].max()
          p = X_train[X_train['breed'] == breed][value].mean() / n  # don't known how it come
          params = binomial_params(n, p)
        case 'ear_head_ratio':
          a = X_train[X_train['breed'] == breed][value].min()
          b = X_train[X_train['breed'] == breed][value].max()
          params = uniform_params(a, b)

      inner_dict[value] = params

    params_dict[breed] = inner_dict

  return probs_dict, params_dict


In [18]:
# estimating prob and params

train_probs, train_params = estimate_params(X_train, features)
print(train_probs)
print(train_params)

{np.int64(0): 0.434640522875817, np.int64(1): 0.4668534080298786, np.int64(2): 0.09850606909430439}
{np.int64(0): {'height': gaussian_parms mu: 35.018734370082875 and sigma: 1.4906380511919468, 'weight': gaussian_parms mu: 20.01270553038079 and sigma: 0.9910302838018684, 'bark_days': binomial_parms n: 30.0 and p: 0.799140708915145, 'ear_head_ratio': uniform_parms a: 0.10022711661139905 and b: 0.5995462407739962}, np.int64(1): {'height': gaussian_parms mu: 29.895857508727204 and sigma: 1.9711437625677672, 'weight': gaussian_parms mu: 24.798622278122266 and sigma: 4.992060886265897, 'bark_days': binomial_parms n: 25.0 and p: 0.60056, 'ear_head_ratio': uniform_parms a: 0.20000667798579871 and b: 0.4997650943762456}, np.int64(2): {'height': gaussian_parms mu: 39.90188319356052 and sigma: 3.3616030004040733, 'weight': gaussian_parms mu: 32.034404059772285 and sigma: 2.991907715867889, 'bark_days': binomial_parms n: 16.0 and p: 0.5633886255924171, 'ear_head_ratio': uniform_parms a: 0.1004806

# *Predict Probablity For Featue*

In [19]:
def prob_of_x_given_breed(X, breed, feature, params):
  probablity = 1

  for x,value in zip(X,feature):
    match value:
      case 'height' | 'weight':
        probablity *= normal_pdf(x, params[breed][value].mu, params[breed][value].sigma)
      case 'bark_days':
        probablity *=  binomial_pdf(x, params[breed][value].n, params[breed][value].p)
      case 'ear_head_ratio':
        probablity *= uniform_pdf(x, params[breed][value].a, params[breed][value].b)
  return probablity

In [20]:
# tesing with sigle featue

X = y_train[features].iloc[0]

prob_with_0 = prob_of_x_given_breed(X, 0, features, train_params)
prob_with_1 = prob_of_x_given_breed(X, 1, features, train_params)
prob_with_2 = prob_of_x_given_breed(X, 2, features, train_params)

print(prob_with_0)
print(prob_with_1)
print(prob_with_2)


9.687101541613507e-11
0.0
0.005121724970215788


In [21]:
def predict_breed(x, features, params):
  prob_with_0 = prob_of_x_given_breed(x, 0, features, params)
  prob_with_1 = prob_of_x_given_breed(x, 1, features, params)
  prob_with_2 = prob_of_x_given_breed(x, 2, features, params)

  max_prob = np.array([prob_with_0, prob_with_1, prob_with_2])
  breed =  np.argmax(max_prob)
  return breed

In [22]:
predict_breed(X, features, train_params)

np.int64(2)

In [23]:
def accuracy_score(y_true, y_pred):
  return np.sum(y_true == y_pred) / len(y_true)

In [24]:
preds = y_train.apply(lambda x: predict_breed([*x[features]], features, train_params), axis=1)
test_acc = accuracy_score(y_train["breed"], preds)

print(f"Accuracy score for the test split: {test_acc:.2f}")

Accuracy score for the test split: 0.97
