In [45]:
# Lab 7: Naive Bayes Implementation
# Local environment setup - no Google Colab needed
print("Starting Naive Bayes lab in local environment")

Starting Naive Bayes lab in local environment


In [46]:
import os
# Working in local environment - current directory should be fine
print(f"Current working directory: {os.getcwd()}")

Current working directory: /home/krishom/College/AI-ML/Labs/lab7


In [47]:
!ls

naive_bayes.ipynb  Naive_Bayes_New.ipynb


In [70]:
import pandas as pd
import numpy as np
file_name = "https://docs.google.com/spreadsheets/d/1vetsYXIuqYsb9MUHJvL5PbVbnGZdeAtSzZvXqZqetAU"+ '/export?gid=0&format=xlsx'
df = pd.read_excel(file_name,index_col=0, sheet_name="example-1",engine='openpyxl')
print(df)

      Outlook Temperature Humidity    Wind Play Tennis
D1      Sunny         Hot     High    Weak          No
D2      Sunny         Hot     High  Strong          No
D3   Overcast         Hot     High    Weak         Yes
D4       Rain        Mild     High    Weak         Yes
D5       Rain        Cool   Normal    Weak         Yes
D6       Rain        Cool   Normal  Strong          No
D7   Overcast        Cool   Normal  Strong         Yes
D8      Sunny        Mild     High    Weak          No
D9      Sunny        Cool   Normal    Weak         Yes
D10      Rain        Mild   Normal    Weak         Yes
D11     Sunny        Mild   Normal  Strong         Yes
D12  Overcast        Mild     High  Strong         Yes
D13  Overcast         Hot   Normal    Weak         Yes
D14      Rain        Mild     High  Strong          No


In [49]:
X, y=df.drop(["Play Tennis"],axis=1),df["Play Tennis"]
print(X.shape)
print(y)

(14, 4)
D1      No
D2      No
D3     Yes
D4     Yes
D5     Yes
D6      No
D7     Yes
D8      No
D9     Yes
D10    Yes
D11    Yes
D12    Yes
D13    Yes
D14     No
Name: Play Tennis, dtype: object


In [50]:
def accuracy_score(y_true, y_pred):

	"""	score = (y_true - y_pred) / len(y_true) """

	return round(float(sum(y_pred == y_true))/float(len(y_true)) * 100 ,2)

In [51]:
class  NaiveBayes:

	"""
		Bayes Theorem:
										Likelihood * Class prior probability
				Posterior Probability = -------------------------------------
											Predictor prior probability

							  			 P(x|c) * p(c)
							   P(c|x) = ------------------
											  P(x)
	"""

	def __init__(self):

		"""
			Attributes:
				likelihoods: Likelihood of each feature per class
				class_priors: Prior probabilities of classes
				pred_priors: Prior probabilities of features
				features: All features of dataset
		"""
		self.features = list
		self.likelihoods = {}
		self.class_priors = {}
		self.pred_priors = {}

		self.X_train = np.array
		self.y_train = np.array
		self.train_size = int
		self.num_feats = int

	def fit(self, X, y):

		self.features = list(X.columns)
		self.X_train = X
		self.y_train = y
		self.train_size = X.shape[0]
		self.num_feats = X.shape[1]

		for feature in self.features:
			self.likelihoods[feature] = {}
			self.pred_priors[feature] = {}

			for feat_val in np.unique(self.X_train[feature]):
				self.pred_priors[feature].update({feat_val: 0})

				for outcome in np.unique(self.y_train):
					self.likelihoods[feature].update({feat_val+'_'+outcome:0})
					self.class_priors.update({outcome: 0})

		self._calc_class_prior()
		self._calc_likelihoods()
		self._calc_predictor_prior()

	def _calc_class_prior(self):

		""" P(c) - Prior Class Probability """

		for outcome in np.unique(self.y_train):
			outcome_count = sum(self.y_train == outcome)
			self.class_priors[outcome] = outcome_count / self.train_size

	def _calc_likelihoods(self):

		""" P(x|c) - Likelihood """

		for feature in self.features:

			for outcome in np.unique(self.y_train):
				outcome_count = sum(self.y_train == outcome)
				feat_likelihood = self.X_train[feature][self.y_train[self.y_train == outcome].index.values.tolist()].value_counts().to_dict()

				for feat_val, count in feat_likelihood.items():
					self.likelihoods[feature][feat_val + '_' + outcome] = count/outcome_count


	def _calc_predictor_prior(self):

		""" P(x) - Evidence """

		for feature in self.features:
			feat_vals = self.X_train[feature].value_counts().to_dict()

			for feat_val, count in feat_vals.items():
				self.pred_priors[feature][feat_val] = count/self.train_size


	def predict(self, X):

		""" Calculates Posterior probability P(c|x) """

		results = []
		X = np.array(X)

		for query in X:
			probs_outcome = {}
			for outcome in np.unique(self.y_train):
				prior = self.class_priors[outcome]
				likelihood = 1
				evidence = 1

				for feat, feat_val in zip(self.features, query):
					likelihood *= self.likelihoods[feat][feat_val + '_' + outcome]
					evidence *= self.pred_priors[feat][feat_val]

				# posterior = (likelihood * prior) / (evidence)
				posterior = (likelihood * prior)

				probs_outcome[outcome] = posterior

			result = max(probs_outcome, key = lambda x: probs_outcome[x])
			print(probs_outcome)
			results.append(result)

		return np.array(results)


In [71]:
nb_clf = NaiveBayes()
nb_clf.fit(X, y)

print("Train Accuracy: {}".format(accuracy_score(y, nb_clf.predict(X))))

#Query 1:
# Outlook Temperature Humidity    Wind Play Tennis
query = np.array([['Sunny','Cool','High','Strong']])
print("Query:- {} ---> {}".format(query, nb_clf.predict(query)))


UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('int64'), dtype('<U1')) -> None

In [53]:
nb_clf.pred_priors

{'Outlook': {'Overcast': 0.2857142857142857,
  'Rain': 0.35714285714285715,
  'Sunny': 0.35714285714285715},
 'Temperature': {'Cool': 0.2857142857142857,
  'Hot': 0.2857142857142857,
  'Mild': 0.42857142857142855},
 'Humidity': {'High': 0.5, 'Normal': 0.5},
 'Wind': {'Strong': 0.42857142857142855, 'Weak': 0.5714285714285714}}

In [54]:
nb_clf.class_priors

{'No': 0.35714285714285715, 'Yes': 0.6428571428571429}

In [55]:
nb_clf.likelihoods

{'Outlook': {'Overcast_No': 0,
  'Overcast_Yes': 0.4444444444444444,
  'Rain_No': 0.4,
  'Rain_Yes': 0.3333333333333333,
  'Sunny_No': 0.6,
  'Sunny_Yes': 0.2222222222222222},
 'Temperature': {'Cool_No': 0.2,
  'Cool_Yes': 0.3333333333333333,
  'Hot_No': 0.4,
  'Hot_Yes': 0.2222222222222222,
  'Mild_No': 0.4,
  'Mild_Yes': 0.4444444444444444},
 'Humidity': {'High_No': 0.8,
  'High_Yes': 0.3333333333333333,
  'Normal_No': 0.2,
  'Normal_Yes': 0.6666666666666666},
 'Wind': {'Strong_No': 0.6,
  'Strong_Yes': 0.3333333333333333,
  'Weak_No': 0.4,
  'Weak_Yes': 0.6666666666666666}}

In [56]:
list(y.unique())

['No', 'Yes']

In [57]:
df_new = df

In [58]:
df_new = df_new.replace(["Sunny","Hot","Strong","High","Yes"], 1)
df_new = df_new.replace(["Rain","Cool","Normal","Weak","No"], 0)
df_new = df_new.replace(["Overcast","Mild"], 2)
df_new

  df_new = df_new.replace(["Rain","Cool","Normal","Weak","No"], 0)
  df_new = df_new.replace(["Overcast","Mild"], 2)


Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
D1,1,1,1,0,0
D2,1,1,1,1,0
D3,2,1,1,0,1
D4,0,2,1,0,1
D5,0,0,0,0,1
D6,0,0,0,1,0
D7,2,0,0,1,1
D8,1,2,1,0,0
D9,1,0,0,0,1
D10,0,2,0,0,1


In [59]:
X, y=df_new.drop(["Play Tennis"],axis=1),df_new["Play Tennis"]
print(X.shape)
print(y)

(14, 4)
D1     0
D2     0
D3     1
D4     1
D5     1
D6     0
D7     1
D8     0
D9     1
D10    1
D11    1
D12    1
D13    1
D14    0
Name: Play Tennis, dtype: int64


In [60]:
import numpy as np

class NaiveBayesClassifier:
    def __init__(self):
        self.priors = None
        self.conditional_probs = None

    def fit(self, X, y):
        """
        Fits the Naive Bayes classifier to the training data.

        Args:
            X: A numpy array of shape (n_samples, n_features) representing the features.
            y: A numpy array of shape (n_samples,) representing the target labels.
        """

        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # Calculate priors
        self.priors = np.zeros(n_classes)
        for i in range(n_classes):

            self.priors[i] = np.sum(y == i) / n_samples

        # Calculate conditional probabilities
        self.conditional_probs = np.zeros((n_classes, n_features))
        for i in range(n_classes):
            X_class = X[y == i]
            for j in range(n_features):
                print(np.mean(X_class),y,i)
                # self.conditional_probs[i, j] = np.mean(X_class[:, j])
                print("hello")

    def predict(self, X):
        """
        Predicts the class labels for new data.

        Args:
            X: A numpy array of shape (n_samples, n_features) representing the new data.

        Returns:
            A numpy array of shape (n_samples,) representing the predicted class labels.
        """

        n_samples, n_features = X.shape
        n_classes = len(self.priors)

        # Calculate posterior probabilities
        posteriors = np.zeros((n_samples, n_classes))
        for i in range(n_classes):
            for j in range(n_samples):
                posteriors[j, i] = np.log(self.priors[i]) + np.sum(
                    np.log(self.conditional_probs[i, :]) * X[j, :]
                )

        # Predict the class with the highest posterior probability
        return np.argmax(posteriors, axis=1)

In [61]:
clf = NaiveBayesClassifier()
clf.fit(X, y)

# Predict on new data
# y_pred = clf.predict(X_test)

0.8 D1     0
D2     0
D3     1
D4     1
D5     1
D6     0
D7     1
D8     0
D9     1
D10    1
D11    1
D12    1
D13    1
D14    0
Name: Play Tennis, dtype: int64 0
hello
0.8 D1     0
D2     0
D3     1
D4     1
D5     1
D6     0
D7     1
D8     0
D9     1
D10    1
D11    1
D12    1
D13    1
D14    0
Name: Play Tennis, dtype: int64 0
hello
0.8 D1     0
D2     0
D3     1
D4     1
D5     1
D6     0
D7     1
D8     0
D9     1
D10    1
D11    1
D12    1
D13    1
D14    0
Name: Play Tennis, dtype: int64 0
hello
0.8 D1     0
D2     0
D3     1
D4     1
D5     1
D6     0
D7     1
D8     0
D9     1
D10    1
D11    1
D12    1
D13    1
D14    0
Name: Play Tennis, dtype: int64 0
hello
0.7222222222222222 D1     0
D2     0
D3     1
D4     1
D5     1
D6     0
D7     1
D8     0
D9     1
D10    1
D11    1
D12    1
D13    1
D14    0
Name: Play Tennis, dtype: int64 1
hello
0.7222222222222222 D1     0
D2     0
D3     1
D4     1
D5     1
D6     0
D7     1
D8     0
D9     1
D10    1
D11    1
D12    1
D13    1

In [62]:
import numpy as np

class NaiveBayesClassifier:
    def __init__(self):
        self.priors = None
        self.conditional_probs = None

    def fit(self, X, y):
        """
        Fits the Naive Bayes classifier to the training data.

        Args:
            X: A numpy array of shape (n_samples, n_features) representing the features.
            y: A numpy array of shape (n_samples,) representing the target labels.
        """

        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # Calculate priors
        self.priors = np.zeros(n_classes)
        for i in range(n_classes):
            self.priors[i] = np.sum(y == i) / n_samples

        # Calculate conditional probabilities
        self.conditional_probs = np.zeros((n_classes, n_features))
        for i in range(n_classes):
            X_class = X[y == i]
            for j in range(n_features):
                self.conditional_probs[i, j] = np.mean(X_class[:, j])

    def predict(self, X):
        """
        Predicts the class labels for new data.

        Args:
            X: A numpy array of shape (n_samples, n_features) representing the new data.

        Returns:
            A numpy array of shape (n_samples,) representing the predicted class labels.
        """

        n_samples, n_features = X.shape
        n_classes = len(self.priors)

        # Calculate posterior probabilities
        posteriors = np.zeros((n_samples, n_classes))
        for i in range(n_classes):
            for j in range(n_samples):
                posteriors[j, i] = np.log(self.priors[i]) + np.sum(
                    np.log(self.conditional_probs[i, :]) * X[j, :]
                )

        # Predict the class with the highest posterior probability
        return np.argmax(posteriors, axis=1)

# Example 2: Student Admission Prediction

Let's create another example using a Student Admission dataset to predict whether a student gets admitted based on their academic profile.

In [72]:
# Create a Student Admission Dataset
# Similar structure to tennis dataset but for academic admission prediction
data_admission = {
    'GPA': ['High', 'High', 'Medium', 'Low', 'High', 'Medium', 'Low', 'High', 
            'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium', 'High',
            'Low', 'High', 'Medium', 'Low'],
    'Test_Score': ['High', 'Medium', 'High', 'Low', 'High', 'Medium', 'Low', 'High',
                   'High', 'Medium', 'Low', 'High', 'High', 'Low', 'Medium', 'High',
                   'Medium', 'High', 'Medium', 'Low'],
    'Extracurricular': ['Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes',
                        'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes',
                        'No', 'Yes', 'Yes', 'No'],
    'Recommendation': ['Strong', 'Strong', 'Average', 'Weak', 'Strong', 'Average', 'Weak', 'Strong',
                       'Average', 'Strong', 'Weak', 'Average', 'Strong', 'Weak', 'Average', 'Strong',
                       'Average', 'Strong', 'Average', 'Weak'],
    'Admitted': ['Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes',
                 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'Yes',
                 'No', 'Yes', 'Yes', 'No']
}

df_admission = pd.DataFrame(data_admission)
print(df_admission)

       GPA Test_Score Extracurricular Recommendation Admitted
0     High       High             Yes         Strong      Yes
1     High     Medium             Yes         Strong      Yes
2   Medium       High              No        Average      Yes
3      Low        Low              No           Weak       No
4     High       High             Yes         Strong      Yes
5   Medium     Medium             Yes        Average      Yes
6      Low        Low              No           Weak       No
7     High       High             Yes         Strong      Yes
8   Medium       High              No        Average      Yes
9     High     Medium             Yes         Strong      Yes
10     Low        Low              No           Weak       No
11  Medium       High              No        Average       No
12    High       High             Yes         Strong      Yes
13     Low        Low             Yes           Weak       No
14  Medium     Medium              No        Average       No
15    Hi

In [64]:
X_admission, y_admission = df_admission.drop(["Admitted"], axis=1), df_admission["Admitted"]
print(X_admission.shape)
print(y_admission)

(20, 4)
0     Yes
1     Yes
2     Yes
3      No
4     Yes
5     Yes
6      No
7     Yes
8     Yes
9     Yes
10     No
11     No
12    Yes
13     No
14     No
15    Yes
16     No
17    Yes
18    Yes
19     No
Name: Admitted, dtype: object


In [73]:
nb_admission = NaiveBayes()
nb_admission.fit(X_admission, y_admission)

print("Train Accuracy: {}".format(accuracy_score(y_admission, nb_admission.predict(X_admission))))

#Query 1:
# GPA Test_Score Extracurricular Recommendation Admitted
query = np.array([['High','High','Yes','Strong']])
print("Query:- {} ---> {}".format(query, nb_admission.predict(query)))

{'No': 0.0, 'Yes': 0.14814814814814814}
{'No': 0.0, 'Yes': 0.07407407407407407}
{'No': 0.0041015625, 'Yes': 0.007407407407407407}
{'No': 0.1025390625, 'Yes': 0.0}
{'No': 0.0, 'Yes': 0.14814814814814814}
{'No': 0.0011718750000000002, 'Yes': 0.018518518518518517}
{'No': 0.1025390625, 'Yes': 0.0}
{'No': 0.0, 'Yes': 0.14814814814814814}
{'No': 0.0041015625, 'Yes': 0.007407407407407407}
{'No': 0.0, 'Yes': 0.07407407407407407}
{'No': 0.1025390625, 'Yes': 0.0}
{'No': 0.0041015625, 'Yes': 0.007407407407407407}
{'No': 0.0, 'Yes': 0.14814814814814814}
{'No': 0.0146484375, 'Yes': 0.0}
{'No': 0.008203125, 'Yes': 0.0037037037037037034}
{'No': 0.0, 'Yes': 0.14814814814814814}
{'No': 0.024609375000000003, 'Yes': 0.0}
{'No': 0.0, 'Yes': 0.14814814814814814}
{'No': 0.0011718750000000002, 'Yes': 0.018518518518518517}
{'No': 0.1025390625, 'Yes': 0.0}
Train Accuracy: 95.0
{'No': 0.0, 'Yes': 0.14814814814814814}
Query:- [['High' 'High' 'Yes' 'Strong']] ---> ['Yes']


In [66]:
nb_admission.pred_priors

{'GPA': {'High': 0.4, 'Low': 0.3, 'Medium': 0.3},
 'Test_Score': {'High': 0.45, 'Low': 0.25, 'Medium': 0.3},
 'Extracurricular': {'No': 0.45, 'Yes': 0.55},
 'Recommendation': {'Average': 0.35, 'Strong': 0.4, 'Weak': 0.25}}

In [67]:
nb_admission.class_priors

{'No': 0.4, 'Yes': 0.6}

In [68]:
nb_admission.likelihoods

{'GPA': {'High_No': 0,
  'High_Yes': 0.6666666666666666,
  'Low_No': 0.75,
  'Low_Yes': 0,
  'Medium_No': 0.25,
  'Medium_Yes': 0.3333333333333333},
 'Test_Score': {'High_No': 0.125,
  'High_Yes': 0.6666666666666666,
  'Low_No': 0.625,
  'Low_Yes': 0,
  'Medium_No': 0.25,
  'Medium_Yes': 0.3333333333333333},
 'Extracurricular': {'No_No': 0.875,
  'No_Yes': 0.16666666666666666,
  'Yes_No': 0.125,
  'Yes_Yes': 0.8333333333333334},
 'Recommendation': {'Average_No': 0.375,
  'Average_Yes': 0.3333333333333333,
  'Strong_No': 0,
  'Strong_Yes': 0.6666666666666666,
  'Weak_No': 0.625,
  'Weak_Yes': 0}}

In [69]:
list(y_admission.unique())

['Yes', 'No']