<a href="https://colab.research.google.com/github/NandaAbhilash/CN6005/blob/main/CN6005Week7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import numpy as np
import pandas as pd


In [12]:
def accuracy_score(y_true, y_pred):
    """Return the percentage of predictions that equal the true labels.

    Both inputs are converted to NumPy arrays and compared position by
    position, so lists, tuples, NumPy arrays and pandas Series (whatever
    their index) can be mixed freely.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels; must have the same shape as ``y_true``.

    Returns
    -------
    float
        Accuracy in percent (0-100), rounded to two decimal places.

    Raises
    ------
    ValueError
        If the two inputs differ in shape or contain no elements.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        # Accuracy is undefined without samples; fail with a clear message
        # instead of an incidental division-by-zero.
        raise ValueError("accuracy_score requires at least one sample")

    n_correct = int(np.count_nonzero(y_pred == y_true))
    return round(float(n_correct) / float(y_true.size) * 100, 2)

def pre_processing(df):
    """Split a DataFrame into a feature frame and a target column.

    The last column of ``df`` is treated as the target; every other column
    is kept as a feature. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose final column holds the labels.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without its last column and ``y``
        is that last column.
    """
    target_column = df.columns[-1]
    features = df.drop(columns=[target_column])
    target = df[target_column]
    return features, target


In [13]:
class NaiveBayes:
    """Naive Bayes classifier for categorical features, built on frequency tables.

    After ``fit`` the instance exposes:

    * ``class_priors[outcome]``            -- P(outcome)
    * ``likelihoods[feature]["<value>_<outcome>"]`` -- P(value | outcome)
    * ``pred_priors[feature][value]``      -- P(value)
    * ``features``                         -- feature names, in column order

    No smoothing is applied: a (value, outcome) pair never seen together in
    the training data has likelihood 0.
    """

    def __init__(self):
        self.likelihoods = {}
        self.class_priors = {}
        self.pred_priors = {}
        self.features = []

    @staticmethod
    def _likelihood_key(feat_val, outcome):
        """Build the ``"<value>_<outcome>"`` key used inside ``likelihoods``.

        Formatting (rather than ``+`` concatenation) keeps the key identical
        for strings while also accepting non-string categories and labels.
        """
        return f"{feat_val}_{outcome}"

    def fit(self, X, y):
        """Estimate all probability tables from the training data.

        Parameters
        ----------
        X : pandas.DataFrame
            One categorical feature per column.
        y : pandas.Series or array-like
            Class label for each row of ``X``, matched to ``X`` by position.

        Calling ``fit`` again discards everything learned by earlier calls.
        """
        self.features = list(X.columns)
        self.X_train = X
        self.y_train = y
        self.train_size = X.shape[0]

        # Start from empty tables so a refit never keeps stale classes,
        # features or feature values from a previous training set.
        self.likelihoods = {}
        self.class_priors = {}
        self.pred_priors = {}

        outcomes = np.unique(self.y_train)
        for outcome in outcomes:
            self.class_priors[outcome] = 0

        # Pre-populate every (value, outcome) combination with 0 so that
        # combinations absent from the training data still have an entry.
        for feature in self.features:
            self.likelihoods[feature] = {}
            self.pred_priors[feature] = {}
            for feat_val in np.unique(self.X_train[feature]):
                self.pred_priors[feature][feat_val] = 0
                for outcome in outcomes:
                    self.likelihoods[feature][self._likelihood_key(feat_val, outcome)] = 0

        self._calc_class_prior()
        self._calc_likelihoods()
        self._calc_predictor_prior()

    def _calc_class_prior(self):
        """Fill ``class_priors`` with the relative frequency of each class."""
        labels = np.asarray(self.y_train)
        for outcome in np.unique(labels):
            outcome_count = int(np.count_nonzero(labels == outcome))
            self.class_priors[outcome] = outcome_count / self.train_size

    def _calc_likelihoods(self):
        """Fill ``likelihoods`` with P(feature value | class)."""
        labels = np.asarray(self.y_train)
        for outcome in np.unique(labels):
            # Positional boolean mask: selecting rows by index label would
            # over-count when labels repeat and fail when X and y carry
            # different indexes.
            in_class = labels == outcome
            outcome_count = int(np.count_nonzero(in_class))
            for feature in self.features:
                value_counts = self.X_train[feature][in_class].value_counts().to_dict()
                for feat_val, count in value_counts.items():
                    key = self._likelihood_key(feat_val, outcome)
                    self.likelihoods[feature][key] = count / outcome_count

    def _calc_predictor_prior(self):
        """Fill ``pred_priors`` with the relative frequency of each feature value."""
        for feature in self.features:
            feat_vals = self.X_train[feature].value_counts().to_dict()
            for feat_val, count in feat_vals.items():
                self.pred_priors[feature][feat_val] = count / self.train_size

    def predict(self, X):
        """Predict the most probable class for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature values in the same column order as the training data.

        Returns
        -------
        numpy.ndarray
            One predicted label per row. Ties go to the class that comes
            first in ``np.unique`` order of the training labels.

        Raises
        ------
        KeyError
            If a row contains a feature value that did not occur in the
            training data for that feature.
        """
        results = []
        outcomes = np.unique(self.y_train)
        for query in np.array(X):
            probs_outcome = {}
            for outcome in outcomes:
                prior = self.class_priors[outcome]
                likelihood = 1
                evidence = 1
                for feat, feat_val in zip(self.features, query):
                    if feat_val not in self.pred_priors[feat]:
                        raise KeyError(
                            f"value '{feat_val}' of feature '{feat}' "
                            f"was not seen during fit"
                        )
                    likelihood *= self.likelihoods[feat][self._likelihood_key(feat_val, outcome)]
                    evidence *= self.pred_priors[feat][feat_val]
                # The evidence term is identical for every class; it is kept
                # so the stored scores are proper posteriors.
                probs_outcome[outcome] = (likelihood * prior) / evidence
            results.append(max(probs_outcome, key=probs_outcome.get))
        return np.array(results)


In [14]:
# Weather dataset (14 rows) defined inline so the notebook needs no external file.
# "Windy" is kept as the strings "True"/"False", matching the string queries below.
weather_data = {
    "Outlook":    ["Sunny","Sunny","Overcast","Rainy","Rainy","Rainy","Overcast","Sunny","Sunny","Rainy","Sunny","Overcast","Overcast","Rainy"],
    "Temperature":["Hot","Hot","Hot","Mild","Cool","Cool","Cool","Mild","Cool","Mild","Mild","Mild","Hot","Mild"],
    "Humidity":   ["High","High","High","High","Normal","Normal","Normal","High","Normal","Normal","Normal","High","Normal","High"],
    "Windy":      ["False","True","False","False","False","True","True","False","False","False","True","True","False","True"],
    "Play":       ["No","No","Yes","Yes","Yes","No","Yes","No","Yes","Yes","Yes","Yes","Yes","No"]
}
df_weather = pd.DataFrame(weather_data)

# The last column ("Play") becomes the target; the rest are features.
X, y = pre_processing(df_weather)
nb_clf = NaiveBayes()
nb_clf.fit(X, y)

# Accuracy on the data the model was trained on.
print("Train Accuracy:", accuracy_score(y, nb_clf.predict(X)))

# Queries: one row each, columns in the order Outlook, Temperature, Humidity, Windy.
query1 = np.array([['Rainy','Mild','Normal','True']])
query2 = np.array([['Overcast','Cool','Normal','True']])
query3 = np.array([['Sunny','Hot','High','True']])

for label, query in (("Query 1:", query1), ("Query 2:", query2), ("Query 3:", query3)):
    print(label, query, "--->", nb_clf.predict(query))


Train Accuracy: 92.86
Query 1: [['Rainy' 'Mild' 'Normal' 'True']] ---> ['Yes']
Query 2: [['Overcast' 'Cool' 'Normal' 'True']] ---> ['Yes']
Query 3: [['Sunny' 'Hot' 'High' 'True']] ---> ['No']
