# Thiago de Sousa - 374204 e Gabriel Gomes - 374178

## 1)

In [22]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [23]:
class NaiveBayes:
    def __init__(self):
        ''' Default Constructor '''
        self.lEncoder = LabelEncoder()
        self.X = None; self.y = None
        self.classProb = None; self.likeTable = {}
    
    def separateByClass(self):
        ''' This functions separates all the dataset indexing dictionaries by the classes '''
        separated = {}
        for i in range(len(self.y)):
            if (self.y[i] not in separated):
                separated[self.y[i]] = []
            separated[self.y[i]].append(self.X[i])
        return separated
    
    def makeLikeTable(self):
        ''' This functions counts the occurences of each attribute based on the classes
        and construct the Likelihood table (in this case a Dictionary) calculating all
        the propers probabilities '''
        sepClass = self.separateByClass()
        classSizes = [len(sepClass[i]) for i in sepClass.keys()] 
        self.classProb = np.array(classSizes) / sum(classSizes)
        
        self.likeTable = {}
        for label in sepClass.keys():
            aux = np.column_stack(sepClass[label])
            for attribute,idx in zip(aux, range(len(aux))):
                counts = np.asarray(np.unique(attribute, return_counts=True)).T
                for i in range(4):
                    self.likeTable[(label, idx, i)] = 0
                for count_it in counts:
                    self.likeTable[(label, idx, count_it[0])] = count_it[1] / len(sepClass[label])

    def calculateProbability(self, inputVector):
        ''' Utilizes the maximum likelihood estimation to calculate the probabilty of
        each row in inputVector belongs to each possible class '''
        sepClass = self.separateByClass()
        
        probabilities = {}
        for label,_ in sepClass.items():
            probabilities[label] = self.classProb[label]
            for i in range(len(inputVector)):
                probabilities[label] *= self.likeTable[label, i, inputVector[i]]
                
        return probabilities

    def fit(self, X_train, y_train):
        ''' Assign the training data and calls the Likelihood Table creator '''
        self.X = X_train
        self.y = y_train
        
        self.makeLikeTable()
    
    def predict(self, inputArray):
        ''' Return a list of predictions for each row in inputArray correspondent to
        the label of the class with the maximum probability '''
        predictions = []
        for row in inputArray:
            probabilities = self.calculateProbability(row)
            predictions.append(max(probabilities, key=probabilities.get))
        return predictions

## 2)

In [24]:
data = pd.read_csv("carData.csv", header=None)


for i in range(0, data.shape[1]):
    data.iloc[:,i] = LabelEncoder().fit_transform(data.iloc[:,i])

X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.2)

In [25]:
clf = MultinomialNB()
clf.fit(X_train.values, y_train.values)

y_pred = clf.predict(X_test.values)


print("Multinomial Naive Bayes (Versão do Sklearn)")
print("Acurácia Total:: {}%".format(accuracy_score(y_true=y_test, y_pred=y_pred)))

print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=["unacc", "acc", "good", "vgood"]))

Multinomial Naive Bayes (Versão do Sklearn)
Acurácia Total:: 0.7109826589595376%

Classification Report:
             precision    recall  f1-score   support

      unacc       1.00      0.04      0.08        75
        acc       0.00      0.00      0.00        16
       good       0.71      1.00      0.83       243
      vgood       0.00      0.00      0.00        12

avg / total       0.71      0.71      0.60       346



  'precision', 'predicted', average, warn_for)


## 3)

In [26]:
nBayes = NaiveBayes()
nBayes.fit(X_train.values, y_train.values)
y_pred = nBayes.predict(X_test.values)

print("Naive Bayes (Nossa Versão)")
print("Acurácia Total: {}%".format(accuracy_score(y_true=y_test, y_pred=y_pred)))

print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=["unacc", "acc", "good", "vgood"]))

Naive Bayes (Nossa Versão)
Acurácia Total: 0.838150289017341%

Classification Report:
             precision    recall  f1-score   support

      unacc       0.63      0.68      0.65        75
        acc       0.50      0.19      0.27        16
       good       0.92      0.95      0.93       243
      vgood       0.75      0.50      0.60        12

avg / total       0.83      0.84      0.83       346

