# Naive Bayes Modelling

# Author - Kevin Abraham

This notebook uses the Car Evaluation dataset, which evaluates cars according to the following six input
attributes: buying, maint, doors, persons, lug_boot, safety

### Import required python packages

In [138]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
%matplotlib notebook

### 1. Load dataset & use Gaussian Naive Bayes for model prediction

In [313]:
# Location of the car-evaluation data file. Set the CAR_DATA_PATH environment
# variable to run the notebook on another machine; otherwise the original
# local path is used unchanged.
path = os.environ.get(
    'CAR_DATA_PATH',
    'C:/Users/Owner/Desktop/BDA Exam 102 Fall/BDA Exam 102 Fall/NaiveBayes/car.data',
)

In [314]:
# The file is read without a header row, so the seven column names are
# supplied here: six input attributes followed by the class label.
features = [
    'buying', 'maint', 'doors', 'persons',
    'lug_boot', 'safety', 'car_evaluation',
]
cardata = pd.read_csv(path, sep=',', header=None, names=features)


In [315]:
# Preview the first five rows of the raw (string-valued) data.
cardata.head(n=5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,car_evaluation
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


### Analyze and prepare data for model prediction

In [316]:
# Show the dtype pandas assigned to each column on load.
cardata.dtypes

buying            object
maint             object
doors             object
persons           object
lug_boot          object
safety            object
car_evaluation    object
dtype: object

In [317]:
# Class distribution of the target column. Uses the Series method because the
# top-level pd.value_counts function is deprecated in recent pandas releases.
cardata['car_evaluation'].value_counts()

unacc    1209
acc       384
good       67
vgood      64
Name: car_evaluation, dtype: int64

### Check for Missing values (if any)

In [318]:
# Per-column count of missing values, restricted to the columns that contain
# at least one; the result is empty when nothing is missing.
null_columns = cardata.columns[cardata.isna().any()]
cardata.loc[:, null_columns].isna().sum()

Series([], dtype: float64)

### Convert categorical to numerical values using 'cat.codes'

#### * Alternatively, we could use LabelEncoder() or get_dummies()

In [319]:
# Cast every column to the pandas 'category' dtype in a single call.
cardata = cardata.astype({name: 'category' for name in cardata.columns})

In [320]:
# Print the frame summary to confirm the dtype conversion.
cardata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1724 entries, 0 to 1723
Data columns (total 7 columns):
buying            1724 non-null category
maint             1724 non-null category
doors             1724 non-null category
persons           1724 non-null category
lug_boot          1724 non-null category
safety            1724 non-null category
car_evaluation    1724 non-null category
dtypes: category(7)
memory usage: 12.9 KB


In [321]:
# Replace each categorical column with its integer category codes.
cardata = pd.DataFrame(
    {name: cardata[name].cat.codes for name in cardata.columns},
    columns=cardata.columns,
)

cardata.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,car_evaluation
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2


In [322]:
# Preview the last five rows of the encoded data.
cardata.tail(n=5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,car_evaluation
1719,1,1,3,2,2,1,2
1720,1,1,3,2,2,2,0
1721,1,1,3,2,0,1,2
1722,1,1,3,2,0,2,1
1723,1,1,3,2,0,0,3


In [323]:
# Number of rows in the dataset.
cardata.shape[0]

1724

### Build Gaussian Naive Bayes model for predicting 'car_evaluation' target 

Categorical-to-Numeric Codes: 

- **buying**:       vhigh (3), high (0), med (2), low (1).
- **maint**:        vhigh (3), high (0), med (2), low (1).
- **doors**:        2 (0), 3 (1), 4 (2), 5more (3).
- **persons**:      2 (0), 4 (1), more (2).
- **lug_boot**:     small (2), med (1), big (0).
- **safety**:       low (1), med (2), high (0).
- **car_evaluation**: unacc (2), acc (0), good (1), vgood (3)

In [324]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn import model_selection
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB 

### Split Train/Test 80-20

In [325]:
# Separate the six input attributes (X) from the class label (y).
y = cardata['car_evaluation']
X = cardata[[name for name in cardata.columns if name != 'car_evaluation']]

In [326]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=50,
)

### Gaussian Naive Bayes

In [327]:
# Gaussian Naive Bayes classifier with default settings.
algorithm_a = GaussianNB()

In [328]:
# Fit on the training split, then predict the held-out rows
# (fit returns the estimator itself, so the calls can be chained).
prediction = algorithm_a.fit(X_train, y_train).predict(X_test)

In [329]:
# Number of rows in the held-out test split.
X_test.shape[0]

345

### GaussianNB model accuracy score:

In [330]:
# Fraction of held-out rows whose predicted class matches the true class.
accuracy_score(y_true=y_test, y_pred=prediction)

0.6695652173913044

### Accuracy using cross_validation:

In [335]:
# 15-fold cross-validated scores of the classifier on the full dataset.
results = model_selection.cross_val_score(
    estimator=algorithm_a,
    X=X,
    y=y,
    cv=15,
)

In [336]:
# Mean score across the cross-validation folds.
print(np.mean(results))

0.6125298458100199


## 2. Evaluating test cases 1, 2, 3, 4

Categorical-to-Numeric Codes: 

- **buying**:       vhigh (3), high (0), med (2), low (1).
- **maint**:        vhigh (3), high (0), med (2), low (1).
- **doors**:        2 (0), 3 (1), 4 (2), 5more (3).
- **persons**:      2 (0), 4 (1), more (2).
- **lug_boot**:     small (2), med (1), big (0).
- **safety**:       low (1), med (2), high (0).
- **car_evaluation**: unacc (2), acc (0), good (1), vgood (3)

#### Converted each of the 4 cases to its respective numerical codes and saved them in a new .data file called **'test_cases.data'**

In [337]:
# Distinct encoded class labels, in order of first appearance.
cardata['car_evaluation'].unique()

array([2, 0, 3, 1], dtype=int64)

In [338]:
# Location of the hand-encoded test-case file. Set the CAR_TEST_CASES_PATH
# environment variable to run the notebook on another machine; otherwise the
# original local path is used unchanged.
test_path = os.environ.get(
    'CAR_TEST_CASES_PATH',
    'C:/Users/Owner/Desktop/BDA Exam 102 Fall/BDA Exam 102 Fall/NaiveBayes/test_cases.data',
)

In [339]:
# The test-case file is read without a header row; six column names are
# supplied (the input attributes only, no class label).
features = [
    'buying', 'maint', 'doors',
    'persons', 'lug_boot', 'safety',
]
cardata_test = pd.read_csv(test_path, sep=',', header=None, names=features)


In [340]:
# Display the encoded test cases.
cardata_test

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,1,1,3,2,2,0
1,1,1,3,2,1,1
2,1,1,3,2,1,0
3,1,1,3,2,1,0


### Apply the fitted Gaussian model to the test cases

In [341]:
# Predict the encoded class of each test case with the already-fitted model.
prediction_new = algorithm_a.predict(X=cardata_test)

In [342]:
# Show the predicted class codes for the test cases.
prediction_new

array([3, 0, 3, 3], dtype=int8)

### Results of Model Predictions:
#### For the given case criteria:

- **Case1**: 3 -> Vgood
- **Case2**: 0 -> Acc
- **Case3**: 3 -> Vgood
- **Case4**: 3 -> Vgood

1- Provide a model based on Gaussian Naive bayes and calculate the accuracy using cross validation.<br>
2- What is the class attribute(evaluation) for the following cases?

In [344]:
# Raw attribute values of the four cases, in the order
# buying, maint, doors, persons, lug_boot, safety.
# The trailing comment on each line is the class the model predicted.
Case1 = ["low", "low", "5more", "more", "small", "high"]  # vgood
Case2 = ["low", "low", "5more", "more", "med", "low"]     # acc
Case3 = ["low", "low", "5more", "more", "med", "high"]    # vgood
Case4 = ["low", "low", "5more", "more", "med", "high"]    # vgood