In [53]:
## SKLEARN
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB, MultinomialNB
from sklearn import metrics

import numpy as np

np.set_printoptions(suppress=True)
import pandas as pd
from platform import python_version

import random

# works, sort of only. Possible additional commas that shouldn't be there.
%load_ext nb_black

print("Libs imported. Python version is: " + python_version())

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black
Libs imported. Python version is: 3.9.7


<IPython.core.display.Javascript object>

In [54]:
# utility functions


def load_dataset(x):
    return {
        "a": load_car(),
        "b": load_mushroom(),
        "c": load_audiology(),
        "d": load_wine(),
    }[x]


"""
https://archive.ics.uci.edu/ml/datasets/car+evaluation
0-5 -> data
6 -> labels
"""


def load_car():
    df_car = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data",
        header=None,
    )
    X = df_car.loc[:, :5].values
    y = df_car.loc[:, 6].values
    return (X, y)


"""
https://archive.ics.uci.edu/ml/datasets/mushroom
1-22 -> data
0 -> labels
"""


def load_mushroom():
    df_mushroom = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
        header=None,
    )
    X = df_mushroom.loc[:, 1:].values
    y = df_mushroom.loc[:, 0].values
    return (X, y)


"""
https://archive.ics.uci.edu/ml/datasets/Audiology+%28Standardized%29
0:length-2 -> data
length-1 unique id (p1-p200)
length -> labels
"""


def load_audiology():
    df_audiology = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/audiology/audiology.standardized.data",
        header=None,
    )
    length = len(df_audiology.columns)
    X = df_audiology.loc[:, : length - 3].values
    y = df_audiology.loc[:, length - 1].values
    return (X, y)


"""
https://www.alldatascience.com/classification/wine-dataset-analysis-with-python/
1:length -> data
0 -> labels
"""


def load_wine():
    df_wine = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
        header=None,
    )
    length = len(df_wine.columns)
    X = df_wine.loc[:, 1:].values
    y = df_wine.loc[:, 0].values
    return (X, y)

<IPython.core.display.Javascript object>

In [55]:
dataset = load_dataset("a")  # CAR
# dataset = load_dataset("b")  # MUSHROOM
# dataset = load_dataset("c")  # AUDIOLOGY
# dataset = load_dataset("d") # WINE

# Extract to X and y
X_cat = dataset[0]
y_cat = dataset[1]

print("Sizes of X and y respectively:")
print(np.shape(X_cat))
print(np.shape(y_cat))

Sizes of X and y respectively:
(1728, 6)
(1728,)


<IPython.core.display.Javascript object>

In [56]:
"""
Old code when I tried to encode data as well not just labels
def encode_array(arg_array, shape_to_resize):
    result = []
    for column in arg_array.transpose():
        result.append(encoder.fit_transform(column))
    result = np.reshape(result, shape_to_resize)
    return result.transpose()


def decode_array(arg_array):
    shape_to_resize = np.shape(arg_array.transpose())
    result = []
    for column in arg_array.transpose():
        result.append(encoder.inverse_transform(column))
    result = np.reshape(result, shape_to_resize)
    return result.transpose()


# Make a full dictionary of categories per column and encode
encoder = preprocessing.LabelEncoder()
X = []

# Encoding the data
X = encode_array(X_cat, np.shape(X_cat.transpose()))

# Encoding the labels
y = encoder.fit_transform(y_cat)
# Check if decoding works
decoded = decode_array(X)
print(decoded)
print("meanwhile X_cat: ")
print(X_cat)
print(np.array_equiv(decoded, X_cat))
print(np.array_equiv(encoder.inverse_transform(y), y_cat))
"""
encoder = preprocessing.LabelEncoder()
# Encoding the labels
y = encoder.fit_transform(y_cat)
# Check if decoding works
print(
    "Encode-decode equivalence check: "
    + str(np.array_equiv(encoder.inverse_transform(y), y_cat))
)

Encode-decode equivalence check: True


<IPython.core.display.Javascript object>

In [58]:
max_seed_val = 2 ** 32 - 1

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=random.randrange(0, max_seed_val)
)
print("Data has been split.")

Data has been split.


<IPython.core.display.Javascript object>

In [59]:
# Create a Gaussian Classifier
gnb = MultinomialNB()

# Train the model using the training sets
gnb.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = gnb.predict(X_test)

<IPython.core.display.Javascript object>

In [60]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6736111111111112


<IPython.core.display.Javascript object>

In [None]:
"""
links:

https://dev.to/codinghappinessweb/analysing-dataset-using-naive-bayes-classifier-3d7o
https://towardsdatascience.com/15-tips-and-tricks-for-jupyter-notebook-that-will-ease-your-coding-experience-e469207ac95c
https://towardsdatascience.com/guide-to-encoding-categorical-features-using-scikit-learn-for-machine-learning-5048997a5c79
https://towardsdatascience.com/naive-bayes-classifier-how-to-successfully-use-it-in-python-ecf76a995069
https://scikit-learn.org/stable/modules/naive_bayes.html#multinomial-naive-bayes
"""