In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the data
data = pd.read_csv('mushrooms.csv')

# Preprocess the data
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
X = pd.get_dummies(X, columns=X.columns)
X = X.values
y = np.array([1 if label == 'p' else 0 for label in y])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Naive Bayes classifier
class_counts = np.bincount(y_train)
priors = class_counts / float(len(y_train))
likelihoods = np.zeros((2, X.shape[1], 2))
for i in range(X.shape[1]):
    for j in range(2):
        feature_counts = np.bincount(X_train[y_train == j, i], minlength=2)
        likelihoods[j, i, :] = feature_counts / float(class_counts[j])

# Test the classifier on training data
y_train_pred = np.zeros_like(y_train)
for i in range(len(y_train)):
    p_edible = priors[0]
    p_poisonous = priors[1]
    for j in range(X.shape[1]):
        p_edible *= likelihoods[0, j, X_train[i, j]]
        p_poisonous *= likelihoods[1, j, X_train[i, j]]
    if p_edible > p_poisonous:
        y_train_pred[i] = 0
    else:
        y_train_pred[i] = 1
train_accuracy = np.mean(y_train_pred == y_train)
print('Training accuracy:', train_accuracy)

# Test the classifier on testing data
y_test_pred = np.zeros_like(y_test)
for i in range(len(y_test)):
    p_edible = priors[0]
    p_poisonous = priors[1]
    for j in range(X.shape[1]):
        p_edible *= likelihoods[0, j, X_test[i, j]]
        p_poisonous *= likelihoods[1, j, X_test[i, j]]
    if p_edible > p_poisonous:
        y_test_pred[i] = 0
    else:
        y_test_pred[i] = 1
test_accuracy = np.mean(y_test_pred == y_test)
print('Testing accuracy:', test_accuracy)



Training accuracy: 0.9943068164332974
Testing accuracy: 0.9938461538461538


In [2]:


# define a function to tokenize a string
def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    return tokens

# apply the function to the 'text' column using apply()
df['tokens'] = df['text'].apply(tokenize_text)

# print the

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [1]:
import pandas as pd
import numpy as np

# create a function to split the data into training and testing sets
def split_data(data, train_frac):
    np.random.shuffle(data)
    train_size = int(train_frac * len(data))
    train_data = data[:train_size]
    test_data = data[train_size:]
    return train_data, test_data

# create a function to calculate probabilities
def calculate_probabilities(data):
    classes, class_counts = np.unique(data[:, -1], return_counts=True)
    n = len(data)
    probabilities = {}
    for c in classes:
        probabilities[c] = {}
        class_data = data[data[:, -1] == c]
        class_n = len(class_data)
        for i in range(data.shape[1] - 1):
            attr_values, attr_counts = np.unique(class_data[:, i], return_counts=True)
            for j, v in enumerate(attr_values):
                probabilities[c][(i, v)] = (attr_counts[j] + 1) / (class_n + len(attr_values))
        probabilities[c]['class_prob'] = class_n / n
    return probabilities

# create a function to predict the class
def predict_class(d, probs):
    class_probs = {}
    for c in probs:
        class_probs[c] = probs[c]['class_prob']
        for i, v in enumerate(d[:-1]):
            if (i, v) in probs[c]:
                class_probs[c] *= probs[c][(i, v)]
            else:
                class_probs[c] *= 1 / (len(np.unique(training_data[:, i])) + 1)
    return max(class_probs, key=class_probs.get)

# load the data
data = pd.read_csv('data_csv.csv').values

# split the data
training_data, testing_data = split_data(data, 0.8)

# calculate probabilities
probabilities = calculate_probabilities(training_data)

# make predictions
predictions = []
for d in testing_data:
    c = predict_class(d, probabilities)
    predictions.append(c)

# calculate accuracy
accuracy = sum(predictions == testing_data[:, -1]) / len(testing_data)
print('Accuracy:', accuracy)


Accuracy: 0.5513846153846154
