In [4]:
import pandas as pd
import numpy as np

from get_samples import split_data
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import naivebayes

X_train, X_test, y_train, y_test = split_data()
scaler_minmax = MinMaxScaler()
scaler_standard = StandardScaler()
X_train_minmax = scaler_minmax.fit_transform(X_train)
X_test_minmax = scaler_minmax.fit_transform(X_test)
X_train_standard = scaler_standard.fit_transform(X_train)
X_test_standard = scaler_standard.fit_transform(X_test)

print("Running models with outliers present: ")

# Run gaussian naive bayes
y_pred = naivebayes.run_naive_bayes(X_train, X_test, y_train, "gaussian")
y_pred_minmax = naivebayes.run_naive_bayes(X_train_minmax, X_test_minmax, y_train, "gaussian")
y_pred_standard = naivebayes.run_naive_bayes(X_train_standard, X_test_standard, y_train, "gaussian")
acc = naivebayes.accuracy_scorer(y_test, y_pred)
acc_minmax = naivebayes.accuracy_scorer(y_test, y_pred_minmax)
acc_standard = naivebayes.accuracy_scorer(y_test, y_pred_standard)
print(f"Accuracy Gaussian Naive Bayes: {acc}")
print(f"Accuracy Gaussian Naive Bayes min max normalized: {acc_minmax}")
print(f"Accuracy Gaussian Naive Bayes standardized: {acc_standard}")

# Run complement naive bayes

y_pred = naivebayes.run_naive_bayes(X_train, X_test, y_train, "complement")
y_pred_minmax = naivebayes.run_naive_bayes(X_train_minmax, X_test_minmax, y_train, "complement")
acc = naivebayes.accuracy_scorer(y_test, y_pred)
acc_minmax = naivebayes.accuracy_scorer(y_test, y_pred_minmax)
print(f"Accuracy Complement Naive Bayes: {acc}")
print(f"Accuracy Complement Naive Bayes min max normalized: {acc_minmax}")

# Run multinomial naive bayes

y_pred = naivebayes.run_naive_bayes(X_train, X_test, y_train, "multinomial")
y_pred_minmax = naivebayes.run_naive_bayes(X_train_minmax, X_test_minmax, y_train, "multinomial")
acc = naivebayes.accuracy_scorer(y_test, y_pred)
acc_minmax = naivebayes.accuracy_scorer(y_test, y_pred_minmax)
print(f"Accuracy Multinomial Naive Bayes: {acc}")
print(f"Accuracy Multinomial Naive Bayes min max normalized: {acc_minmax}")

# Run bernoulli naive bayes

y_pred = naivebayes.run_naive_bayes(X_train, X_test, y_train, "bernoulli")
y_pred_minmax = naivebayes.run_naive_bayes(X_train_minmax, X_test_minmax, y_train, "bernoulli")
y_pred_standard = naivebayes.run_naive_bayes(X_train_standard, X_test_standard, y_train, "bernoulli")
acc = naivebayes.accuracy_scorer(y_test, y_pred)
acc_minmax = naivebayes.accuracy_scorer(y_test, y_pred_minmax)
acc_standard = naivebayes.accuracy_scorer(y_test, y_pred_standard)
print(f"Accuracy Bernoulli Naive Bayes: {acc}")
print(f"Accuracy Bernoulli Naive Bayes min max normalized: {acc_minmax}")
print(f"Accuracy Bernoulli Naive Bayes standardized: {acc_standard}\n")

# Remove outliers
train_set = np.concatenate([X_train, y_train[:,None]], axis=1) # Add y_train array as last column of X_train 2d array
df = pd.DataFrame(train_set)                                   # Create pandas dataframe of features and labels
length = len(train_set[0]) - 1
for i in range(length):                                        # For each feature in the set, remove each row that has 
    df[i] = df[i].astype(int)                                  # at least one feature that is below the 1-percentile or
    q_u = df[i].quantile(0.99)                                 # it is above the 99th percentile
    q_l = df[i].quantile(0.01)
    df = df[df[i] <= q_u]
    df = df[df[i] >= q_l]

X_train = df.iloc[:, :-1].to_numpy()
y_train = df.iloc[:, -1].to_numpy()
X_train_minmax = scaler_minmax.fit_transform(X_train)
X_test_minmax = scaler_minmax.fit_transform(X_test)
X_train_standard = scaler_standard.fit_transform(X_train)
X_test_standard = scaler_standard.fit_transform(X_test)

print("Running models without outliers present: ")

# Run gaussian naive bayes
y_pred = naivebayes.run_naive_bayes(X_train, X_test, y_train, "gaussian")
y_pred_minmax = naivebayes.run_naive_bayes(X_train_minmax, X_test_minmax, y_train, "gaussian")
y_pred_standard = naivebayes.run_naive_bayes(X_train_standard, X_test_standard, y_train, "gaussian")
acc = naivebayes.accuracy_scorer(y_test, y_pred)
acc_minmax = naivebayes.accuracy_scorer(y_test, y_pred_minmax)
acc_standard = naivebayes.accuracy_scorer(y_test, y_pred_standard)
print(f"Accuracy Gaussian Naive Bayes: {acc}")
print(f"Accuracy Gaussian Naive Bayes min max normalized: {acc_minmax}")
print(f"Accuracy Gaussian Naive Bayes standardized: {acc_standard}")

# Run complement naive bayes without outliers

y_pred = naivebayes.run_naive_bayes(X_train, X_test, y_train, "complement")
y_pred_minmax = naivebayes.run_naive_bayes(X_train_minmax, X_test_minmax, y_train, "complement")
acc = naivebayes.accuracy_scorer(y_test, y_pred)
acc_minmax = naivebayes.accuracy_scorer(y_test, y_pred_minmax)
print(f"Accuracy Complement Naive Bayes: {acc}")
print(f"Accuracy Complement Naive Bayes min max normalized: {acc_minmax}")

# Run multinomial naive bayes without outliers

y_pred = naivebayes.run_naive_bayes(X_train, X_test, y_train, "multinomial")
y_pred_minmax = naivebayes.run_naive_bayes(X_train_minmax, X_test_minmax, y_train, "multinomial")
acc = naivebayes.accuracy_scorer(y_test, y_pred)
acc_minmax = naivebayes.accuracy_scorer(y_test, y_pred_minmax)
print(f"Accuracy Multinomial Naive Bayes: {acc}")
print(f"Accuracy Multinomial Naive Bayes min max normalized: {acc_minmax}")

# Run bernoulli naive bayes without outliers

y_pred = naivebayes.run_naive_bayes(X_train, X_test, y_train, "bernoulli")
y_pred_minmax = naivebayes.run_naive_bayes(X_train_minmax, X_test_minmax, y_train, "bernoulli")
y_pred_standard = naivebayes.run_naive_bayes(X_train_standard, X_test_standard, y_train, "bernoulli")
acc = naivebayes.accuracy_scorer(y_test, y_pred)
acc_minmax = naivebayes.accuracy_scorer(y_test, y_pred_minmax)
acc_standard = naivebayes.accuracy_scorer(y_test, y_pred_standard)
print(f"Accuracy Bernoulli Naive Bayes: {acc}")
print(f"Accuracy Bernoulli Naive Bayes min max normalized: {acc_minmax}")
print(f"Accuracy Bernoulli Naive Bayes standardized: {acc_standard}")

Running models with outliers present: 
Accuracy Gaussian Naive Bayes: 0.6336571607646506
Accuracy Gaussian Naive Bayes min max normalized: 0.6217486681291131
Accuracy Gaussian Naive Bayes standardized: 0.5988718270134754
Accuracy Complement Naive Bayes: 0.4779066123472266
Accuracy Complement Naive Bayes min max normalized: 0.4882481980570354
Accuracy Multinomial Naive Bayes: 0.6396114070824193
Accuracy Multinomial Naive Bayes min max normalized: 0.594797869006581
Accuracy Bernoulli Naive Bayes: 0.22250078345346286
Accuracy Bernoulli Naive Bayes min max normalized: 0.23252898777812597
Accuracy Bernoulli Naive Bayes standardized: 0.6126606079598872

Running models without outliers present: 
Accuracy Gaussian Naive Bayes: 0.6452522720150423
Accuracy Gaussian Naive Bayes min max normalized: 0.6223754308994046
Accuracy Gaussian Naive Bayes standardized: 0.6126606079598872
Accuracy Complement Naive Bayes: 0.5578188655593858
Accuracy Complement Naive Bayes min max normalized: 0.55656534001880