In [None]:
from read_data import get_training, get_test, get_sparse
from processing import exclude_non_numeric
from utils import create_csv_output
import numpy as np
import pandas as pd
import scipy
import pickle
from scipy.sparse import csr_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, StratifiedShuffleSplit
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
# Seed NumPy's global RNG once, up front, so later cells that draw from it
# behave the same on every Restart & Run All.
RANDOM_SEED = 30027
np.random.seed(RANDOM_SEED)

# Apply seaborn's theme with a white figure background.
figure_rc = {"figure.facecolor": "white"}
sns.set(rc=figure_rc)

In [None]:
from sklearn.naive_bayes import GaussianNB, ComplementNB

## Train data

In [None]:
# Load the labelled training data.
train_X, train_y = get_training()

# Numeric columns: keep only what exclude_non_numeric returns, apply log1p,
# and store the result as a CSR matrix so it can be stacked with the rest.
train_log1p = FunctionTransformer(np.log1p)
train_numeric_logged = train_log1p.fit_transform(exclude_non_numeric(train_X))
train_X_numeric = csr_matrix(train_numeric_logged)

# Pre-built sparse blocks (presumably name / ingredient / step features,
# judging by the variable names -- TODO confirm against get_sparse).
train_name, train_ingr, train_steps = get_sparse(data="train")

# Column-wise concatenation of every feature block into one CSR matrix.
train_feature_blocks = (train_X_numeric, train_name, train_ingr, train_steps)
train_sparse = scipy.sparse.hstack(train_feature_blocks, format="csr")

In [None]:
# Re-encode the target from (1.0, 2.0, 3.0) to the integer classes (0, 1, 2).
zero_based_target = train_y - 1
y = zero_based_target.astype(int)

# Hold out 20% of the rows for evaluation. No random_state is passed here, so
# the shuffle presumably relies on the global NumPy seed set in the first cell.
split_parts = train_test_split(train_sparse, y, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

## Test data

In [None]:
# Load the unlabelled test data.
test_X = get_test()

# Same numeric preprocessing as the training cell: numeric columns only,
# log1p applied, stored as CSR.
test_log1p = FunctionTransformer(np.log1p)
test_numeric_logged = test_log1p.fit_transform(exclude_non_numeric(test_X))
test_X_numeric = csr_matrix(test_numeric_logged)

# Pre-built sparse blocks for the test rows (presumably name / ingredient /
# step features, judging by the variable names -- TODO confirm).
test_name, test_ingr, test_steps = get_sparse(data="test")

# Stack the blocks in the same order as the training matrix.
test_feature_blocks = (test_X_numeric, test_name, test_ingr, test_steps)
test_sparse = scipy.sparse.hstack(test_feature_blocks, format="csr")

## ComplementNB

In [None]:
%%time
partial_CNB = ComplementNB().fit(X_train, y_train)
pickle.dump(partial_CNB, open("report_models/partial/CNB.sav", "wb"))
# CPU times: user 20.3 ms, sys: 3.07 ms, total: 23.4 ms
# Wall time: 22.1 ms


In [None]:
# Five stratified shuffle splits, each holding out 20% of the rows it is given.
# NOTE(review): the splits are drawn from the hold-out data (X_test / y_test),
# not from X_train -- confirm that cross-validating on the hold-out set is
# what the report intends.
cnb_cv_splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2)
cross_val_score(partial_CNB, X_test, y_test, cv=cnb_cv_splitter)
# Output recorded from an earlier run:
# array([0.724375, 0.7325  , 0.76625 , 0.729375, 0.716875])

### Analysis

In [None]:
# Predictions and score of the partial model on the hold-out split.
CNB_pred = partial_CNB.predict(X_test)
CNB_score = partial_CNB.score(X_test, y_test)

# Confusion matrix of actual vs. predicted classes; normalize='true' requests
# normalisation over the true labels (see the sklearn confusion_matrix docs).
CNB_cm = confusion_matrix(y_true=y_test, y_pred=CNB_pred, normalize='true')

In [None]:
# Draw the normalised confusion matrix as an annotated heatmap and save it.
fig, ax = plt.subplots(figsize=(5, 5))

# Tick labels use the original 1.0 / 2.0 / 3.0 target encoding on both axes.
class_labels = [1.0, 2.0, 3.0]
sns.heatmap(
    CNB_cm,
    annot=True,
    square=True,
    xticklabels=class_labels,
    yticklabels=class_labels,
    cmap="Blues_r",
    ax=ax,
)

# The hold-out accuracy is appended underneath the x-axis label.
predicted_axis_text = 'Predicted label' + '\n\nAccuracy Score: {0}'.format(CNB_score)
ax.set_ylabel('Actual label')
ax.set_xlabel(predicted_axis_text)
ax.set_title('ComplementNB Confusion Matrix', size=13)

fig.savefig("report_pics/NaiveBayes/CNB_cm.png")

### Submission model

In [None]:
%%time
CNB = ComplementNB().fit(train_sparse, y)
pickle.dump(CNB, open("report_models/submission/CNB.sav", "wb"))
# CPU times: user 1min 46s, sys: 2.25 s, total: 1min 48s
# Wall time: 1min 51s

In [None]:
# Undo the earlier "- 1" label shift so predictions are back in the original
# float encoding, then hand them to the project's CSV writer.
submission_labels = (CNB.predict(test_sparse) + 1).astype(float)
create_csv_output("report_submission/CNB", submission_labels)