In [30]:
import pickle

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.naive_bayes import MultinomialNB, ComplementNB, GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, FunctionTransformer

from processing import  combine_with_vec, exclude_non_numeric
from read_data import get_training, get_Doc2Vec
# Seed NumPy's legacy global RNG once, up front. The train_test_split and
# ShuffleSplit calls in this notebook pass no random_state, so (per
# scikit-learn's documented default) they draw from this global state and are
# only reproducible when the cells are run top to bottom from a fresh kernel.
np.random.seed(30027)

In [12]:
# Training data: features X and labels y, plus the three Doc2Vec outputs
# (name / ingredients / steps, going by the variable names) requested at
# 50 and at 100 features.
X, y = get_training()
(
    train_name_vec50,
    train_ingr_vec50,
    train_steps_vec50,
) = get_Doc2Vec(data="train", num_features=50)
(
    train_name_vec100,
    train_ingr_vec100,
    train_steps_vec100,
) = get_Doc2Vec(data="train", num_features=100)

In [13]:
# Build the 50-feature design matrix: the training data after
# exclude_non_numeric (presumably leaving only the numeric columns) combined
# with the three 50-feature Doc2Vec outputs.
# Copies are handed to the helpers so the loaded originals are not modified.
temp_train_name_vec50 = train_name_vec50.copy()
temp_train_ingr_vec50 = train_ingr_vec50.copy()
temp_train_steps_vec50 = train_steps_vec50.copy()
temp_X = exclude_non_numeric(X.copy())
X_combined_50: pd.DataFrame = combine_with_vec(
    temp_X,
    temp_train_name_vec50,
    temp_train_ingr_vec50,
    temp_train_steps_vec50,
)

In [14]:
# Hold out 20% of the rows for evaluation. No random_state is passed, so the
# shuffle depends on the state of NumPy's global RNG (seeded in the imports
# cell) at the moment this cell runs.
X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(
    X_combined_50,
    y,
    test_size=0.2,
)

In [15]:
# Build the 100-feature design matrix: the training data after
# exclude_non_numeric (presumably leaving only the numeric columns) combined
# with the three 100-feature Doc2Vec outputs.
# Copies are handed to the helpers so the loaded originals are not modified.
temp_train_name_vec100 = train_name_vec100.copy()
temp_train_ingr_vec100 = train_ingr_vec100.copy()
temp_train_steps_vec100 = train_steps_vec100.copy()
temp_X = exclude_non_numeric(X.copy())
X_combined_100: pd.DataFrame = combine_with_vec(
    temp_X,
    temp_train_name_vec100,
    temp_train_ingr_vec100,
    temp_train_steps_vec100,
)

In [16]:
# Hold out 20% of the rows for evaluation (no random_state, so the shuffle
# depends on the global NumPy RNG state when this cell runs).
# NOTE(review): this is a separate shuffle from the 50-feature split, so the
# held-out rows are not guaranteed to be the same ones; pass a shared
# random_state to both calls if the two feature sets should be compared on
# identical rows.
X_train_100, X_test_100, y_train_100, y_test_100 = train_test_split(
    X_combined_100,
    y,
    test_size=0.2,
)

## Naïve Bayes Classifier pipelines

In [5]:
# GaussianNB pipeline, in three stages:
#   1. np.log1p on the "n_ingredients" and "n_steps" columns; every other
#      column is passed through unchanged.
#   2. StandardScaler over the resulting columns.
#   3. GaussianNB classifier.
GaussianNB_pipeline = make_pipeline(
    make_column_transformer(
        (FunctionTransformer(np.log1p), ["n_ingredients", "n_steps"]),
        remainder='passthrough',
    ),
    StandardScaler(),
    GaussianNB(),
)

In [37]:
# ComplementNB pipeline, in three stages:
#   1. np.log1p on the "n_ingredients" and "n_steps" columns; every other
#      column is passed through unchanged.
#   2. MinMaxScaler over the resulting columns (presumably chosen over
#      StandardScaler because ComplementNB rejects negative inputs when
#      fitting -- confirm).
#   3. ComplementNB classifier.
ComplementNB_pipeline = make_pipeline(
    make_column_transformer(
        (FunctionTransformer(np.log1p), ["n_ingredients", "n_steps"]),
        remainder='passthrough',
    ),
    MinMaxScaler(),
    ComplementNB(),
)

## GaussianNB with Doc2Vec 50 features

In [6]:
%%time
partial_GuassianNB_50Doc2Vec_model = GaussianNB_pipeline.fit(X_train_50, y_train_50)

CPU times: user 202 ms, sys: 75.2 ms, total: 277 ms
Wall time: 280 ms


In [24]:
# Cross validation: five shuffle-split rounds, each holding out 20% of the
# rows it is given.
# NOTE(review): the rows passed in are the held-out test split
# (X_test_50 / y_test_50), not the training split -- confirm this is intended.
cross_val_score(
    GaussianNB_pipeline,
    X_test_50,
    y_test_50,
    cv=ShuffleSplit(n_splits=5, test_size=0.2),
)

array([0.641875, 0.644375, 0.645   , 0.663125, 0.64375 ])

In [7]:
# Score of the training-split model on the 20% held-out rows.
# Value recorded from the original run: 0.632875
partial_GuassianNB_50Doc2Vec_model.score(X=X_test_50, y=y_test_50)

0.632875

In [8]:
# Final model for export: fitted on every labelled row (train + held-out).
# A clone is fitted because Pipeline.fit returns the pipeline itself; fitting
# GaussianNB_pipeline directly would overwrite the hold-out model that shares
# the same object and leave this variable exposed to later refits.
GuassianNB_50Doc2Vec_model = clone(GaussianNB_pipeline).fit(X_combined_50, y)

In [9]:
# Persist the full-data model. The context manager guarantees the file handle
# is flushed and closed, even if pickling raises.
with open("models/GaussianNB-50Doc2Vec.sav", "wb") as model_file:
    pickle.dump(GuassianNB_50Doc2Vec_model, model_file)

## GaussianNB with Doc2Vec 100 features

In [17]:
%%time
partial_GuassianNB_100Doc2Vec_model = GaussianNB_pipeline.fit(X_train_100, y_train_100)

CPU times: user 441 ms, sys: 166 ms, total: 607 ms
Wall time: 632 ms


In [25]:
# Cross validation: five shuffle-split rounds, each holding out 20% of the
# rows it is given.
# NOTE(review): the rows passed in are the held-out test split
# (X_test_100 / y_test_100), not the training split -- confirm this is intended.
cross_val_score(
    GaussianNB_pipeline,
    X_test_100,
    y_test_100,
    cv=ShuffleSplit(n_splits=5, test_size=0.2),
)

array([0.610625, 0.60375 , 0.60625 , 0.611875, 0.59125 ])

In [18]:
# Score of the training-split model on the 20% held-out rows.
# Value recorded from the original run: 0.597
partial_GuassianNB_100Doc2Vec_model.score(X=X_test_100, y=y_test_100)

0.597

In [26]:
# Final model for export: fitted on every labelled row (train + held-out).
# A clone is fitted because Pipeline.fit returns the pipeline itself; fitting
# GaussianNB_pipeline directly would overwrite the other GaussianNB models
# that share the same object.
GuassianNB_100Doc2Vec_model = clone(GaussianNB_pipeline).fit(X_combined_100, y)

In [27]:
# Persist the full-data model. The context manager guarantees the file handle
# is flushed and closed, even if pickling raises.
with open("models/GaussianNB-100Doc2Vec.sav", "wb") as model_file:
    pickle.dump(GuassianNB_100Doc2Vec_model, model_file)

## ComplementNB with 50 Doc2Vec features

In [38]:
%%time
partial_ComplementNB_50Doc2Vec_model = ComplementNB_pipeline.fit(X_train_50, y_train_50)

CPU times: user 153 ms, sys: 37.1 ms, total: 191 ms
Wall time: 145 ms


In [42]:
# Cross validation: five shuffle-split rounds, each holding out 20% of the
# rows it is given.
# NOTE(review): the rows passed in are the held-out test split
# (X_test_50 / y_test_50), not the training split -- confirm this is intended.
cross_val_score(
    ComplementNB_pipeline,
    X_test_50,
    y_test_50,
    cv=ShuffleSplit(n_splits=5, test_size=0.2),
)

array([0.69    , 0.674375, 0.681875, 0.685625, 0.690625])

In [39]:
# Score of the training-split model on the 20% held-out rows.
partial_ComplementNB_50Doc2Vec_model.score(X=X_test_50, y=y_test_50)

0.694

In [40]:
# Final model for export: fitted on every labelled row (train + held-out).
# A clone is fitted because Pipeline.fit returns the pipeline itself; fitting
# ComplementNB_pipeline directly would overwrite the hold-out model that
# shares the same object and leave this variable exposed to later refits.
ComplementNB_50Doc2Vec_model = clone(ComplementNB_pipeline).fit(X_combined_50, y)

In [41]:
# Persist the full-data model. The context manager guarantees the file handle
# is flushed and closed, even if pickling raises.
with open("models/ComplementNB-50Doc2Vec.sav", "wb") as model_file:
    pickle.dump(ComplementNB_50Doc2Vec_model, model_file)

## ComplementNB with 100 Doc2Vec features

In [45]:
%%time
partial_ComplementNB_100Doc2Vec_model = ComplementNB_pipeline.fit(X_train_100, y_train_100)

CPU times: user 254 ms, sys: 43.9 ms, total: 298 ms
Wall time: 230 ms


In [46]:
# Cross validation: five shuffle-split rounds, each holding out 20% of the
# rows it is given.
# NOTE(review): the rows passed in are the held-out test split
# (X_test_100 / y_test_100), not the training split -- confirm this is intended.
cross_val_score(
    ComplementNB_pipeline,
    X_test_100,
    y_test_100,
    cv=ShuffleSplit(n_splits=5, test_size=0.2),
)

array([0.668125, 0.671875, 0.688125, 0.679375, 0.705   ])

In [47]:
# Score of the training-split model on the 20% held-out rows.
partial_ComplementNB_100Doc2Vec_model.score(X=X_test_100, y=y_test_100)

0.687

In [48]:
# Final model for export: fitted on every labelled row (train + held-out).
# A clone is fitted because Pipeline.fit returns the pipeline itself; fitting
# ComplementNB_pipeline directly would overwrite the other ComplementNB
# models that share the same object.
ComplementNB_100Doc2Vec_model = clone(ComplementNB_pipeline).fit(X_combined_100, y)

In [50]:
# Persist the full-data model. The context manager guarantees the file handle
# is flushed and closed, even if pickling raises.
with open("models/ComplementNB-100Doc2Vec.sav", "wb") as model_file:
    pickle.dump(ComplementNB_100Doc2Vec_model, model_file)