In [23]:
from read_data import get_training, get_test, get_Doc2Vec, get_data
from processing import combine_with_vec, exclude_non_numeric
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb
import pickle
np.random.seed(30027)

In [2]:
X,y = get_training()
train_name_vec50, train_ingr_vec50, train_steps_vec50 = get_Doc2Vec(data="train", num_features=50)
train_name_vec100, train_ingr_vec100, train_steps_vec100 = get_Doc2Vec(data="train", num_features=100)

### Doc2Vec preparation

In [3]:
# Combine Doc2Vec with 50 features and non-numeric train data
temp_X = X.copy()
temp_X = exclude_non_numeric(temp_X)
temp_train_name_vec50 = train_name_vec50.copy()
temp_train_ingr_vec50 = train_ingr_vec50.copy()
temp_train_steps_vec50 = train_steps_vec50.copy()
X_combined_50: pd.DataFrame = combine_with_vec(temp_X, temp_train_name_vec50, temp_train_ingr_vec50, temp_train_steps_vec50)

In [4]:
X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(X_combined_50, (y - 1).astype(int), test_size=0.2)

In [5]:
# Combine Doc2Vec with 50 features and non-numeric train data
temp_X = X.copy()
temp_X = exclude_non_numeric(temp_X)
temp_train_name_vec100 = train_name_vec100.copy()
temp_train_ingr_vec100 = train_ingr_vec100.copy()
temp_train_steps_vec100 = train_steps_vec100.copy()
X_combined_100: pd.DataFrame = combine_with_vec(temp_X, temp_train_name_vec100, temp_train_ingr_vec100, temp_train_steps_vec100)

In [6]:
X_train_100, X_test_100, y_train_100, y_test_100 = train_test_split(X_combined_100, (y - 1).astype(int), test_size=0.2)

## Decision Tree pipelines

In [7]:
LGBM_pipeline = make_pipeline(
    make_column_transformer((FunctionTransformer(np.log1p), ["n_ingredients", "n_steps"]), remainder='passthrough'),
    StandardScaler(),
    lgb.LGBMClassifier())

In [10]:
XGB_pipeline = make_pipeline(
    make_column_transformer((FunctionTransformer(np.log1p), ["n_ingredients", "n_steps"]), remainder='passthrough'),
    StandardScaler(),
    xgb.XGBClassifier(use_label_encoder=False)
)

In [24]:
RandomForest_pipeline = make_pipeline(
    make_column_transformer((FunctionTransformer(np.log1p), ["n_ingredients", "n_steps"]), remainder='passthrough'),
    StandardScaler(),
    RandomForestClassifier()
)

## LighGBM with 50 Doc2Vec features 

In [11]:
%%time
partial_LGBM_50Doc2Vec_model = LGBM_pipeline.fit(X_train_50, y_train_50)

CPU times: user 41.5 s, sys: 2.31 s, total: 43.8 s
Wall time: 5.23 s


In [12]:
cross_val_score(LGBM_pipeline, X_test_50, y_test_50, cv=ShuffleSplit(n_splits=5, test_size=0.2))

array([0.71875 , 0.730625, 0.721875, 0.685625, 0.7025  ])

In [13]:
partial_LGBM_50Doc2Vec_model.score(X_test_50, y_test_50)
# -> 0.73

0.73

In [15]:
%%time
LGBM_50Doc2Vec_model = LGBM_pipeline.fit(X_combined_50, (y - 1).astype(int))

CPU times: user 51.2 s, sys: 3.37 s, total: 54.6 s
Wall time: 7.36 s


In [16]:
pickle.dump(LGBM_50Doc2Vec_model, open("models/LightGBM-50Doc2Vec.sav", "wb"))

## LightGBM with 100 Doc2Vec features

In [17]:
%%time
partial_LGBM_100Doc2Vec_model = LGBM_pipeline.fit(X_train_100, y_train_100)

CPU times: user 1min 24s, sys: 4.95 s, total: 1min 29s
Wall time: 12.7 s


In [18]:
cross_val_score(LGBM_pipeline, X_test_100, y_test_100, cv=ShuffleSplit(n_splits=5, test_size=0.2))

array([0.720625, 0.713125, 0.706875, 0.689375, 0.704375])

In [19]:
partial_LGBM_100Doc2Vec_model.score(X_test_100, y_test_100)
# -> 0.7135

0.7135

In [21]:
%%time
LGBM_100Doc2Vec_model = LGBM_pipeline.fit(X_combined_100, (y - 1).astype(int))

CPU times: user 1min 36s, sys: 5.3 s, total: 1min 41s
Wall time: 13.7 s


In [22]:
pickle.dump(LGBM_100Doc2Vec_model, open("models/LightGBM-100Doc2Vec.sav", "wb"))

## XGBoost with 50 Doc2Vec features

In [43]:
%%time
partial_XGB_50Doc2Vec_model = XGB_pipeline.fit(X_train_50, y_train_50)

CPU times: user 9min 48s, sys: 7.5 s, total: 9min 55s
Wall time: 1min 9s


In [44]:
cross_val_score(XGB_pipeline, X_test_50, y_test_50, cv=ShuffleSplit(n_splits=5, test_size=0.2))



array([0.691875, 0.71375 , 0.709375, 0.6925  , 0.685625])

In [11]:
partial_XGB_50Doc2Vec_model.score(X_test_50, y_test_50)
# -> 0.727625



0.727625

In [45]:
XGB_50Doc2Vec_model = XGB_pipeline.fit(X_combined_50, (y-1).astype(int))



In [46]:
pickle.dump(XGB_50Doc2Vec_model, open("models/XGBoost-50Doc2Vec.sav", "wb"))

## RandomForestClassifier with 50 Doc2Vec features

In [25]:
%%time
partial_RandomForest_50Doc2Vec_model = RandomForest_pipeline.fit(X_train_50, y_train_50)

CPU times: user 46.9 s, sys: 746 ms, total: 47.7 s
Wall time: 48.5 s


In [26]:
cross_val_score(RandomForest_pipeline, X_test_100, y_test_100, cv=ShuffleSplit(n_splits=5, test_size=0.2))

array([0.655   , 0.665   , 0.66    , 0.67125 , 0.676875])

In [27]:
partial_RandomForest_50Doc2Vec_model.score(X_test_50, y_test_50)

0.704375

In [28]:
%%time
RandomForest_50Doc2Vec_model = RandomForest_pipeline.fit(X_combined_50, (y-1).astype(int))

CPU times: user 1min 2s, sys: 1.03 s, total: 1min 3s
Wall time: 1min 4s


In [29]:
pickle.dump(RandomForest_50Doc2Vec_model, open("models/RandomForest-50Doc2Vec.sav", "wb"))