In [None]:
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd
import scipy
import xgboost as xgb
from scipy.sparse import csr_matrix
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.tree import DecisionTreeClassifier

from processing import combine_with_vec, exclude_non_numeric
from read_data import get_training, get_test, get_Doc2Vec, get_sparse
# Seed NumPy's global random generator once, up front; later cells pass no random_state of their own.
np.random.seed(seed=30027)

In [None]:
# Training inputs and targets as returned by get_training (presumably features and labels).
X, y = get_training()

# Doc2Vec vectors for the training name / ingredients / steps fields, loaded at two widths.
(train_name_vec50, train_ingr_vec50, train_steps_vec50) = get_Doc2Vec(num_features=50, data="train")
(train_name_vec100, train_ingr_vec100, train_steps_vec100) = get_Doc2Vec(num_features=100, data="train")

### Doc2Vec preparation

In [None]:
# Build the 50-feature table: whatever exclude_non_numeric keeps of X (presumably the numeric
# columns) is handed to combine_with_vec together with the three 50-feature Doc2Vec frames.
# Copies are taken first, so the helpers receive copies rather than the loaded originals.
temp_train_name_vec50, temp_train_ingr_vec50, temp_train_steps_vec50 = (
    vec.copy() for vec in (train_name_vec50, train_ingr_vec50, train_steps_vec50)
)
temp_X = exclude_non_numeric(X.copy())
X_combined_50: pd.DataFrame = combine_with_vec(
    temp_X,
    temp_train_name_vec50,
    temp_train_ingr_vec50,
    temp_train_steps_vec50,
)

In [None]:
# Hold out 20% of the 50-feature table. Labels are shifted down by one and cast to int before the
# split (presumably the raw labels start at 1 - confirm). No random_state is passed here.
X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(
    X_combined_50,
    (y - 1).astype(int),
    test_size=0.2,
)

In [None]:
# Build the 100-feature table: whatever exclude_non_numeric keeps of X (presumably the numeric
# columns) is handed to combine_with_vec together with the three 100-feature Doc2Vec frames.
# Copies are taken first, so the helpers receive copies rather than the loaded originals.
temp_train_name_vec100, temp_train_ingr_vec100, temp_train_steps_vec100 = (
    vec.copy() for vec in (train_name_vec100, train_ingr_vec100, train_steps_vec100)
)
temp_X = exclude_non_numeric(X.copy())
X_combined_100: pd.DataFrame = combine_with_vec(
    temp_X,
    temp_train_name_vec100,
    temp_train_ingr_vec100,
    temp_train_steps_vec100,
)

In [None]:
# Hold out 20% of the 100-feature table. Labels are shifted down by one and cast to int before the
# split (presumably the raw labels start at 1 - confirm). No random_state is passed here.
X_train_100, X_test_100, y_train_100, y_test_100 = train_test_split(
    X_combined_100,
    (y - 1).astype(int),
    test_size=0.2,
)

### Sparse matrix preparation

In [None]:
# Apply log1p to whatever exclude_non_numeric keeps of X (presumably the numeric columns), then
# store the result as a CSR sparse matrix.
X_numeric_sparse = csr_matrix(
    FunctionTransformer(func=np.log1p).fit_transform(exclude_non_numeric(X))
)

In [None]:
# Name / ingredients / steps matrices returned by get_sparse for the training data
# (presumably sparse text features - confirm against read_data).
train_name_vec, train_ingr_vec, train_steps_vec = get_sparse(data="train")

# Column-wise stack into one CSR matrix: the log-transformed numeric block first, then the three
# text blocks in name / ingredients / steps order.
train_sparse = scipy.sparse.hstack(
    [X_numeric_sparse, train_name_vec, train_ingr_vec, train_steps_vec],
    format="csr",
)

In [None]:
# Hold out 20% of the stacked sparse matrix. Labels are shifted down by one and cast to int before
# the split (presumably the raw labels start at 1 - confirm). No random_state is passed here.
X_train_csr, X_test_csr, y_train_csr, y_test_csr = train_test_split(
    train_sparse,
    (y - 1).astype(int),
    test_size=0.2,
)

## Decision Tree pipelines

In [None]:
# Pipeline for the Doc2Vec feature tables: log1p on the "n_ingredients" and "n_steps" columns with
# all other columns passed through, then standard scaling, then a default DecisionTreeClassifier.
DecisionTree_pipeline = make_pipeline(
    make_column_transformer(
        (FunctionTransformer(func=np.log1p), ["n_ingredients", "n_steps"]),
        remainder="passthrough",
    ),
    StandardScaler(),
    DecisionTreeClassifier(),
)

In [None]:
# Pipeline for the Doc2Vec feature tables: log1p on the "n_ingredients" and "n_steps" columns with
# all other columns passed through, then standard scaling, then a default LGBMClassifier.
LGBM_pipeline = make_pipeline(
    make_column_transformer(
        (FunctionTransformer(func=np.log1p), ["n_ingredients", "n_steps"]),
        remainder="passthrough",
    ),
    StandardScaler(),
    lgb.LGBMClassifier(),
)

In [None]:
# Pipeline for the Doc2Vec feature tables: log1p on the "n_ingredients" and "n_steps" columns with
# all other columns passed through, then standard scaling, then an XGBClassifier created with
# use_label_encoder=False.
XGB_pipeline = make_pipeline(
    make_column_transformer(
        (FunctionTransformer(func=np.log1p), ["n_ingredients", "n_steps"]),
        remainder="passthrough",
    ),
    StandardScaler(),
    xgb.XGBClassifier(use_label_encoder=False),
)

In [None]:
# Pipeline for the Doc2Vec feature tables: log1p on the "n_ingredients" and "n_steps" columns with
# all other columns passed through, then standard scaling, then a default RandomForestClassifier.
RandomForest_pipeline = make_pipeline(
    make_column_transformer(
        (FunctionTransformer(func=np.log1p), ["n_ingredients", "n_steps"]),
        remainder="passthrough",
    ),
    StandardScaler(),
    RandomForestClassifier(),
)

### Sparse matrix pipeline

In [None]:
# Sparse-input variant: scaling without mean-centering (with_mean=False), followed by a default
# DecisionTreeClassifier. No column transformer is used in this pipeline.
DecisionTree_sparse_pipeline = make_pipeline(
    StandardScaler(with_mean=False), DecisionTreeClassifier()
)

In [None]:
# Sparse-input variant: scaling without mean-centering (with_mean=False), followed by a default
# LGBMClassifier. No column transformer is used in this pipeline.
LGBM_sparse_pipeline = make_pipeline(
    StandardScaler(with_mean=False), lgb.LGBMClassifier()
)

In [None]:
# Sparse-input variant: scaling without mean-centering (with_mean=False), followed by an
# XGBClassifier created with use_label_encoder=False. No column transformer is used here.
XGB_sparse_pipeline = make_pipeline(
    StandardScaler(with_mean=False), xgb.XGBClassifier(use_label_encoder=False)
)

In [None]:
# Sparse-input variant: scaling without mean-centering (with_mean=False), followed by a default
# RandomForestClassifier. No column transformer is used in this pipeline.
RandomForest_sparse_pipeline = make_pipeline(
    StandardScaler(with_mean=False), RandomForestClassifier()
)

## Decision Tree with 50 Doc2Vec features
Very basic tree

In [None]:
%%time
partial_DecisionTree_50Doc2Vec_model = DecisionTree_pipeline.fit(X_train_50, y_train_50)

In [None]:
# Five shuffled 80/20 re-splits of the 50-feature held-out data.
# NOTE(review): this cross-validates on the held-out split, not the training split - confirm intended.
cross_val_score(
    estimator=DecisionTree_pipeline,
    X=X_test_50,
    y=y_test_50,
    cv=ShuffleSplit(n_splits=5, test_size=0.2),
)

In [None]:
# Score the fitted decision-tree pipeline on the 50-feature held-out split.
partial_DecisionTree_50Doc2Vec_model.score(X=X_test_50, y=y_test_50)

## LightGBM with 50 Doc2Vec features

In [None]:
%%time
partial_LGBM_50Doc2Vec_model = LGBM_pipeline.fit(X_train_50, y_train_50)

In [None]:
# Five shuffled 80/20 re-splits of the 50-feature held-out data.
# NOTE(review): this cross-validates on the held-out split, not the training split - confirm intended.
cross_val_score(
    estimator=LGBM_pipeline,
    X=X_test_50,
    y=y_test_50,
    cv=ShuffleSplit(n_splits=5, test_size=0.2),
)

In [None]:
# Score the fitted LightGBM pipeline on the 50-feature held-out split.
partial_LGBM_50Doc2Vec_model.score(X=X_test_50, y=y_test_50)
# Score noted from an earlier run: 0.73

In [None]:
%%time
LGBM_50Doc2Vec_model = LGBM_pipeline.fit(X_combined_50, (y - 1).astype(int))

In [None]:
# Persist the fully trained pipeline. The context manager closes the file handle even if
# pickle.dump raises, instead of leaving it to garbage collection.
with open("models/LightGBM-50Doc2Vec.sav", "wb") as model_file:
    pickle.dump(LGBM_50Doc2Vec_model, model_file)

## LightGBM with 100 Doc2Vec features

In [None]:
%%time
partial_LGBM_100Doc2Vec_model = LGBM_pipeline.fit(X_train_100, y_train_100)

In [None]:
# Five shuffled 80/20 re-splits of the 100-feature held-out data.
# NOTE(review): this cross-validates on the held-out split, not the training split - confirm intended.
cross_val_score(
    estimator=LGBM_pipeline,
    X=X_test_100,
    y=y_test_100,
    cv=ShuffleSplit(n_splits=5, test_size=0.2),
)

In [None]:
# Score the fitted LightGBM pipeline on the 100-feature held-out split.
partial_LGBM_100Doc2Vec_model.score(X=X_test_100, y=y_test_100)
# Score noted from an earlier run: 0.7135

In [None]:
%%time
LGBM_100Doc2Vec_model = LGBM_pipeline.fit(X_combined_100, (y - 1).astype(int))

In [None]:
# Persist the fully trained pipeline. The context manager closes the file handle even if
# pickle.dump raises, instead of leaving it to garbage collection.
with open("models/LightGBM-100Doc2Vec.sav", "wb") as model_file:
    pickle.dump(LGBM_100Doc2Vec_model, model_file)

## XGBoost with 50 Doc2Vec features

In [None]:
%%time
partial_XGB_50Doc2Vec_model = XGB_pipeline.fit(X_train_50, y_train_50)

In [None]:
# Five shuffled 80/20 re-splits of the 50-feature held-out data.
# NOTE(review): this cross-validates on the held-out split, not the training split - confirm intended.
cross_val_score(
    estimator=XGB_pipeline,
    X=X_test_50,
    y=y_test_50,
    cv=ShuffleSplit(n_splits=5, test_size=0.2),
)

In [None]:
# Score the fitted XGBoost pipeline on the 50-feature held-out split.
partial_XGB_50Doc2Vec_model.score(X=X_test_50, y=y_test_50)
# Score noted from an earlier run: 0.727625

In [None]:
# Final 50-feature XGBoost model, trained on all rows with labels shifted down by one.
# Pipeline.fit returns the pipeline object itself and XGB_pipeline is also fitted on the training
# split, so a clone is fitted here to keep this model independent of that earlier fit.
XGB_50Doc2Vec_model = clone(XGB_pipeline).fit(X_combined_50, (y - 1).astype(int))

In [None]:
# Persist the fully trained pipeline. The context manager closes the file handle even if
# pickle.dump raises, instead of leaving it to garbage collection.
with open("models/XGBoost-50Doc2Vec.sav", "wb") as model_file:
    pickle.dump(XGB_50Doc2Vec_model, model_file)

## RandomForestClassifier with 50 Doc2Vec features

In [None]:
%%time
partial_RandomForest_50Doc2Vec_model = RandomForest_pipeline.fit(X_train_50, y_train_50)

In [None]:
# Five shuffled 80/20 re-splits of the 50-feature held-out data. This section trains and scores the
# random forest on the 50-feature split, so the cross-validation uses the same 50-feature data
# (the 100-feature split was passed here before).
# NOTE(review): this cross-validates on the held-out split, not the training split - confirm intended.
cross_val_score(RandomForest_pipeline, X_test_50, y_test_50, cv=ShuffleSplit(n_splits=5, test_size=0.2))

In [None]:
# Score the fitted random-forest pipeline on the 50-feature held-out split.
partial_RandomForest_50Doc2Vec_model.score(X=X_test_50, y=y_test_50)

In [None]:
%%time
RandomForest_50Doc2Vec_model = RandomForest_pipeline.fit(X_combined_50, (y-1).astype(int))

In [None]:
# Persist the fully trained pipeline. The context manager closes the file handle even if
# pickle.dump raises, instead of leaving it to garbage collection.
with open("models/RandomForest-50Doc2Vec.sav", "wb") as model_file:
    pickle.dump(RandomForest_50Doc2Vec_model, model_file)

## DecisionTree with sparse matrix

In [None]:
%%time
partial_DecisionTree_sparse_model = DecisionTree_sparse_pipeline.fit(X_train_csr, y_train_csr)

In [None]:
# Five shuffled 80/20 re-splits of the sparse held-out data.
# NOTE(review): this cross-validates on the held-out split, not the training split - confirm intended.
cross_val_score(
    estimator=partial_DecisionTree_sparse_model,
    X=X_test_csr,
    y=y_test_csr,
    cv=ShuffleSplit(n_splits=5, test_size=0.2),
)

In [None]:
# Score the fitted sparse decision-tree pipeline on the sparse held-out split.
partial_DecisionTree_sparse_model.score(X=X_test_csr, y=y_test_csr)

## LightGBM with sparse matrix

In [None]:
%%time
partial_LGBM_sparse_model = LGBM_sparse_pipeline.fit(X_train_csr, y_train_csr)

In [None]:
# Five shuffled 80/20 re-splits of the sparse held-out data.
# NOTE(review): this cross-validates on the held-out split, not the training split - confirm intended.
cross_val_score(
    estimator=partial_LGBM_sparse_model,
    X=X_test_csr,
    y=y_test_csr,
    cv=ShuffleSplit(n_splits=5, test_size=0.2),
)

In [None]:
# Score the fitted sparse LightGBM pipeline on the sparse held-out split.
partial_LGBM_sparse_model.score(X=X_test_csr, y=y_test_csr)

In [None]:
# Final sparse LightGBM model, trained on the full sparse matrix with labels shifted down by one.
# Pipeline.fit returns the pipeline object itself and LGBM_sparse_pipeline is also fitted on the
# training split, so a clone is fitted here to keep this model independent of that earlier fit.
LGBM_sparse_model = clone(LGBM_sparse_pipeline).fit(train_sparse, (y - 1).astype(int))

In [None]:
# Persist the fully trained pipeline. The context manager closes the file handle even if
# pickle.dump raises, instead of leaving it to garbage collection.
with open("models/LightGBM-sparse.sav", "wb") as model_file:
    pickle.dump(LGBM_sparse_model, model_file)

## XGBoost with sparse matrix

In [None]:
%%%time
partial_XGB_sprase_model = XGB_sparse_pipeline.fit(X_train_csr, y_train_csr)

In [None]:
# Five shuffled 80/20 re-splits of the sparse held-out data.
# NOTE(review): this cross-validates on the held-out split, not the training split - confirm intended.
cross_val_score(
    estimator=partial_XGB_sprase_model,
    X=X_test_csr,
    y=y_test_csr,
    cv=ShuffleSplit(n_splits=5, test_size=0.2),
)

In [None]:
# Score the fitted sparse XGBoost pipeline on the sparse held-out split.
partial_XGB_sprase_model.score(X=X_test_csr, y=y_test_csr)

## RandomForest with sparse matrix

In [None]:
%%time
partial_RandomForest_sparse_model = RandomForest_sparse_pipeline.fit(X_train_csr, y_train_csr)

In [None]:
# Five shuffled 80/20 re-splits of the sparse held-out data.
# NOTE(review): this cross-validates on the held-out split, not the training split - confirm intended.
cross_val_score(
    estimator=partial_RandomForest_sparse_model,
    X=X_test_csr,
    y=y_test_csr,
    cv=ShuffleSplit(n_splits=5, test_size=0.2),
)

In [None]:
# Score the fitted sparse random-forest pipeline on the sparse held-out split.
partial_RandomForest_sparse_model.score(X=X_test_csr, y=y_test_csr)