In [1]:
from read_data import get_training, get_Doc2Vec
from processing import combine_with_vec, exclude_non_numeric
from sklearn.svm import SVC, LinearSVC
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
import pickle

np.random.seed(30027)

In [2]:
# Training data
X, y = get_training()
train_name_vec50, train_ingr_vec50, train_steps_vec50 = get_Doc2Vec(data="train", num_features=50)
train_name_vec100, train_ingr_vec100, train_steps_vec100 = get_Doc2Vec(data="train", num_features=100)

# SVM pipelines
difference in models: 
https://stackoverflow.com/questions/27912872/what-is-the-difference-between-svc-and-svm-in-scikit-learn

In [3]:
SVC_RBF_pipeline = make_pipeline(
    make_column_transformer((FunctionTransformer(np.log1p), ["n_ingredients", "n_steps"]), remainder='passthrough'),
    StandardScaler(),
    SVC(kernel='rbf', gamma='auto', C=1.0)
)

In [4]:
# LinearSVC
LinearSVC_pipeline = make_pipeline(
    make_column_transformer((FunctionTransformer(np.log1p), ["n_ingredients", "n_steps"]), remainder='passthrough'),
    StandardScaler(),
    LinearSVC(C=1.0, max_iter=10000)
)

### Doc2Vec preparation

In [5]:
# Combine Doc2Vec with 50 features and non-numeric train data
temp_X = X.copy()
temp_X = exclude_non_numeric(temp_X)
temp_train_name_vec50 = train_name_vec50.copy()
temp_train_ingr_vec50 = train_ingr_vec50.copy()
temp_train_steps_vec50 = train_steps_vec50.copy()
X_combined_50: pd.DataFrame = combine_with_vec(temp_X, temp_train_name_vec50, temp_train_ingr_vec50, temp_train_steps_vec50)

In [6]:
X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(X_combined_50, y, test_size=0.2)

In [7]:
# Combine Doc2Vec with 50 features and non-numeric train data
temp_X = X.copy()
temp_X = exclude_non_numeric(temp_X)
temp_train_name_vec100 = train_name_vec100.copy()
temp_train_ingr_vec100 = train_ingr_vec100.copy()
temp_train_steps_vec100 = train_steps_vec100.copy()
X_combined_100: pd.DataFrame = combine_with_vec(temp_X, temp_train_name_vec100, temp_train_ingr_vec100, temp_train_steps_vec100)

In [23]:
X_train_100, X_test_100, y_train_100, y_test_100 = train_test_split(X_combined_100, y, test_size=0.2)

## SVM with RBF kernel and Doc2Vec with 50 features
Warning: takes 5min to run below code

In [9]:
%%time
partial_SVC_RBF_50Doc2Vec_model = SVC_RBF_pipeline.fit(X_train_50, y_train_50)

CPU times: user 6min 51s, sys: 5.17 s, total: 6min 56s
Wall time: 7min 1s


Above model training time
- CPU times: user 4min 32s, sys: 4.57 s, total: 4min 37s
- Wall time: 4min 49s

In [10]:
# Score
partial_SVC_RBF_50Doc2Vec_model.score(X_test_50, y_test_50)
# -> 0.7095

0.744875

### Model with all train data
Warning: Takes long time!!!!!

In [11]:
%%time
SVC_RBF_50Doc2Vec_model = SVC_RBF_pipeline.fit(X_combined_50, y)

CPU times: user 11min 34s, sys: 6.7 s, total: 11min 41s
Wall time: 11min 48s


Above model training time
- CPU times: user 6min 57s, sys: 6.75 s, total: 7min 3s
- Wall time: 7min 19s

In [12]:
# save model
pickle.dump(SVC_RBF_50Doc2Vec_model, open("models/SVC-RBF-50Doc2Vec.sav", "wb"))

## SVM with RBF kernel and Doc2Vec with 100 features

Warning: below code take loooong time to excute!!!

In [13]:
%%time
partial_SVC_RBF_100Doc2Vec_model = SVC_RBF_pipeline.fit(X_train_100, y_train_100)

CPU times: user 14min 5s, sys: 7.83 s, total: 14min 12s
Wall time: 18min 27s


Above model training time
- CPU times: user 8min 42s, sys: 9.23 s, total: 8min 51s
- Wall time: 9min 15s

In [14]:
# Score
partial_SVC_RBF_100Doc2Vec_model.score(X_test_100, y_test_100)
# -> 0.707375 Score decreased lol

0.737125

### Model with all train data
Warning: take very loooooong time!!!!

In [15]:
%%time
SVC_RBF_100Doc2Vec_model = SVC_RBF_pipeline.fit(X_combined_100, y)

CPU times: user 23min 15s, sys: 15.1 s, total: 23min 30s
Wall time: 23min 47s


In [16]:
# save model
pickle.dump(SVC_RBF_100Doc2Vec_model, open("models/SVC-RBF-100Doc2Vec.sav", "wb"))

## LinearSVC with 50 features Doc2Vec 

In [17]:
%%time
partial_LinearSVC_50Doc2Vec_model = LinearSVC_pipeline.fit(X_train_50, y_train_50)

CPU times: user 9min 4s, sys: 6.6 s, total: 9min 11s
Wall time: 9min 19s


Above had convergence warning 
CPU times: user 9min 4s, sys: 6.6 s, total: 9min 11s
Wall time: 9min 19s
/Users/mcken/.anyenv/envs/pyenv/versions/3.8.5/lib/python3.8/site-packages/sklearn/svm/_base.py:976: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn("Liblinear failed to converge, increase "

In [18]:
partial_LinearSVC_50Doc2Vec_model.score(X_test_50, y_test_50)
# -> 0.7205

0.718375

In [22]:
%%time
LinearSVC_50Doc2Vec_model = LinearSVC_pipeline.fit(X_combined_50, y)

CPU times: user 11min 55s, sys: 7.39 s, total: 12min 3s
Wall time: 13min 36s


Also fail to converge

In [24]:
pickle.dump(LinearSVC_50Doc2Vec_model, open("models/LinearSVC_50Doc2Vec_model.sav", "wb"))

## LinearSVC with 100 features Doc2Vec

Because 50 didnt coverge probably not good idea to implemnent 100 without any feature selection