In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn import metrics
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer

from processing import combine_with_vec, exclude_non_numeric
from read_data import get_training, get_test, get_Doc2Vec, get_sparse
# Seed NumPy's global RNG once, up front, so that any scikit-learn call left at
# random_state=None (which falls back to this global state) is repeatable on a
# fresh Restart & Run All.
np.random.seed(30027)

In [None]:
# Training data
# X holds the features and y the labels that the splits below are built from.
X, y = get_training()
# Doc2Vec embeddings for the training rows at two sizes (num_features=50 and 100).
# Each call is unpacked into three objects, named here for name / ingredients / steps.
# NOTE(review): only the 50-feature set is consumed by the cells visible in this
# section — confirm the 100-feature set is used later, otherwise this load is wasted.
train_name_vec50, train_ingr_vec50, train_steps_vec50 = get_Doc2Vec(data="train", num_features=50)
train_name_vec100, train_ingr_vec100, train_steps_vec100 = get_Doc2Vec(data="train", num_features=100)

In [None]:
# Combine the 50-feature Doc2Vec vectors with the training data that remains after
# exclude_non_numeric() (presumably the numeric columns only, going by the helper's
# name — verify in processing.py).
# Every input is copied first, so the helpers operate on copies rather than on X or
# the loaded vectors. NOTE(review): whether the helpers mutate their arguments is not
# visible here; the copies are defensive.
temp_X = X.copy()
temp_X = exclude_non_numeric(temp_X)
temp_train_name_vec50 = train_name_vec50.copy()
temp_train_ingr_vec50 = train_ingr_vec50.copy()
temp_train_steps_vec50 = train_steps_vec50.copy()
# The result is annotated as a DataFrame; the pipeline below selects columns by name.
X_combined_50: pd.DataFrame = combine_with_vec(temp_X, temp_train_name_vec50, temp_train_ingr_vec50, temp_train_steps_vec50)

In [None]:
# Hold out 20% of the rows for the final accuracy check of the Doc2Vec model.
# random_state is pinned (to the same value the notebook passes to np.random.seed) so
# that re-running this cell reproduces the same split. Without it the split is drawn
# from NumPy's global RNG, and a re-run would silently change X_test_50 underneath a
# model that was fitted on the previous split.
X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(
    X_combined_50, y, test_size=0.2, random_state=30027
)

In [None]:
# Dense (Doc2Vec) model: log1p-transform the n_ingredients and n_steps columns, pass
# every other column through unchanged, standardise all resulting features, then fit
# a logistic regression with scikit-learn's default settings.
LogisticRegression_pipeline = make_pipeline(
    # Columns are selected by name, so this pipeline has to be fed a DataFrame.
    make_column_transformer((FunctionTransformer(np.log1p), ["n_ingredients", "n_steps"]), remainder='passthrough'),
    StandardScaler(),
    LogisticRegression()
)

In [None]:
%%time
# Fit on the 80% training split and report how long it takes.
# Pipeline.fit returns the pipeline itself, so this name is an alias of
# LogisticRegression_pipeline rather than an independent copy.
partial_LogisticRegression_50Doc2Vec_model = LogisticRegression_pipeline.fit(X_train_50, y_train_50)

In [None]:
# Estimate generalisation accuracy with 5 random 80/20 splits of the TRAINING data.
# The held-out X_test_50 / y_test_50 split is deliberately left out: cross-validating
# on it would refit the model on test rows (and on only a small fraction of the data),
# so the test split would no longer be an untouched final check.
# cross_val_score clones the estimator for each split, so the already-fitted pipeline
# is not modified. random_state is pinned so that re-running the cell gives the same
# folds.
cross_val_score(
    LogisticRegression_pipeline,
    X_train_50,
    y_train_50,
    cv=ShuffleSplit(n_splits=5, test_size=0.2, random_state=30027),
)

In [None]:
# Score the fitted pipeline on the held-out split; for a classifier pipeline
# .score() is the mean accuracy.
partial_LogisticRegression_50Doc2Vec_model.score(X_test_50, y_test_50)

## Sparse Matrix Logistic Regression

In [None]:
# Reload the training data so the sparse-matrix section starts from freshly loaded X and y.
X, y = get_training()

In [None]:
# Apply log1p to every column kept by exclude_non_numeric() and store the result as a
# CSR matrix so it can be stacked with the sparse text features.
X_numeric_sparse = csr_matrix(FunctionTransformer(np.log1p).fit_transform(exclude_non_numeric(X)))
# Three sparse training matrices, named here for name / ingredients / steps.
train_name_vec, train_ingr_vec, train_steps_vec = get_sparse(data="train")
# Column-wise concatenation: one row per sample, all feature groups side by side.
train_sparse = scipy.sparse.hstack((X_numeric_sparse, train_name_vec, train_ingr_vec, train_steps_vec), format="csr")
# Hold out 20% of the rows. Labels are shifted down by one and cast to int
# (presumably to make them zero-based — confirm against the label encoding).
# random_state is pinned (to the same value the notebook passes to np.random.seed) so
# that re-running this cell reproduces the same split instead of drawing a new one
# from NumPy's global RNG.
X_train_csr, X_test_csr, y_train_csr, y_test_csr = train_test_split(
    train_sparse, (y - 1).astype(int), test_size=0.2, random_state=30027
)

In [None]:
# Sparse model: scale the stacked CSR features, then fit a logistic regression.
LogisticRegression_sparse_pipeline = make_pipeline(
    # with_mean=False: centring would turn the sparse matrix dense, and StandardScaler
    # refuses to centre sparse input, so only the variance scaling is applied.
    StandardScaler(with_mean=False),
    # NOTE(review): warm_start=True makes a repeated .fit() on this same object start
    # from the previous coefficients, so re-running the fit cell is not idempotent —
    # confirm this is intended.
    LogisticRegression(warm_start = True)
)

In [None]:
%%time
# Fit on the 80% training split and report how long it takes.
# Pipeline.fit returns the pipeline itself, so this name is an alias of
# LogisticRegression_sparse_pipeline rather than an independent copy.
partial_LR_sparse_model = LogisticRegression_sparse_pipeline.fit(X_train_csr, y_train_csr)

In [None]:
# Estimate generalisation accuracy with 5 random 80/20 splits of the TRAINING data.
# The held-out X_test_csr / y_test_csr split is deliberately left out: cross-validating
# on it would refit the model on test rows (and on only a small fraction of the data),
# so the test split would no longer be an untouched final check.
# cross_val_score clones the estimator for each split, so passing the already-fitted
# pipeline neither reuses nor alters its fitted state. random_state is pinned so that
# re-running the cell gives the same folds.
cross_val_score(
    partial_LR_sparse_model,
    X_train_csr,
    y_train_csr,
    cv=ShuffleSplit(n_splits=5, test_size=0.2, random_state=30027),
)

In [None]:
# Accuracy of the fitted sparse pipeline on the held-out split; kept in `score`
# because the heatmap title reuses it.
score = partial_LR_sparse_model.score(X_test_csr, y_test_csr)

In [None]:
# Confusion matrix of the sparse model on the held-out split.
# normalize='true' divides each row by the number of samples with that actual label,
# so every row sums to 1 and the diagonal reads as per-class recall.
# (`metrics` is imported with the other dependencies in the notebook's first cell.)
predictions = partial_LR_sparse_model.predict(X_test_csr)
cm = metrics.confusion_matrix(y_test_csr, predictions, normalize='true')
print(cm)

In [None]:
# Draw the row-normalised confusion matrix as an annotated heatmap.
# The explicit fig/ax pair replaces pyplot's implicit "current axes", so the labels and
# title are guaranteed to land on the heatmap axes rather than on whichever axes
# happens to be current.
# (`sns` and `plt` are imported with the other dependencies in the notebook's first cell.)
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Blues_r', ax=ax)
# confusion_matrix puts actual labels on rows and predicted labels on columns.
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(score)
# The trailing semicolon suppresses the Text repr in the cell output.
ax.set_title(all_sample_title, size=15);