In [None]:
from read_data import get_training, get_test, get_sparse, get_Doc2Vec
from processing import exclude_non_numeric, combine_with_vec
from utils import create_csv_output
import numpy as np
import pandas as pd
import scipy
import pickle
from scipy.sparse import csr_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, StratifiedShuffleSplit
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
# Seed NumPy's global RNG once, before any data is split or scored, so a fresh
# top-to-bottom run starts every NumPy-driven random step from the same state.
np.random.seed(30027)
# Use a white figure background for every figure drawn after this point.
sns.set(rc={"figure.facecolor": "white"})

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

In [None]:
# Training data
train_X, train_y = get_training()
# Shift the labels down by one and cast them to int.
# NOTE(review): presumably the raw labels start at 1 and this makes them 0-based — confirm against get_training().
y = (train_y - 1).astype(int)
# Load the three Doc2Vec blocks for the training set at 50 and at 100 features.
# NOTE(review): the *_vec100 variables are not used in the cells visible here — confirm they are needed further down.
train_name_vec50, train_ingr_vec50, train_steps_vec50 = get_Doc2Vec(data="train", num_features=50)
train_name_vec100, train_ingr_vec100, train_steps_vec100 = get_Doc2Vec(data="train", num_features=100)

In [None]:
# Numeric columns of the raw training table, joined with the three
# 50-feature Doc2Vec blocks into a single feature table.
numeric_train_X = exclude_non_numeric(train_X)
X_combined_50: pd.DataFrame = combine_with_vec(
    numeric_train_X,
    train_name_vec50,
    train_ingr_vec50,
    train_steps_vec50,
)

In [None]:
# Hold out 20% of the Doc2Vec feature table; the held-out part is what the
# LogisticRegression pipelines further down are scored on.
# NOTE(review): no random_state is passed, so the split follows NumPy's global RNG
# (seeded in the import cell); re-running this cell on its own gives a different split.
X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(X_combined_50, y, test_size=0.2)

In [None]:
# Numeric columns go through log1p and are converted to CSR so they can be
# stacked column-wise with the sparse text blocks.
log1p_transformer = FunctionTransformer(np.log1p)
numeric_log1p = log1p_transformer.fit_transform(exclude_non_numeric(train_X))
X_numeric_sparse = csr_matrix(numeric_log1p)

train_name_vec, train_ingr_vec, train_steps_vec = get_sparse(data="train")

# Column order: numeric, name, ingredients, steps.
sparse_blocks = (X_numeric_sparse, train_name_vec, train_ingr_vec, train_steps_vec)
train_sparse = scipy.sparse.hstack(sparse_blocks, format="csr")

In [None]:
# Hold out 20% of the stacked sparse matrix, using the same label vector y.
# NOTE(review): no random_state is passed, so this split also follows NumPy's global RNG
# and is drawn independently of the Doc2Vec split (different rows end up held out).
X_train_csr, X_test_csr, y_train_csr, y_test_csr = train_test_split(train_sparse, y, test_size=0.2)

## Lasso

In [None]:
# LASSO regularisation strengths to compare.
alphas = [1e-3, 2e-3, 3e-3, 5e-3]

In [None]:
# Show (rows, columns) of the Doc2Vec training split before selecting features.
X_train_50.shape

In [None]:
# For each LASSO strength, record how many columns SelectFromModel keeps.
num_features = []
for alpha in alphas:
    lasso_selector = SelectFromModel(Lasso(alpha=alpha))
    selected_feat = lasso_selector.fit_transform(X_train_50, y_train_50)
    num_features += [selected_feat.shape[1]]

In [None]:
# Fit the LASSO-based selector at alpha=0.003 on the Doc2Vec training split
# (fit returns the selector itself), then reduce the same split to the kept columns.
lasso_fs = SelectFromModel(Lasso(alpha=0.003)).fit(X_train_50, y_train_50)
selected_feat = lasso_fs.transform(X_train_50)

In [None]:
# Rank the Doc2Vec-table columns by the *magnitude* of their LASSO coefficient.
# Sorting on the signed value (as before) pushed strongly negative weights to the
# bottom of the table, although they matter as much as strongly positive ones;
# SelectFromModel itself ranks linear-model features by |coef_|.
# The "Value" column keeps the signed coefficient so the direction stays visible.
feature_imp = pd.DataFrame(
    sorted(
        zip(lasso_fs.estimator_.coef_, X_train_50.columns),
        key=lambda coef_and_name: abs(coef_and_name[0]),
        reverse=True,
    ),
    columns=["Value", "Feature"],
)

In [None]:
# Bar chart of the first 15 rows of the LASSO ranking, saved for the report.
fig, ax = plt.subplots(figsize=(7, 5))
top_rows = feature_imp.head(15)
sns.barplot(x="Value", y="Feature", data=top_rows, palette="Blues_d", ax=ax)
ax.set_title("Feature importance by LASSO for Doc2Vec features")
fig.savefig("report_pics/FeatureEngneering/Lasso_Doc2Vec_feature_imp.png")

In [None]:
# LASSO-based selector (alpha=0.001) fitted on the sparse training split.
# fit returns the selector, so the bare name on the last line shows the same repr.
lasso_cv_fs = SelectFromModel(Lasso(alpha=0.001)).fit(X_train_csr, y_train_csr)
lasso_cv_fs

In [None]:
# Column labels for the stacked sparse matrix, in the order it was stacked:
# numeric columns first, then the name, ingredient and step blocks.
# NOTE(review): the two numeric names are hard-coded; this assumes
# exclude_non_numeric(train_X) yields exactly n_steps and n_ingredients, in that order.
features_names = ["n_steps", "n_ingredients"]
features_names += ["name_" + str(i) for i in range(train_name_vec.shape[1])]
features_names += ["ingr" + str(i) for i in range(train_ingr_vec.shape[1])]
features_names += ["steps" + str(i) for i in range(train_steps_vec.shape[1])]

# These labels are later zipped against per-column scores; zip() silently
# truncates on a length mismatch, which would shift every label. Fail loudly instead.
assert len(features_names) == train_sparse.shape[1], (
    f"{len(features_names)} feature names for {train_sparse.shape[1]} columns in train_sparse"
)

In [None]:
# Reduce the sparse training split to the columns the LASSO selector kept.
selected_cv_feat = lasso_cv_fs.transform(X_train_csr)
# Rank every column by the *magnitude* of its LASSO coefficient. Sorting on the
# signed value (as before) pushed strongly negative weights to the bottom of the
# table, although they matter as much as strongly positive ones.
# The "Value" column keeps the signed coefficient so the direction stays visible.
feature_cv_imp = pd.DataFrame(
    sorted(
        zip(lasso_cv_fs.estimator_.coef_, features_names),
        key=lambda coef_and_name: abs(coef_and_name[0]),
        reverse=True,
    ),
    columns=["Value", "Feature"],
)

In [None]:
# Bar chart of the first 15 rows of the sparse-feature LASSO ranking, saved for the report.
fig, ax = plt.subplots(figsize=(8, 5))
top_rows = feature_cv_imp.head(15)
sns.barplot(x="Value", y="Feature", data=top_rows, palette="Blues_d", ax=ax)
ax.set_title("Feature importance by LASSO with CountVectorizer features")
fig.savefig("report_pics/FeatureEngneering/Lasso_CountVectorizer_feature_imp.png")

## Mutual Information

In [None]:
# Keep the 23 columns with the highest mutual information with the label,
# scored on the Doc2Vec training split. fit returns the selector, so the bare
# name on the last line shows the same repr.
mi_selector = SelectKBest(mutual_info_classif, k=23).fit(X_train_50, y_train_50)
mi_selector

In [None]:
# Reduce the Doc2Vec training split to the columns kept by the fitted mutual-information selector.
selected_mi_feat = mi_selector.transform(X_train_50)

In [None]:
# Pair every column with its mutual-information score and order the pairs
# from highest to lowest score.
mi_pairs = list(zip(mi_selector.scores_, X_train_50.columns))
mi_pairs.sort(reverse=True)
mi_imp = pd.DataFrame(mi_pairs, columns=["Value", "Feature"])

In [None]:
# Bar chart of the 15 highest mutual-information scores, saved for the report.
fig, ax = plt.subplots(figsize=(8, 5))
top_rows = mi_imp.head(15)
sns.barplot(x="Value", y="Feature", data=top_rows, palette="Blues_d", ax=ax)
ax.set_title("Feature importance by Mutual Information with Doc2Vec features")
fig.savefig("report_pics/FeatureEngneering/mi_Doc2Vec.png")

In [None]:
# Keep the 300 columns with the highest mutual information with the label,
# scored on the sparse training split. fit returns the selector, so the bare
# name on the last line shows the same repr.
mi_cv_selector = SelectKBest(mutual_info_classif, k=300).fit(X_train_csr, y_train_csr)
mi_cv_selector

In [None]:
# Reduce the sparse training split to the columns the selector kept.
selected_mi_cv_feat = mi_cv_selector.transform(X_train_csr)

# Pair every column label with its mutual-information score and order the
# pairs from highest to lowest score.
mi_cv_pairs = list(zip(mi_cv_selector.scores_, features_names))
mi_cv_pairs.sort(reverse=True)
mi_cv_imp = pd.DataFrame(mi_cv_pairs, columns=["Value", "Feature"])

In [None]:
# Bar chart of the 15 highest mutual-information scores on the sparse features, saved for the report.
fig, ax = plt.subplots(figsize=(8, 5))
top_rows = mi_cv_imp.head(15)
sns.barplot(x="Value", y="Feature", data=top_rows, palette="Blues_d", ax=ax)
ax.set_title("Feature importance by Mutual Information with CountVectorizer features")
fig.savefig("report_pics/FeatureEngneering/mi_CountVectorizer.png")

In [None]:
# Sweep k = 5, 10, ..., 150: select the k best columns by mutual information,
# fit LogisticRegression on the training split and record the held-out score.
score_Doc2Vec = []
for n_selected in range(5, 153, 5):
    lr_fs_pipeline = make_pipeline(
        SelectKBest(mutual_info_classif, k=n_selected),
        LogisticRegression(),
    )
    lr_fs_pipeline.fit(X_train_50, y_train_50)
    held_out_score = lr_fs_pipeline.score(X_test_50, y_test_50)
    score_Doc2Vec.append(held_out_score)

In [None]:
# x-axis for the Doc2Vec sweep: the k values 5, 10, ..., 150.
# Uses the same range bounds as the loop that fills score_Doc2Vec, so the two
# cannot drift apart if the grid is edited (the previous stop of 152 happened
# to give the same values).
num_f_c2v = list(range(5, 153, 5))

In [None]:
# Held-out LogisticRegression score against the number of selected Doc2Vec-table columns.
fig, ax = plt.subplots(figsize=(7, 5))
sns.lineplot(x=num_f_c2v, y=score_Doc2Vec, ax=ax)
ax.set_title("Doc2Vec Number of features vs Score of LogisticRegression", size=13)
ax.set_xlabel("Number of features")
ax.set_ylabel("Score")
fig.savefig("report_pics/FeatureEngneering/num_f_vs_score_Doc2Vec.png")

In [None]:
# Mutual information between each column and the label depends only on the
# training split, not on k. Score it once here instead of letting every
# pipeline in the sweep recompute it over the whole sparse matrix.
# Per the scikit-learn docs, sparse input is treated as all-discrete by
# mutual_info_classif, so no random noise is involved and the selected
# columns (and therefore the scores) are the same as when it was recomputed per k.
mi_scores_csr = mutual_info_classif(X_train_csr, y_train_csr)


def _precomputed_mi_scores(X, y):
    """Score function for SelectKBest that returns the MI scores computed above.

    X and y are ignored: every pipeline in the sweep is fitted on the same
    X_train_csr / y_train_csr that mi_scores_csr was computed from.
    """
    return mi_scores_csr


# Sweep k = 1000, 2000, ... (below the total column count): keep the k best
# columns, fit LogisticRegression and record the held-out score.
score_cv = list()
for i in range(1000, X_train_csr.shape[1], 1000):
    lr_fs_pipeline = make_pipeline(SelectKBest(_precomputed_mi_scores, k=i), LogisticRegression())
    score_cv.append(lr_fs_pipeline.fit(X_train_csr, y_train_csr).score(X_test_csr, y_test_csr))

In [None]:
# x-axis for the sparse-feature sweep: 1000, 2000, ... below the total column count.
num_f_cv = list(range(1000, X_train_csr.shape[1], 1000))

In [None]:
# Held-out LogisticRegression score against the number of selected sparse columns.
fig, ax = plt.subplots(figsize=(7, 5))
sns.lineplot(x=num_f_cv, y=score_cv, ax=ax)
ax.set_title("CountVectorizer Number of features vs Score of LogisticRegression", size=13)
ax.set_xlabel("Number of features")
ax.set_ylabel("Score")
# Fixed y-range; scores outside 0.75-0.81 are not visible on this figure.
ax.set_ylim((0.75, 0.81))
fig.savefig("report_pics/FeatureEngneering/num_f_vs_score_CV.png")

In [None]:
# Show the raw held-out scores collected for each k in the sparse-feature sweep.
score_cv