In [None]:
from read_data import get_training, get_test, get_sparse, get_Doc2Vec
from processing import exclude_non_numeric, combine_with_vec
from utils import create_csv_output
import numpy as np
import pandas as pd
import scipy
import pickle
from scipy.sparse import csr_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, StratifiedShuffleSplit
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
# Seed NumPy's global RNG once so the random operations later in the notebook
# are repeatable from a fresh kernel.
SEED = 30027
np.random.seed(SEED)

# Render figures on a white background.
figure_style = {"figure.facecolor": "white"}
sns.set(rc=figure_style)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

In [None]:
# Raw training features and labels.
train_X, train_y = get_training()

# log1p-transform the numeric columns and store the result as a CSR matrix.
log1p_transformer = FunctionTransformer(np.log1p)
numeric_logged = log1p_transformer.fit_transform(exclude_non_numeric(train_X))
train_X_numeric = csr_matrix(numeric_logged)

# Pre-computed sparse matrices for the training set
# (presumably text features for name / ingredients / steps -- confirm in read_data).
train_name, train_ingr, train_steps = get_sparse(data="train")

# Stack every feature group column-wise into one CSR design matrix.
feature_blocks = (train_X_numeric, train_name, train_ingr, train_steps)
train_sparse = scipy.sparse.hstack(feature_blocks, format="csr")

In [None]:
# Shift the target from (1.0, 2.0, 3.0) to zero-based ints (0, 1, 2).
y = (train_y - 1).astype(int)

# Hold out 20% of the rows. No random_state is passed, so the split is driven
# by the global NumPy seed set at the top of the notebook.
split_parts = train_test_split(train_sparse, y, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

RandomForest and LightGBM

In [None]:
# Load the previously fitted LightGBM model from disk.
# NOTE: pickle.load can execute arbitrary code -- only load model files you trust.
# The context manager closes the file handle, which the bare open() call leaked.
with open("report_models/partial/LGBM.sav", "rb") as model_file:
    partial_LGBM = pickle.load(model_file)

In [None]:
# Predict zero-based class labels for the test split with the loaded LightGBM model.
LGBM_pred = partial_LGBM.predict(X_test)

In [None]:
# Text report for the LightGBM predictions; the zero-based classes are shown
# under their original label names.
label_names = ["1.0", "2.0", "3.0"]
lgbm_report = classification_report(y_test, LGBM_pred, target_names=label_names)
print(lgbm_report)

In [None]:
# Load the previously fitted random-forest model and predict on the test split.
# NOTE: pickle.load can execute arbitrary code -- only load model files you trust.
# The context manager closes the file handle, which the bare open() call leaked.
with open("report_models/partial/rf.sav", "rb") as model_file:
    partial_RF = pickle.load(model_file)

RF_pred = partial_RF.predict(X_test)

In [None]:
# Text report for the random-forest predictions; the zero-based classes are
# shown under their original label names.
label_names = ["1.0", "2.0", "3.0"]
rf_report = classification_report(y_test, RF_pred, target_names=label_names)
print(rf_report)

In [None]:
# Load the previously fitted RBF-kernel SVC model from disk.
# NOTE: pickle.load can execute arbitrary code -- only load model files you trust.
# The context manager closes the file handle, which the bare open() call leaked.
with open("report_models/partial/SVC_RBF.sav", "rb") as model_file:
    partial_SVM = pickle.load(model_file)

In [None]:
# Predict zero-based class labels for the test split with the loaded SVC model.
SVM_Pred = partial_SVM.predict(X_test)

In [None]:
# Text report for the SVC predictions; the zero-based classes are shown under
# their original label names.
label_names = ["1.0", "2.0", "3.0"]
svm_report = classification_report(y_test, SVM_Pred, target_names=label_names)
print(svm_report)

In [None]:
# Load the previously fitted MLP model from disk.
# NOTE: pickle.load can execute arbitrary code -- only load model files you trust.
# The context manager closes the file handle, which the bare open() call leaked.
with open("report_models/partial/MLP.sav", "rb") as model_file:
    partial_NN = pickle.load(model_file)

In [None]:
# Predict zero-based class labels for the test split with the loaded MLP model.
NN_pred = partial_NN.predict(X_test)

In [None]:
# Text report for the MLP predictions; the zero-based classes are shown under
# their original label names.
label_names = ["1.0", "2.0", "3.0"]
nn_report = classification_report(y_test, NN_pred, target_names=label_names)
print(nn_report)

In [None]:
# `X` is never defined in this notebook, so the original cell raised NameError
# on a fresh kernel. Preview the raw training frame that the following cells
# explore instead.
# NOTE(review): confirm this is the object the cell was meant to display.
train_X.head()

In [None]:
# Raw training features side by side with the labels, shifted back from the
# zero-based encoding to the original 1/2/3 scale.
original_scale_labels = y + 1
data = pd.concat([train_X, original_scale_labels], axis="columns")

In [None]:
# Position of the first "slow" inside each name; str.find yields -1 when the
# substring is absent.
slow_position = data["name"].str.find("slow")
data["Indexes"] = slow_position

In [None]:
# Show the duration_label values of the rows labelled 3.
is_label_three = data["duration_label"] == 3
data.loc[is_label_three, "duration_label"]

In [None]:
# Rows whose name contains "slow" (the find() result is anything but -1).
has_slow = data["Indexes"].ne(-1)
data.loc[has_slow]

In [None]:
# Reuse the row split already made for the sparse features instead of drawing
# a second random split. A fresh train_test_split call consumes new state from
# the global NumPy RNG, so its test rows would not correspond to X_test, and
# the model predictions attached to test_data_all further down would be paired
# with the wrong rows.
# train_test_split keeps the pandas index of `y`, so y_train.index and
# y_test.index identify the rows (and their order) behind X_train and X_test.
# NOTE(review): assumes train_X and train_y share one unique index -- confirm.
tr_X, t_X = train_X.loc[y_train.index], train_X.loc[y_test.index]
tr_y, t_y = train_y.loc[y_train.index], train_y.loc[y_test.index]

In [None]:
# Training-split features and their labels in one frame.
train_parts = [tr_X, tr_y]
train_data_all = pd.concat(train_parts, axis="columns")

In [None]:
# Position of the first "cooker" inside each name (-1 when absent). The values
# are computed on the full `data` frame and matched to these rows by index
# when assigned.
cooker_position = data["name"].str.find("cooker")
train_data_all["Index"] = cooker_position

In [None]:
# Training rows flagged as containing "cooker" (Index is anything but -1).
has_cooker = train_data_all["Index"].ne(-1)
train_data_all.loc[has_cooker]

In [None]:
# Label distribution of the training rows flagged as containing "cooker".
cooker_train_labels = train_data_all.loc[
    train_data_all["Index"] != -1, "duration_label"
]

# Draw on an explicit figure so the plot never lands on axes left over from
# another cell.
fig, ax = plt.subplots()
# Pass the labels through the `x` keyword: recent seaborn releases treat the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=cooker_train_labels, ax=ax)
# The "Index" flag is computed from the "name" column, so the title says
# "name" (the original title claimed "steps").
ax.set_title("train data that contains word \"cooker\" in the name")
fig.savefig("report_pics/error_analysis/cooker_train.png")

In [None]:
# Test-split features and their labels in one frame.
test_parts = [t_X, t_y]
test_data_all = pd.concat(test_parts, axis="columns")

In [None]:
# Position of the first "cooker" inside each name (-1 when absent). The values
# are computed on the full `data` frame and matched to these rows by index
# when assigned.
cooker_position = data["name"].str.find("cooker")
test_data_all["Index"] = cooker_position

In [None]:
# Label distribution of the test rows flagged as containing "cooker".
cooker_test_labels = test_data_all.loc[
    test_data_all["Index"] != -1, "duration_label"
]

# Draw on an explicit figure so the plot never lands on axes left over from
# another cell.
fig, ax = plt.subplots()
# Pass the labels through the `x` keyword: recent seaborn releases treat the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=cooker_test_labels, ax=ax)
# The "Index" flag is computed from the "name" column, so the title says
# "name" (the original title claimed "steps").
ax.set_title("test data that contains word \"cooker\" in the name")
fig.savefig("report_pics/error_analysis/cooker_test.png")

In [None]:
# Map the zero-based LightGBM predictions back onto the 1.0/2.0/3.0 label
# scale. The array is assigned by position, so the rows of test_data_all must
# be in the same order as the rows of X_test.
lgbm_labels = LGBM_pred.astype(float) + 1
test_data_all["prediction"] = lgbm_labels

In [None]:
# Per-column non-null counts for the test rows that are flagged as containing
# "cooker" and whose true label is 3.0.
# The two conditions are combined into one mask; chaining df[m1][m2] makes
# pandas reindex the second mask and emit a "Boolean Series key will be
# reindexed" warning.
is_cooker = test_data_all["Index"] != -1
is_label_three = test_data_all["duration_label"] == 3.0
test_data_all[is_cooker & is_label_three].count()

In [None]:
# Test rows flagged as containing "cooker" whose true label is 3.0 but whose
# prediction differs from that label.
# The conditions are combined into one mask; chaining df[m1][m2][m3] makes
# pandas reindex the later masks and emit a "Boolean Series key will be
# reindexed" warning.
is_cooker = test_data_all["Index"] != -1
is_label_three = test_data_all["duration_label"] == 3.0
is_wrong = test_data_all["duration_label"] != test_data_all["prediction"]
test_data_all[is_cooker & is_label_three & is_wrong]

In [None]:
# Test rows flagged as containing "cooker" whose prediction matches the true
# label.
# The conditions are combined into one mask; chaining df[m1][m2] makes pandas
# reindex the second mask and emit a "Boolean Series key will be reindexed"
# warning.
is_cooker = test_data_all["Index"] != -1
is_correct = test_data_all["duration_label"] == test_data_all["prediction"]
test_data_all[is_cooker & is_correct]