In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.base import clone
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, StratifiedShuffleSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer

from processing import exclude_non_numeric
from read_data import get_training, get_test, get_sparse
from utils import create_csv_output
# Use a white figure background for every plot produced in this notebook.
sns.set(rc={"figure.facecolor": "white"})
# Seed NumPy's global RNG. The scikit-learn splitters/estimators below are created
# without an explicit random_state and therefore fall back to this generator.
np.random.seed(30027)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

## Train data

In [None]:
# Load the labelled training data and assemble a single sparse design matrix.
train_X, train_y = get_training()

# Numeric columns only, log1p-transformed, then stored as CSR.
train_numeric_only = exclude_non_numeric(train_X)
train_numeric_log = FunctionTransformer(np.log1p).fit_transform(train_numeric_only)
train_X_numeric = csr_matrix(train_numeric_log)

# Pre-built feature blocks from get_sparse (presumably the name / ingredient / step
# text features -- confirm in read_data).
train_name, train_ingr, train_steps = get_sparse(data="train")

# Column-wise concatenation: numeric block first, then the three get_sparse blocks.
train_blocks = (train_X_numeric, train_name, train_ingr, train_steps)
train_sparse = scipy.sparse.hstack(train_blocks, format="csr")

In [None]:
# Shift the target so classes are zero-based: (1.0, 2.0, 3.0) -> (0, 1, 2).
y = (train_y - 1).astype(int)

# Hold out 20% of the labelled rows for evaluation; the other 80% train the
# "partial" models further down.
X_train, X_test, y_train, y_test = train_test_split(
    train_sparse,
    y,
    test_size=0.2,
)

## Test data

In [None]:
# Build the unlabelled test matrix with the same steps, in the same column order,
# as the training matrix.
test_X = get_test()

# Numeric columns only, log1p-transformed, then stored as CSR.
test_numeric_only = exclude_non_numeric(test_X)
test_numeric_log = FunctionTransformer(np.log1p).fit_transform(test_numeric_only)
test_X_numeric = csr_matrix(test_numeric_log)

# Pre-built feature blocks for the test split (semantics of each block are defined
# in read_data.get_sparse).
test_name, test_ingr, test_steps = get_sparse(data="test")

# Column-wise concatenation: numeric block first, then the three get_sparse blocks.
test_blocks = (test_X_numeric, test_name, test_ingr, test_steps)
test_sparse = scipy.sparse.hstack(test_blocks, format="csr")

## RandomForestClassifier

In [None]:
# Scaling + random forest with default hyper-parameters.
# with_mean=False: scale to unit variance without centring, since the inputs are
# sparse matrices and StandardScaler cannot centre sparse data.
rf_steps = (
    StandardScaler(with_mean=False),
    RandomForestClassifier(),
)
rf_pipeline = make_pipeline(*rf_steps)

### Partial Model

In [None]:
%%time
partial_rf = rf_pipeline.fit(X_train, y_train)
pickle.dump(partial_rf, open("report_models/partial/rf.sav", "wb"))
# CPU times: user 1min 32s, sys: 2.32 s, total: 1min 34s
# Wall time: 1min 37s

In [None]:
# Five stratified random 80/20 re-splits of the hold-out set.
# NOTE(review): cross_val_score clones and refits the pipeline on every split, so
# these scores come from models trained on subsets of X_test, not from the already
# fitted partial_rf. Confirm this is the intended evaluation.
rf_cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2)
cross_val_score(partial_rf, X_test, y_test, cv=rf_cv)
# Recorded output:
# array([0.76    , 0.7725  , 0.755   , 0.75875 , 0.769375])

### Analysis

In [None]:
# Evaluate the partial model on the hold-out split.
rf_pred = partial_rf.predict(X_test)
# normalize='true': every row (actual class) of the matrix sums to 1.
rf_cm = confusion_matrix(y_test, rf_pred, normalize='true')
# Pipeline score on the hold-out split; shown as the accuracy in the plot caption.
rf_score = partial_rf.score(X_test, y_test)

In [None]:
# Display the confusion matrix (normalised over the actual-label rows).
rf_cm

In [None]:
# Heatmap of the row-normalised confusion matrix for the random forest.
# Tick labels use the original class values rather than the zero-based targets.
class_labels = [1.0, 2.0, 3.0]
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    rf_cm,
    annot=True,
    square=True,
    xticklabels=class_labels,
    yticklabels=class_labels,
    cmap="Blues_r",
    ax=ax,
)
ax.set_ylabel('Actual label')
# The hold-out accuracy is appended to the x-axis label as a caption.
ax.set_xlabel('Predicted label' + '\n\nAccuracy Score: {0}'.format(rf_score))
ax.set_title('RandomForestClassifier Confusion Matrix', size=13)
fig.savefig("report_pics/RandomForest/rf_cm.png")

### Submission model

In [None]:
%%time
rf = rf_pipeline.fit(train_sparse, y)
pickle.dump(rf, open("report_models/submission/rf.sav", "wb"))
# CPU times: user 1min 46s, sys: 2.25 s, total: 1min 48s
# Wall time: 1min 51s

In [None]:
# Map predictions back to the original labelling, (0, 1, 2) -> (1.0, 2.0, 3.0),
# before writing the submission file.
rf_test_labels = (rf.predict(test_sparse) + 1).astype(float)
create_csv_output("report_submission/rf", rf_test_labels)

## LightGBM

In [None]:
# Scaling + LightGBM classifier with default hyper-parameters.
# with_mean=False: scale to unit variance without centring, since the inputs are
# sparse matrices and StandardScaler cannot centre sparse data.
lgbm_steps = (
    StandardScaler(with_mean=False),
    LGBMClassifier(),
)
LGBM_pipeline = make_pipeline(*lgbm_steps)

### Partial Model

In [None]:
%%time
partial_LGBM = LGBM_pipeline.fit(X_train, y_train)
pickle.dump(partial_LGBM, open("report_models/partial/LGBM.sav", "wb"))
# CPU times: user 1min 4s, sys: 5.96 s, total: 1min 10s
# Wall time: 10.1 s

In [None]:
# Five stratified random 80/20 re-splits of the hold-out set.
# NOTE(review): cross_val_score clones and refits the pipeline on every split, so
# these scores come from models trained on subsets of X_test, not from the already
# fitted partial_LGBM. Confirm this is the intended evaluation.
lgbm_cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2)
cross_val_score(partial_LGBM, X_test, y_test, cv=lgbm_cv)
# Recorded output:
# array([0.80625 , 0.79125 , 0.795625, 0.795   , 0.800625])

### Analysis

In [None]:
# Evaluate the partial model on the hold-out split.
LGBM_pred = partial_LGBM.predict(X_test)
# normalize='true': every row (actual class) of the matrix sums to 1.
LGBM_cm = confusion_matrix(y_test, LGBM_pred, normalize='true')
# Pipeline score on the hold-out split; shown as the accuracy in the plot caption.
LGBM_score = partial_LGBM.score(X_test, y_test)

In [None]:
# Heatmap of the row-normalised confusion matrix for the LightGBM model.
# Tick labels use the original class values rather than the zero-based targets.
class_labels = [1.0, 2.0, 3.0]
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    LGBM_cm,
    annot=True,
    square=True,
    xticklabels=class_labels,
    yticklabels=class_labels,
    cmap="Blues_r",
    ax=ax,
)
ax.set_ylabel('Actual label')
# The hold-out accuracy is appended to the x-axis label as a caption.
ax.set_xlabel('Predicted label' + '\n\nAccuracy Score: {0}'.format(LGBM_score))
# Title previously read "RandomForestClassifier ..." (copy-paste from the RF cell),
# which mislabelled the saved LightGBM figure.
ax.set_title('LGBMClassifier Confusion Matrix', size=13)
# NOTE(review): the figure is still written under report_pics/RandomForest/; the
# path is left unchanged because the report's directory layout is not visible here.
fig.savefig("report_pics/RandomForest/LGBM_cm.png")

### Submission model

In [None]:
%%time
LGBM = LGBM_pipeline.fit(train_sparse, y)
pickle.dump(LGBM, open("report_models/submission/LGBM.sav", "wb"))
# CPU times: user 1min 46s, sys: 2.25 s, total: 1min 48s
# Wall time: 1min 51s

In [None]:
# Map predictions back to the original labelling, (0, 1, 2) -> (1.0, 2.0, 3.0),
# before writing the submission file.
lgbm_test_labels = (LGBM.predict(test_sparse) + 1).astype(float)
create_csv_output("report_submission/LGBM", lgbm_test_labels)