In [None]:
from read_data import get_training, get_test, get_sparse
from processing import exclude_non_numeric
from utils import create_csv_output
import numpy as np
import pandas as pd
import scipy
import pickle
from scipy.sparse import csr_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, StratifiedShuffleSplit
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
# Seed NumPy's global random number generator once, up front.
RANDOM_SEED = 30027
np.random.seed(RANDOM_SEED)

# Give every seaborn/matplotlib figure a white background.
figure_style = {"figure.facecolor": "white"}
sns.set(rc=figure_style)

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

## Train data

In [None]:
# Load the training features and their targets.
train_X, train_y = get_training()

# Keep only the columns returned by exclude_non_numeric, apply log1p to them,
# and store the result as CSR so it can be stacked with the other matrices.
train_log_scaler = FunctionTransformer(np.log1p)
train_numeric_scaled = train_log_scaler.fit_transform(exclude_non_numeric(train_X))
train_X_numeric = csr_matrix(train_numeric_scaled)

# Presumably the vectorised name / ingredients / steps text fields — verify
# against read_data.get_sparse.
train_name, train_ingr, train_steps = get_sparse(data="train")

# Column-wise concatenation of every feature group into one CSR matrix.
train_feature_groups = (train_X_numeric, train_name, train_ingr, train_steps)
train_sparse = scipy.sparse.hstack(train_feature_groups, format="csr")

In [None]:
# Change target from (1.0, 2.0, 3.0) -> (0, 1, 2)
y = (train_y - 1).astype(int)

# Hold out 20% of the training data for evaluating the "partial" models.
# random_state is pinned (same value as the global NumPy seed) so the split is
# identical even when this cell is re-run on its own; without it the partition
# depends on how many random draws happened earlier in the kernel session, and
# the pickled partial models and confusion matrices stop matching the data.
X_train, X_test, y_train, y_test = train_test_split(
    train_sparse, y, test_size=0.2, random_state=30027
)

## Test data

In [None]:
# Load the test features.
test_X = get_test()

# Same numeric preprocessing as the training data: numeric columns only,
# log1p-transformed, stored as CSR.
test_log_scaler = FunctionTransformer(np.log1p)
test_numeric_scaled = test_log_scaler.fit_transform(exclude_non_numeric(test_X))
test_X_numeric = csr_matrix(test_numeric_scaled)

# Presumably the vectorised name / ingredients / steps text fields — verify
# against read_data.get_sparse.
test_name, test_ingr, test_steps = get_sparse(data="test")

# Stack the feature groups in the same order as the training matrix so the
# columns line up with what the models were fitted on.
test_feature_groups = (test_X_numeric, test_name, test_ingr, test_steps)
test_sparse = scipy.sparse.hstack(test_feature_groups, format="csr")

## Logistic Regression

### Partial model

In [None]:
%%time
# Fit logistic regression (default hyper-parameters) on the training split only,
# leaving X_test / y_test for evaluation.
partial_lr = LogisticRegression().fit(X_train, y_train)

# Persist the fitted model. The context manager guarantees the file is flushed
# and closed; passing a bare open(...) to pickle.dump leaves the handle open.
with open("report_models/partial/lr.sav", "wb") as model_file:
    pickle.dump(partial_lr, model_file)
# CPU times: user 9.97 s, sys: 21.5 s, total: 31.5 s
# Wall time: 7.27 s

In [None]:
# cross_val_score refits a fresh copy of the estimator on every split, so the
# data passed here is what the fold models are trained on. Cross-validate on the
# training split so the hold-out set (X_test, y_test) stays unseen and is only
# used for the final score / confusion matrix.
cross_val_score(
    partial_lr,
    X_train,
    y_train,
    cv=StratifiedShuffleSplit(n_splits=5, test_size=0.2),
)
# Scores recorded from an earlier run that cross-validated on (X_test, y_test):
# array([0.7775, 0.7725, 0.7725, 0.76  , 0.7825])

### Analysis

In [None]:
# Evaluate the partial logistic-regression model on the hold-out split.
lr_pred = partial_lr.predict(X_test)
lr_score = partial_lr.score(X_test, y_test)

# Confusion matrix normalised over the true labels.
lr_cm = confusion_matrix(y_true=y_test, y_pred=lr_pred, normalize='true')

In [None]:
# Draw the normalised confusion matrix as an annotated heatmap and save it.
lr_class_labels = [1.0, 2.0, 3.0]  # original target values, used as tick labels
lr_fig, lr_ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    lr_cm,
    annot=True,
    square=True,
    xticklabels=lr_class_labels,
    yticklabels=lr_class_labels,
    cmap="Blues_r",
    ax=lr_ax,
)
lr_ax.set_ylabel('Actual label')
# The hold-out accuracy is appended underneath the x-axis label.
lr_accuracy_caption = '\n\nAccuracy Score: {0}'.format(lr_score)
lr_ax.set_xlabel('Predicted label' + lr_accuracy_caption)
lr_ax.set_title('LogisticRegressionClassifier Confusion Matrix', size=12)
lr_fig.savefig("report_pics/LinearModels/lr_cm.png")

### Submission model

In [None]:
%%time
# Fit the submission model on the full training matrix (no hold-out).
lr = LogisticRegression().fit(train_sparse, y)

# Persist the fitted model. The context manager guarantees the file is flushed
# and closed; passing a bare open(...) to pickle.dump leaves the handle open.
with open("report_models/submission/lr.sav", "wb") as model_file:
    pickle.dump(lr, model_file)
# CPU times: user 12.9 s, sys: 27.9 s, total: 40.8 s
# Wall time: 8.05 s

In [None]:
# Predict on the test matrix, then add 1 and cast to float to undo the
# 0-based integer encoding applied to the training target.
lr_test_pred = lr.predict(test_sparse)
lr_submission_labels = (lr_test_pred + 1).astype(float)
create_csv_output("report_submission/lr", lr_submission_labels)

## SGDClassifier (Stochastic Gradient Descent)
SGD is usually a good fit for large datasets (TODO: confirm that this holds for this data).

### Partial model

In [None]:
%%time
# Fit a linear classifier trained with SGD (default hyper-parameters) on the
# training split only, leaving X_test / y_test for evaluation.
partial_SGD = SGDClassifier().fit(X_train, y_train)

# Persist the fitted model. The context manager guarantees the file is flushed
# and closed; passing a bare open(...) to pickle.dump leaves the handle open.
with open("report_models/partial/SGD.sav", "wb") as model_file:
    pickle.dump(partial_SGD, model_file)
# CPU times: user 2.74 s, sys: 609 ms, total: 3.35 s
# Wall time: 2.27 s

In [None]:
# cross_val_score refits a fresh copy of the estimator on every split, so the
# data passed here is what the fold models are trained on. Cross-validate on the
# training split so the hold-out set (X_test, y_test) stays unseen and is only
# used for the final score / confusion matrix.
cross_val_score(
    partial_SGD,
    X_train,
    y_train,
    cv=StratifiedShuffleSplit(n_splits=5, test_size=0.2),
)
# Scores recorded from an earlier run that cross-validated on (X_test, y_test):
# array([0.685   , 0.679375, 0.710625, 0.67875 , 0.695625])

### Analysis

In [None]:
# Evaluate the partial SGD model on the hold-out split.
SGD_pred = partial_SGD.predict(X_test)
SGD_score = partial_SGD.score(X_test, y_test)

# Confusion matrix normalised over the true labels.
SGD_cm = confusion_matrix(y_true=y_test, y_pred=SGD_pred, normalize='true')

In [None]:
# Draw the normalised confusion matrix as an annotated heatmap and save it.
SGD_class_labels = [1.0, 2.0, 3.0]  # original target values, used as tick labels
SGD_fig, SGD_ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    SGD_cm,
    annot=True,
    square=True,
    xticklabels=SGD_class_labels,
    yticklabels=SGD_class_labels,
    cmap="Blues_r",
    ax=SGD_ax,
)
SGD_ax.set_ylabel('Actual label')
# The hold-out accuracy is appended underneath the x-axis label.
SGD_accuracy_caption = '\n\nAccuracy Score: {0}'.format(SGD_score)
SGD_ax.set_xlabel('Predicted label' + SGD_accuracy_caption)
SGD_ax.set_title('SGDClassifier Confusion Matrix', size=13)
SGD_fig.savefig("report_pics/LinearModels/SGD_cm.png")

### Submission model

In [None]:
%%time
# Fit the submission model on the full training matrix (no hold-out).
SGD = SGDClassifier().fit(train_sparse, y)

# Persist the fitted model. The context manager guarantees the file is flushed
# and closed; passing a bare open(...) to pickle.dump leaves the handle open.
with open("report_models/submission/SGD.sav", "wb") as model_file:
    pickle.dump(SGD, model_file)
# NOTE(review): the timings below are identical to those recorded for the
# LogisticRegression submission cell and look copy-pasted — re-run to confirm.
# CPU times: user 12.9 s, sys: 27.9 s, total: 40.8 s
# Wall time: 8.05 s

In [None]:
# Predict on the test matrix, then add 1 and cast to float to undo the
# 0-based integer encoding applied to the training target.
SGD_test_pred = SGD.predict(test_sparse)
SGD_submission_labels = (SGD_test_pred + 1).astype(float)
create_csv_output("report_submission/SGD", SGD_submission_labels)