In [1]:
artefact_prefix = '2_pytorch'
target = 'beer_style'

In [2]:
from dotenv import find_dotenv
from datetime import datetime
import pandas as pd
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from category_encoders.binary import BinaryEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from joblib import dump, load

# from src.data.sets import merge_categories
# from src.data.sets import save_sets
from src.data.sets import load_sets 
# from src.data.sets import split_sets_random
# from src.data.sets import test_class_exclusion
# from src.models.performance import convert_cr_to_dataframe

from src.models.pytorch import PytorchClassification_2
from src.models.pytorch import get_device
from src.models.pytorch import train_classification
from src.models.pytorch import test_classification
from src.models.pytorch import PytorchDataset
from src.models.pipes import create_preprocessing_pipe
from src.visualization.visualize import plot_confusion_matrix

### Directory Set up

In [3]:
project_dir = Path(find_dotenv()).parent
data_dir = project_dir / 'data'
raw_data_dir = data_dir / 'raw'
interim_data_dir = data_dir / 'interim'
processed_data_dir = data_dir / 'processed'
reports_dir = project_dir / 'reports'
models_dir = project_dir / 'models'

In [4]:
processed_data_dir

PosixPath('/home/jovyan/work/data/processed')

### Load Save Data

In [4]:
## Panda Data type
from src.data.sets import load_sets 

X_train, X_test, X_val, y_train, y_test, y_val = load_sets()

In [5]:
y_train['beer_style'].nunique()

104

In [6]:
X_train.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste
0,"Kirin Brewery Company, Limited",1.5,3.0,3.0,3.5
1,Huisbrouwerij Klein Duimpje,3.0,4.0,3.5,3.5
2,Southampton Publick House,3.0,3.5,4.0,3.5
3,Rock Bottom Restaurant & Brewery,3.5,4.0,2.5,3.5
4,Boston Beer Company (Samuel Adams),4.0,3.5,3.5,3.5


### Data Pipeline

In [7]:
pipe = Pipeline([
    ('bin_encoder', BinaryEncoder(cols=['brewery_name'])),
    ('scaler', StandardScaler())
])

In [8]:
X_train_trans = pipe.fit_transform(X_train)
X_val_trans = pipe.transform(X_val)
X_test_trans = pipe.transform(X_test)

In [40]:
X_train_trans.shape

(951968, 18)

In [9]:
n_features = X_train_trans.shape[1]
n_features

18

In [10]:
n_classes = y_train['beer_style'].nunique()
n_classes

104

### Encoding - Label 

In [11]:
le = LabelEncoder()
y_train_trans = le.fit_transform(y_train)
y_val_trans = le.fit_transform(y_val)
y_test_trans = le.transform(y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [152]:
y_test_trans

array([98, 89,  2, ..., 37, 94, 98])

### Convert to Pytorch Tensor

In [12]:
device = get_device()
device

device(type='cpu')

In [13]:
train_dataset = PytorchDataset(X=X_train_trans, y=y_train_trans)
val_dataset = PytorchDataset(X=X_val_trans, y=y_val_trans)
test_dataset = PytorchDataset(X=X_test_trans, y=y_test_trans)

### Classification Model

In [14]:
model = PytorchClassification_2(n_features=n_features, n_classes=n_classes)
model.to(device)

PytorchClassification_2(
  (layer_1): Linear(in_features=18, out_features=512, bias=True)
  (batchnorm1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer_2): Linear(in_features=512, out_features=128, bias=True)
  (batchnorm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer_3): Linear(in_features=128, out_features=64, bias=True)
  (batchnorm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer_out): Linear(in_features=64, out_features=104, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
)

In [15]:
criterion = nn.CrossEntropyLoss()

In [16]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

## Train the Model

In [17]:
N_EPOCHS = 20
BATCH_SIZE = 512
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

In [18]:
start_time = datetime.now()
print(f'Started: {start_time}')
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion, 
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device,
                                                 scheduler=scheduler)
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion, 
                                                batch_size=BATCH_SIZE, 
                                                device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

end_time = datetime.now()
runtime = end_time - start_time
print(f'Ended: {end_time}')
print(f'Runtime: {runtime}')

Started: 2021-07-13 07:52:17.349693
Epoch: 0
	(train)	Loss: 0.0064	|	Acc: 19.4%
	(valid)	Loss: 0.0055	|	Acc: 26.1%
Epoch: 1
	(train)	Loss: 0.0057	|	Acc: 23.7%
	(valid)	Loss: 0.0053	|	Acc: 27.2%
Epoch: 2
	(train)	Loss: 0.0056	|	Acc: 24.6%
	(valid)	Loss: 0.0051	|	Acc: 27.8%
Epoch: 3
	(train)	Loss: 0.0055	|	Acc: 25.1%
	(valid)	Loss: 0.0051	|	Acc: 28.1%
Epoch: 4
	(train)	Loss: 0.0054	|	Acc: 25.4%
	(valid)	Loss: 0.0050	|	Acc: 28.3%
Epoch: 5
	(train)	Loss: 0.0054	|	Acc: 25.7%
	(valid)	Loss: 0.0050	|	Acc: 28.4%
Epoch: 6
	(train)	Loss: 0.0053	|	Acc: 25.8%
	(valid)	Loss: 0.0050	|	Acc: 28.6%
Epoch: 7
	(train)	Loss: 0.0053	|	Acc: 26.1%
	(valid)	Loss: 0.0049	|	Acc: 28.8%
Epoch: 8
	(train)	Loss: 0.0053	|	Acc: 26.2%
	(valid)	Loss: 0.0049	|	Acc: 28.9%
Epoch: 9
	(train)	Loss: 0.0053	|	Acc: 26.3%
	(valid)	Loss: 0.0049	|	Acc: 29.1%
Epoch: 10
	(train)	Loss: 0.0052	|	Acc: 26.4%
	(valid)	Loss: 0.0049	|	Acc: 29.1%
Epoch: 11
	(train)	Loss: 0.0052	|	Acc: 26.5%
	(valid)	Loss: 0.0049	|	Acc: 29.2%
Epoch: 12
	(tr

### Retrain the model with lesser EPOCH

In [None]:
N_EPOCHS = 20
BATCH_SIZE = 4096
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

In [None]:
start_time = datetime.now()
print(f'Started: {start_time}')
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device,
                                                 scheduler=scheduler)
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

end_time = datetime.now()
runtime = end_time - start_time
print(f'Ended: {end_time}')
print(f'Runtime: {runtime}')

### Prediction

In [42]:
model.to('cpu')
preds = model(test_dataset.X_tensor).argmax(1)
preds
model.to(device)

PytorchClassification_2(
  (layer_1): Linear(in_features=18, out_features=256, bias=True)
  (batchnorm1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer_2): Linear(in_features=256, out_features=128, bias=True)
  (batchnorm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer_out): Linear(in_features=128, out_features=104, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
)

## Evaluation

### Classification Report

In [44]:
report = classification_report(y_test, le.inverse_transform(preds.cpu()))
print(report)

                                     precision    recall  f1-score   support

                            Altbier       0.34      0.35      0.34      1521
             American Adjunct Lager       0.54      0.73      0.62      6085
           American Amber / Red Ale       0.18      0.23      0.21      9288
         American Amber / Red Lager       0.31      0.33      0.32      1887
                American Barleywine       0.22      0.04      0.07      5390
                 American Black Ale       0.41      0.05      0.09      2394
                American Blonde Ale       0.19      0.04      0.07      2594
                 American Brown Ale       0.25      0.10      0.14      5066
            American Dark Wheat Ale       0.00      0.00      0.00       296
     American Double / Imperial IPA       0.26      0.36      0.30     17159
 American Double / Imperial Pilsner       0.19      0.01      0.03      1109
   American Double / Imperial Stout       0.36      0.47      0.40     1018

## Save Objects for Production

### Save model

In [45]:
path = models_dir / f'{artefact_prefix}_model'
torch.save(model, path.with_suffix('.torch'))

### Save Pipe Object

In [46]:
X = pd.concat([X_train, X_val, X_test])
prod_pipe = create_preprocessing_pipe(X)

path = models_dir / f'{artefact_prefix}_pipe'
dump(prod_pipe, path.with_suffix('.sav'))

['/home/jovyan/work/models/1_pytorch_pipe.sav']

### Save the label encoder

This is required to retrive the name of the beer_style.

In [47]:
path = models_dir / f'{artefact_prefix}_label_encoder'
dump(le, path.with_suffix('.sav'))

['/home/jovyan/work/models/1_pytorch_label_encoder.sav']