In [1]:
%load_ext autoreload
%autoreload 2
import io, os, re, logging, sys

sys.path.append("../")

import requests, tqdm, tarfile, itertools, html, time
from dateparser.date import DateDataParser
from dateparser import parse
from fuzzywuzzy import process, fuzz

import numpy as np
import pandas as pd
import dask.dataframe as dd

import torch
import torch.nn.functional as F
import pyro
import pyro.distributions as dist

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis

import plotly.express as px
import plotly.graph_objects as go

from src.visualization.visualize import visualize_shot
from src.features.categorical import *
from src.transformations import *
from src.model.utils import *
from src.model.autoencoder import ShotsAutoEncForecast

from tsa.dataset import TimeSeriesDataset
from tsa.model import AutoEncForecast
from tsa.config import config
from tsa.train import train
from tsa.eval import evaluate 

import wandb
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning import loggers

# Import Dataset

In [2]:
shots_df = pd.read_parquet("../data/raw/visualizercoffee_73202shots_2021-12-30_cleaned.parquet")
shots_df.shape

(73202, 37)

## Data transformations

In [3]:
shots_df["yield_ratio"] = shots_df["drink_weight"] / shots_df["bean_weight"]

max_espresso_weight = shots_df["espresso_weight"].apply(lambda x: max(x) if x is not None else None).replace({0: np.NaN})
shots_df["drink_weight"].fillna(max_espresso_weight)

shots_df["max_pressure"] = shots_df["espresso_pressure"].apply(np.max)
shots_df["max_flow"] = shots_df["espresso_flow"].apply(np.max)
shots_df["median_temperature"] = shots_df["espresso_temperature_basket"].apply(np.median)
shots_df["mean_resistance"] = shots_df["espresso_resistance"].apply(lambda x: np.mean(x) if x is not None else None)

shots_df["roast_agtron"] = shots_df["roast_level"]
shots_df["roast_level"] = agtron_to_roast_level(shots_df["roast_level"])
shots_df["grinder_setting"] = shots_df.groupby('grinder_model')['grinder_setting'].apply(lambda x: (x-min(x))/(max(x)-min(x)))

shots_df.head()

Unnamed: 0,id,profile_title,user_id,drink_tds,drink_ey,espresso_enjoyment,bean_weight,drink_weight,grinder_model,grinder_setting,...,espresso_temperature_basket,duration,user_name,bean_age,yield_ratio,max_pressure,max_flow,median_temperature,mean_resistance,roast_agtron
0,14899484-a5f8-4ccc-b372-79700c8150ae,TurboBloom,8d827d10-510f-4475-bd54-63c93b275c16,,,80.0,15.0,35.2,Lagom P64,0.014737,...,"[86.0, 85.12, 85.27, 85.49, 85.38, 85.35, 85.3...",18.493,,,2.346667,5.95,8.27,84.38,0.411447,91.0
1,76f936ee-5147-40ac-986e-f0f1be5ac97e,Best overall pressure profile,10f1e281-8fd0-4f43-94f6-d416713dabe0,,,,18.0,36.1,Bentwood 63,0.121212,...,"[88.0, 87.18, 87.39, 87.37, 87.31, 87.23, 87.0...",29.475,,,2.005556,8.38,4.11,87.64,1.603333,
2,8853ae7c-5bf8-4194-93dd-462de63f1471,Default,30b3dd10-aebb-4f17-ba74-a1efcf2d51ea,,,,,40.3,,,...,"[90.5, 87.93, 88.21, 88.23, 88.16, 88.14, 88.1...",35.054,,,,8.9,4.09,88.69,4.545352,75.0
3,a4d7a358-fc3f-451c-92f5-756adaa4a7dd,Rao Allongé,c8c4793c-6708-4e81-90e8-6d8a5b30ceee,,,,,124.5,Niche Zero,0.0675,...,"[92.0, 90.0, 90.41, 90.36, 90.35, 90.44, 90.4,...",40.996,,,,8.76,4.84,91.75,0.359096,
4,3ebcfff9-fb6f-47f6-a3df-affe01d55150,Londonium,db110ce3-77b7-4c92-9d18-1019c3841cc9,,,,,,,,...,"[89.0, 87.55, 87.81, 87.75, 87.76, 87.8, 87.74...",38.787,,,,9.18,6.75,87.99,17.786497,


In [8]:
shots_df["grinder_model"].value_counts().head(20)

Niche Zero                  11059
Sette 270                    2389
Mahlkonig EK43               1930
Lagom P100                   1863
Bentwood 63                  1852
Lagom P64                    1709
Kafatek Monolith Flat        1534
Kafatek Monolith Max         1371
DF64                          850
Eureka Mignon Specialita      604
Weber EG-1                    432
Titus M3                      394
Baratza Forté                 392
Kinu M47 Simplicity           386
LeverCraft Ultra              306
Mazzer Mini                   252
Baratza Vario                 239
1Zpresso JX-PRO               208
Eureka Atom Specialty 75      196
Macap M4                      191
Name: grinder_model, dtype: int64

## Grindsize vs shot time

In [4]:
# Most popular grinder settings
shots_df.groupby(["grinder_model"])["grinder_setting"].value_counts().sort_values(ascending=False)[:20]

grinder_model   grinder_setting
Niche Zero      0.060000           812
                0.065000           806
                0.070000           785
                0.085000           689
                0.075000           678
                0.055000           604
                0.050000           520
                0.080000           508
                0.090000           405
Baratza Forté   0.000000           385
Niche Zero      0.095000           319
Bentwood 63     0.136364           315
Niche Zero      0.040000           297
                0.045000           286
Lagom P64       0.093684           257
Niche Zero      0.062500           245
                0.035000           237
                0.067500           216
                0.077500           208
Mahlkonig EK43  0.153333           207
Name: grinder_setting, dtype: int64

### Since shot time is dependent on the profile, we look at Grindsize vs shot time for the top 5 profiles independently

In [7]:
most_popular_profiles = shots_df["profile_title"].value_counts()
most_popular_profiles.head(20)

Default                          5158
Damian's LRv3                    5101
Londonium                        3806
Damian's LRv2                    3657
Gentle and sweet                 3209
Blooming Espresso                2706
Best overall pressure profile    2207
Best practice (medium roast)     2173
Adaptive (for medium roasts)     2099
Rao Allongé                      1975
Cremina lever machine            1741
Nu Skool Espresso v3             1251
DEK/Blooming Espresso             740
Gagné/Adaptive Shot 94C v1.0      706
Traditional lever machine         690
Best practice (light roast)       646
Tea portafilter/black tea         589
Hendon Turbo 6b Decline           583
Classic Italian espresso          555
TurboBloom                        549
Name: profile_title, dtype: int64

In [None]:
niche_shots = shots_df[shots_df["grinder_model"] == "Niche Zero"]

df = niche_shots[niche_shots["profile_title"].isin(most_popular_profiles.index[:5]) \
                 & niche_shots["roast_level"].notnull() \
                 # & niche_shots["yield_ratio"].notnull()
                ]

fig = px.scatter(df, 
                 x='grinder_setting', y='shot_time', 
                 facet_col='profile_title',
                 # facet_row='roast_level',
                 color="yield_ratio", 
                 # range_color=(0, 50),
                 trendline="ols",
                 title='Niche Zero: larger grind setting leads to shorter shot time',
                 height=500, width=1200,
                 )

fig
# .show(renderer="png")

# PCA of features

In [9]:
num_cols = shots_df.select_dtypes('number').columns
num_cols

Index(['drink_tds', 'drink_ey', 'espresso_enjoyment', 'bean_weight',
       'drink_weight', 'grinder_setting', 'duration', 'bean_age',
       'yield_ratio', 'max_pressure', 'max_flow', 'median_temperature',
       'mean_resistance', 'roast_agtron'],
      dtype='object')

In [10]:
shots_df['mean_resistance'][shots_df['mean_resistance'] > 80000] = np.NaN
shots_df[['max_flow', 'median_temperature','mean_resistance']].max(axis=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


max_flow                 14.930000
median_temperature      110.000000
mean_resistance       79147.457577
dtype: float64

In [11]:
X = shots_df.filter(num_cols.difference(['drink_tds', 'drink_ey', 'espresso_enjoyment', 'roast_agtron']), axis=1)

scaler = StandardScaler(with_mean=True, with_std=True)
X_scale = pd.DataFrame(scaler.fit_transform(X), 
                       index=X.index, columns=X.columns)

X_scale = X_scale.dropna(axis=0)
print(X_scale.shape)
X.isnull().sum()

(7271, 10)


bean_age              60474
bean_weight           32634
drink_weight          10203
duration                  0
grinder_setting       45970
max_flow                  0
max_pressure              0
mean_resistance       16207
median_temperature        0
yield_ratio           36032
dtype: int64

In [12]:
X_scale

Unnamed: 0,bean_age,bean_weight,drink_weight,duration,grinder_setting,max_flow,max_pressure,mean_resistance,median_temperature,yield_ratio
5,0.492323,0.159861,-0.107113,-0.707964,0.004954,0.563125,-0.910497,-0.091809,0.490382,-0.176163
16,-0.609085,0.159861,0.015271,-0.626582,0.085658,-0.314441,-0.873249,-0.091821,0.044209,-0.073898
19,0.991038,0.159861,-0.434928,-0.887172,-0.452885,1.009983,0.016036,-0.091868,-0.420414,-0.450087
47,-0.737543,0.159861,0.019642,-0.354775,5.250708,-1.315834,2.097243,-0.091667,0.305874,-0.070246
63,-0.189860,0.159861,-0.399961,-0.051123,0.126010,1.602205,0.588717,-0.089985,-0.029594,-0.420869
...,...,...,...,...,...,...,...,...,...,...
70340,-0.608536,-0.262938,-0.373736,0.017593,-0.116101,0.509286,-0.742883,-0.062729,0.463544,-0.323115
70345,-0.712467,0.159861,-0.338769,-0.139782,0.509354,0.676185,0.169682,-0.073054,0.362624,-0.369736
70384,-0.724066,0.159861,-0.330027,-0.175044,0.509354,0.966912,0.090531,-0.090436,0.234960,-0.362431
70387,-0.624533,-0.262938,-0.347511,-1.167503,-0.116101,0.520054,-0.519398,0.388758,0.988552,-0.299912


In [13]:
# df = X.corr()
# df.style.background_gradient("viridis", axis=None)

In [14]:
pca = PCA(n_components=8)
# fa = FactorAnalysis(n_components=8)
X_pca = pd.DataFrame(pca.fit_transform(X_scale), index=X_scale.index)

# px.line(np.cumsum(pca.explained_variance_ratio_))

In [None]:
fig = px.scatter_matrix(X_pca, 
                        dimensions=range(2),
                        # color=shots_df.loc[X_pca.index, "profile_title"].str.slice(0, 25),
                        color=shots_df.loc[X_pca.index, "roast_agtron"],
                       )

fig.update_traces(diagonal_visible=False)
fig.update_layout(height=1200, width=1200)
fig.show(renderer="png")

# Autoencoder for time series

In [3]:
target_cols = ['espresso_flow', 'espresso_weight', 'espresso_pressure',
                # 'espresso_resistance', 
               'espresso_flow_weight',
                'espresso_temperature_basket'
               ]

In [4]:
config["output_size"] = len(target_cols)
config["label_col"] = target_cols
config

{'device': device(type='cuda'),
 'categorical_cols': ['Time'],
 'label_col': ['espresso_flow',
  'espresso_weight',
  'espresso_pressure',
  'espresso_flow_weight',
  'espresso_temperature_basket'],
 'index_col': 'Date',
 'output_size': 5,
 'num_epochs': 100,
 'batch_size': 16,
 'lr': 1e-05,
 'reg1': True,
 'reg2': False,
 'reg_factor1': 0.0001,
 'reg_factor2': 0.0001,
 'seq_len': 10,
 'prediction_window': 1,
 'hidden_size_encoder': 128,
 'hidden_size_decoder': 128,
 'input_att': True,
 'temporal_att': True,
 'denoising': False,
 'directions': 1,
 'max_grad_norm': 0.1,
 'gradient_accumulation_steps': 1,
 'logging_steps': 100,
 'lrs_step_size': 5000,
 'output_dir': 'output',
 'save_steps': 5000,
 'eval_during_training': True}

## Build dataset

In [6]:
shots_training = shots_df[~shots_df[TIMESERIES_COLS].apply(lambda x: x.apply(lambda y: np.isnan(y).any() if y is not None else True)).any(axis=1)]
shots_training.shape

(56217, 37)

##  Test dataset

In [6]:
shots_series = extract_shot_series(shots_training.sample(100))
shots_series = resample_shot_series(shots_series)
shots_series.replace({np.inf: None}, inplace=True)
shots_series.shape

(7819, 14)

In [7]:
shots_series.isna().sum()

espresso_flow                    0
espresso_weight                  0
espresso_pressure                0
espresso_flow_goal               0
espresso_resistance            647
espresso_flow_weight             0
espresso_state_change            0
espresso_pressure_goal           0
espresso_flow_weight_raw         0
espresso_temperature_mix         0
espresso_water_dispensed         0
espresso_temperature_goal        0
espresso_resistance_weight       0
espresso_temperature_basket      0
dtype: int64

In [11]:
ts = TimeSeriesDataset(
    data=shots_series.drop(["espresso_resistance"], axis=1),
    categorical_cols=[],
    # categorical_cols=['grinder_model', 'roast_level'],
    # target_col=["espresso_enjoyment"],
    target_col=target_cols,
    seq_length=config["seq_len"],
    prediction_window=config["prediction_window"],
)

In [12]:
train_iter, test_iter, nb_features = ts.get_loaders(batch_size=config["batch_size"])
temp_iter = iter(train_iter)

nb_features

8

In [13]:
X, y_hist, target = next(temp_iter)
tensor_sizes({"X": X, "y": y_hist, "z": target})

{'X': [32, 64, 8], 'y': [32, 64, 5], 'z': [32, 5]}

## Build Model

In [1]:
config['batch_size'] = 2**9
config["seq_len"] = 64
config['weight_decay'] = 1e-3

model = ShotsAutoEncForecast(shots_training.sample(10000), config)
model.hparams

NameError: name 'config' is not defined

## Training

In [8]:
max_epochs = 1000

wandb_logger = WandbLogger(name='ShotsAutoEncForecast', 
                           tags=['Visualizer.Coffee'],
                           project="espresso-manifold",
                           anonymous="allow")
wandb_logger.log_hyperparams(config)

trainer = Trainer(
    gpus=1,
    auto_lr_find=False,
    max_epochs=max_epochs, 
    callbacks=[EarlyStopping(monitor='val_loss', patience=5)],
    logger=wandb_logger,
    weights_summary='top',
    precision=16
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33manony-mouse-76758[0m (use `wandb login --relogin` to force relogin)
2022-01-02 16:37:53.694173: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64
2022-01-02 16:37:53.694195: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [10]:
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type        | Params
------------------------------------------
0 | encoder   | AttnEncoder | 71.5 K
1 | decoder   | AttnDecoder | 120 K 
2 | criterion | MSELoss     | 0     
------------------------------------------
191 K     Trainable params
0         Non-trainable params
191 K     Total params
0.384     Total estimated model params size (MB)


Epoch 0: 100%|████| 6002/6002 [3:27:05<00:00,  2.07s/it, loss=0.974, v_num=avcy]
Epoch 0:  12%|▉       | 713/6002 [02:08<15:49,  5.57it/s, loss=1.18, v_num=avcy]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
Process wandb_internal:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.7/site-packages/wandb/sdk/internal/internal.py", line 152, in wandb_internal
    thread.join()
  File "/opt/anaconda3/lib/python3.7/threading.py", line 1044, in join
    self._wait_for_tstate_lock()
  File "/opt/anaconda3/lib/python3.7/threading.py", line 1060, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
KeyboardInterrupt
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main
    exitcode = _main(fd)
  File "/opt/anaconda3/lib/python3.7/multiprocessing/spawn.py"

## Evaluate

In [None]:
trainer.test(model)

Epoch 0: 100%|██████| 6002/6002 [15:09<00:00,  6.60it/s, loss=0.974, v_num=avcy]