In [1]:
import pandas as pd
import numpy as np

import os
import sys
sys.path.append(os.path.abspath('../'))

from src.data.utility import DataReader, BeerData

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

%load_ext autoreload
%autoreload 2

In [2]:
# Instantiate the custom data reader class
data_reader = DataReader()

In [3]:
# Load Raw Train Data
train_df = data_reader.read_data(BeerData.RAW)

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1586614 non-null  int64  
 1   brewery_name        1586599 non-null  object 
 2   review_time         1586614 non-null  int64  
 3   review_overall      1586614 non-null  float64
 4   review_aroma        1586614 non-null  float64
 5   review_appearance   1586614 non-null  float64
 6   review_profilename  1586266 non-null  object 
 7   beer_style          1586614 non-null  object 
 8   review_palate       1586614 non-null  float64
 9   review_taste        1586614 non-null  float64
 10  beer_name           1586614 non-null  object 
 11  beer_abv            1518829 non-null  float64
 12  beer_beerid         1586614 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [6]:
df_cleaned = train_df.copy()
target_column = 'beer_style'
col_to_drop = ['brewery_id', 'review_profilename', 'review_time']
df_cleaned = df_cleaned.drop(col_to_drop, axis = 1)

df_cleaned.head()

Unnamed: 0,brewery_name,review_overall,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,Vecchio Birraio,1.5,2.0,2.5,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,Vecchio Birraio,3.0,2.5,3.0,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,Vecchio Birraio,3.0,2.5,3.0,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,Vecchio Birraio,3.0,3.0,3.5,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,Caldera Brewing Company,4.0,4.5,4.0,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [7]:
df_cleaned.shape

(1586614, 10)

In [8]:
from src.data.sets import split_sets_random, save_sets, load_sets

X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(df_cleaned, target_column, test_ratio=0.2, to_numpy=True)
save_sets(X_train, y_train, X_val, y_val, X_test, y_test)

In [7]:
X_train
#y_train
# print("X_train", X_train.shape)
# print("X_val", X_val.shape)
# print("X_test", X_test.shape)

array([[2.92000000e+02, 1.19239079e+09, 4.50000000e+00, ...,
        3.00000000e+00, 3.50000000e+00, 4.50000000e+00],
       [5.15000000e+02, 1.21331251e+09, 3.50000000e+00, ...,
        3.50000000e+00, 3.50000000e+00, 8.50000000e+00],
       [1.62800000e+03, 1.26259858e+09, 4.00000000e+00, ...,
        4.00000000e+00, 3.50000000e+00, 7.20000000e+00],
       ...,
       [1.25160000e+04, 1.20278449e+09, 4.50000000e+00, ...,
        4.00000000e+00, 4.50000000e+00, 9.70000000e+00],
       [1.02790000e+04, 1.27605418e+09, 4.50000000e+00, ...,
        3.00000000e+00, 4.00000000e+00, 7.00000000e+00],
       [1.33070000e+04, 1.26219631e+09, 3.50000000e+00, ...,
        4.00000000e+00, 3.50000000e+00, 8.00000000e+00]])

In [8]:
from src.data.sets import split_sets_by_time, save_sets, load_sets

#X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path="../data/processed/")
#data_reader.plot_correlation(train_df, ["brewery_id", "beer_beerid"])

#target_class = X_train['beer_beerid'].value_counts().reset_index()
# unique, counts = np.unique(y_train, return_counts=True)
# print (np.asarray((unique, counts)).T)
np.unique(y_train)

array([    3,     4,     5, ..., 77315, 77316, 77317])

In [24]:
np.unique(y_val)

array([    3,     4,     5, ..., 77303, 77312, 77314], dtype=int64)

In [25]:
np.unique(y_test)

array([    4,     5,     6, ..., 77293, 77294, 77310], dtype=int64)

### Load Pytorch Dataset

In [9]:
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

### Baseline Model

In [10]:
from src.models.null import NullModel

baseline_model = NullModel()
y_base = baseline_model.fit_predict(y_train)
y_base

array([[21693.15859252],
       [21693.15859252],
       [21693.15859252],
       ...,
       [21693.15859252],
       [21693.15859252],
       [21693.15859252]])

In [16]:
from src.models.null import NullModel

base_model = NullModel(target_type="classification")
y_base = base_model.fit_predict(y_train)
y_base


array([[2093],
       [2093],
       [2093],
       ...,
       [2093],
       [2093],
       [2093]], dtype=int64)

In [18]:
X_train.shape

(951968, 8)

In [11]:
from src.models.performance import print_reg_perf
print_reg_perf(y_base, y_train, set_name='Training')

RMSE Training: 475624750.9192993
MAE Training: 19130.400158942026


In [17]:
import torch.nn.functional as F

In [12]:
import torch
import torch.nn as nn
from src.models.pytorch import PytorchMultiClass

model = PytorchMultiClass(X_train.shape[1])

In [16]:
from src.models.pytorch import get_device

device = get_device()
model.to(device)

PytorchMultiClass(
  (layer_1): Linear(in_features=8, out_features=80, bias=True)
  (layer_2): Linear(in_features=80, out_features=100, bias=True)
  (layer_out): Linear(in_features=100, out_features=104, bias=True)
  (softmax): Softmax(dim=1)
)

In [14]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [19]:
from src.models.pytorch import train_classification, test_classification

N_EPOCHS = 10
BATCH_SIZE = 32

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset, model=model, criterion=criterion, optimizer=optimizer, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(val_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

IndexError: Target 1959 is out of bounds.