# 1. Set up Environment

In [1]:
# Show the kernel's current working directory.
%pwd

'/home/jovyan/work/notebooks'

In [2]:
# Move to /home/jovyan/work; later cells use paths relative to this directory
# (e.g. data_files/raw/beer_reviews.csv).
# NOTE(review): this absolute path is specific to the jovyan container image — adjust when running elsewhere.
%cd '/home/jovyan/work'

/home/jovyan/work


In [3]:
import os
# Delete leftovers from a previous run (the downloaded archive and the extracted CSV)
# before they are fetched and unzipped again later in the notebook.
for stale_path in ("adsi-at2.zip", "data_files/raw/beer_reviews.csv"):
    if os.path.exists(stale_path):
        os.remove(stale_path)

In [4]:
# Load IPython's autoreload extension; mode 2 re-imports all modules before each cell runs,
# so edits to local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
import os
import torch.nn as nn
import torch
import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np

In [6]:
# Seed PyTorch's random number generator for repeatable runs, and raise the
# pandas display limit so up to 10000 rows are printed without truncation.
torch.manual_seed(2)
pd.set_option("display.max_rows", 10000)

In [7]:
# Download data from Kaggle API, unzip and place in data directory
os.environ['KAGGLE_USERNAME'] = "kallikrates"
os.environ['KAGGLE_KEY'] = "238b7c2704c0169326ee26d23a1d1d7c"
!kaggle datasets download -d kallikrates/adsi-at2
!unzip -q adsi-at2.zip -d /home/jovyan/work/data_files/raw

Downloading adsi-at2.zip to /home/jovyan/work
 98%|█████████████████████████████████████▍| 27.0M/27.4M [00:05<00:00, 5.85MB/s]
100%|██████████████████████████████████████| 27.4M/27.4M [00:05<00:00, 5.61MB/s]


# 2. Load and Explore Data

In [8]:
# Read the extracted beer reviews CSV (path is relative to the working directory) into a DataFrame.
df = pd.read_csv('data_files/raw/beer_reviews.csv')

In [9]:
# Preview the first five rows of the raw data.
df.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [10]:
# (rows, columns) of the raw frame.
df.shape

(1586614, 13)

In [11]:
# Per-column dtype and non-null count, plus memory usage — used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
brewery_id            1586614 non-null int64
brewery_name          1586599 non-null object
review_time           1586614 non-null int64
review_overall        1586614 non-null float64
review_aroma          1586614 non-null float64
review_appearance     1586614 non-null float64
review_profilename    1586266 non-null object
beer_style            1586614 non-null object
review_palate         1586614 non-null float64
review_taste          1586614 non-null float64
beer_name             1586614 non-null object
beer_abv              1518829 non-null float64
beer_beerid           1586614 non-null int64
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [12]:
# Preview the first five rows again before cleaning starts; df has not been modified since the earlier preview.
df.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


# 3. Prepare Data

In [13]:
# Work on a copy so the raw frame `df` is left untouched by the cleaning steps.
df_cleaned = df.copy()

### Drop unused variables

In [14]:
# Keep only the brewery name, beer style and review-score columns; the rest are discarded.
unused_columns = ['brewery_id', 'review_time', 'review_profilename', 'beer_beerid', 'beer_name', 'beer_abv']
df_cleaned = df_cleaned.drop(columns=unused_columns)

### Create Categorical Variable Dictionary

In [15]:
# Collect the distinct values of each categorical column; these become the encoder's category lists.
# Nulls are excluded up front: rows with a missing brewery_name are dropped before encoding,
# so keeping NaN here would register a phantom category that never occurs in the data
# and shift the ordinal code of every brewery listed after it.
arr_brewery_name = df_cleaned.brewery_name.dropna().unique()
arr_beer_style = df_cleaned.beer_style.dropna().unique()

In [16]:
# Convert the unique-value arrays into plain Python lists.
lst_brewery_name = arr_brewery_name.tolist()
lst_beer_style = arr_beer_style.tolist()

In [17]:
# Map each categorical column to its category list. Each list is wrapped in an outer list
# because the value is passed as OrdinalEncoder's `categories` argument, which expects
# one list of categories per encoded column.
cats_dict = dict(
    brewery_name=[lst_brewery_name],
    beer_style=[lst_beer_style],
)

### Quantify NULL Values

In [18]:
# Count missing values per remaining column.
df_cleaned.isna().sum()

brewery_name         15
review_overall        0
review_aroma          0
review_appearance     0
beer_style            0
review_palate         0
review_taste          0
dtype: int64

In [19]:
# Discard every row that has at least one missing value.
df_cleaned = df_cleaned.dropna(axis=0, how='any')

### Transform Categorical column values with encoder

In [20]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

In [21]:
# Replace each categorical column with its ordinal code, i.e. the position of the value
# in the category list prepared for that column in cats_dict.
for col, cats in cats_dict.items():
    col_encoder = OrdinalEncoder(categories=cats)
    col_encoder.fit(df_cleaned[[col]])
    df_cleaned[col] = col_encoder.transform(df_cleaned[[col]])

In [22]:
# Columns passed to the scaler: the ordinal-encoded brewery plus the five review scores.
num_cols = [
    'brewery_name',
    'review_overall',
    'review_aroma',
    'review_appearance',
    'review_palate',
    'review_taste',
]

In [23]:
# Name of the label column.
target_col = "beer_style"

In [24]:
# Scaler that standardizes each column (zero mean, unit variance by default); it is fitted on num_cols in the next cell.
sc = StandardScaler()

In [25]:
# Learn the per-column mean and standard deviation, then overwrite the feature columns with their scaled values.
sc.fit(df_cleaned[num_cols])
df_cleaned[num_cols] = sc.transform(df_cleaned[num_cols])

In [26]:
# The encoder emits float codes; store the beer_style codes as integers.
df_cleaned = df_cleaned.astype({'beer_style': int})

In [47]:
# Feature matrix: every cleaned column except the label. Leaving beer_style inside X
# would hand the model the value it is supposed to predict (target leakage).
# drop() returns a new frame, so X is no longer an alias of df_cleaned.
X = df_cleaned.drop(columns=[target_col])

In [48]:
# Label series: the integer-coded beer style.
Y = df_cleaned[target_col]

In [49]:
# Label array of shape (n_rows, 1), built straight from the frame so this cell can be
# re-run safely. The previous form called .to_frame() on Y, which fails on a second run
# because Y has by then already been converted to a NumPy array.
Y = df_cleaned[[target_col]].to_numpy()

### Split Data

# 4. Data Loader

# 5. Model

### Loss Function

### Optimiser

# 6. Define Architecture

In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# 7. Train Model