## Introduction

https://github.com/roger-yu-ds/assignment_2


### Summary

In [16]:
# artefact_prefix = 'pytorch_2'
# Column used as the prediction target; it is passed as `target_col` when the
# data is split and is placed first in the modelling subset.
target = "beer_style"

In [2]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell is executed, so edits to the project's `src` package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from dotenv import find_dotenv
from datetime import datetime
import pandas as pd
from pathlib import Path
import numpy as np

# from src.data.utility import DataReader, BeerData
from src.data.sets import split_sets_random
from src.data.sets import save_sets

In [5]:
from dotenv import find_dotenv, load_dotenv

# Anchor every path on the folder containing the file located by find_dotenv()
# (presumably the project's .env at the repository root — confirm).
project_dir = Path(find_dotenv()).parent

# Data folders, one per processing stage.
data_dir = project_dir.joinpath('data')
raw_data_dir, interim_data_dir, processed_data_dir = (
    data_dir.joinpath(stage) for stage in ('raw', 'interim', 'processed')
)

# Folder for generated reports.
report_dir = project_dir.joinpath('reports')

### Load Raw Data

In [6]:
# Read the raw reviews CSV into memory, then print a per-column summary
# (dtypes and non-null counts).
path = raw_data_dir.joinpath('beer_reviews.csv')
df = pd.read_csv(path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1586614 non-null  int64  
 1   brewery_name        1586599 non-null  object 
 2   review_time         1586614 non-null  int64  
 3   review_overall      1586614 non-null  float64
 4   review_aroma        1586614 non-null  float64
 5   review_appearance   1586614 non-null  float64
 6   review_profilename  1586266 non-null  object 
 7   beer_style          1586614 non-null  object 
 8   review_palate       1586614 non-null  float64
 9   review_taste        1586614 non-null  float64
 10  beer_name           1586614 non-null  object 
 11  beer_abv            1518829 non-null  float64
 12  beer_beerid         1586614 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [14]:
# Preview the first two rows of the raw frame to eyeball the column values.
df.head(2)
# print(df.shape) # (1586614, 13)

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213


In [10]:
# Count the distinct (non-null) brewery names in the raw data; this cardinality
# informs how the column can be encoded.
df['brewery_name'].nunique()

5742

In [17]:
# Work on a copy so the raw frame loaded from disk is never modified.
df_cleaned = df.copy()

# Target column first, followed by the feature columns kept for this notebook.
col_list = [target] + [
    'brewery_name',
    'review_aroma',
    'review_appearance',
    'review_palate',
    'review_taste'
]

# Select from the cleaned copy. Previously the subset was sliced straight from
# `df`, which left `df_cleaned` unused and would silently ignore any cleaning
# applied to it. The selected values are identical at this point.
# Note: `brewery_name` still contains a few missing values here (see the
# non-null counts reported by `.info()`); they are carried through unchanged.
df_subset = df_cleaned[col_list]

In [9]:
# df_subset.describe()

Unnamed: 0,review_aroma,review_appearance,review_palate,review_taste
count,1586614.0,1586614.0,1586614.0,1586614.0
mean,3.735636,3.841642,3.743701,3.79286
std,0.6976167,0.6160928,0.6822184,0.7319696
min,1.0,0.0,1.0,1.0
25%,3.5,3.5,3.5,3.5
50%,4.0,4.0,4.0,4.0
75%,4.0,4.0,4.0,4.5
max,5.0,5.0,5.0,5.0


In [18]:
# Confirm the subset's columns, dtypes and non-null counts after selection.
df_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 6 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   beer_style         1586614 non-null  object 
 1   brewery_name       1586599 non-null  object 
 2   review_aroma       1586614 non-null  float64
 3   review_appearance  1586614 non-null  float64
 4   review_palate      1586614 non-null  float64
 5   review_taste       1586614 non-null  float64
dtypes: float64(4), object(2)
memory usage: 72.6+ MB


### Save Subset Data

In [43]:
# Make sure the output folder exists first: `to_csv` does not create missing
# parent directories, so a fresh checkout without data/processed would fail.
processed_data_dir.mkdir(parents=True, exist_ok=True)

# Persist the modelling subset as CSV, without writing the index column.
path = processed_data_dir / 'subset'
df_subset.to_csv(path.with_suffix('.csv'), index=False)
# df_subset.to_parquet(path.with_suffix('.parquet'), index=False)

### Split Data

In [44]:
# Split the subset into train / validation / test features and targets using
# the project helper, with `target` as the label column.
# NOTE(review): the return order is taken from this unpacking — confirm it
# matches what split_sets_random actually returns.
# NOTE(review): no random seed is passed here; check whether the helper fixes
# one internally, otherwise the split will differ between runs.
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(
    df=df_subset,
    target_col=target,
    test_ratio=0.2,  # presumably the held-out fraction — confirm in src.data.sets
    to_numpy=False
)

In [45]:
# Hand all six split objects to the project helper, which is given the
# processed-data folder as its destination. Arguments are grouped per split;
# they are passed by keyword, so the grouping does not affect the call.
save_sets(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    path=processed_data_dir,
)

In [46]:
# `load_sets` was never imported in this notebook, so this cell raised a
# NameError on a fresh kernel (Restart & Run All).
# NOTE(review): assumed to live alongside save_sets in src.data.sets — confirm.
from src.data.sets import load_sets

# Reload the splits from disk.
# NOTE(review): this unpacking order (X_train, X_test, X_val, y_train, ...)
# differs from the order used when unpacking split_sets_random, and no path is
# passed even though save_sets was given processed_data_dir. Verify both
# against load_sets' signature and return order before relying on these names.
X_train, X_test, X_val, y_train, y_test, y_val = load_sets()

In [11]:
# Count the distinct (non-null) brewery names in the modelling subset.
df_subset['brewery_name'].nunique()

5742

### Preprocess data
The `brewery_name` feature has a very high cardinality (~5,700 unique values). One-hot encoding is not feasible because it would introduce ~5,700 very sparse columns. Another option is binary encoding, which would result in 14 new columns.
Standard scaling is used to ensure that the binary columns ([0, 1]) and the review columns ([1, 5]) are on the same scale.

In [2]:
# pipe = Pipeline([
#     ('bin_encoder', BinaryEncoder(cols=['brewery_name'])),
#     ('scaler', StandardScaler())
# ])

In [1]:
# X_train_trans = pipe.fit_transform(X_train)
# X_val_trans = pipe.transform(X_val)
# X_test_trans = pipe.transform(X_test)