# BEER TYPE PREDICTION 
## ONLY API PREDICTORS USED WITH PIPELINES
### Split datasets first --> then transform

# 1. Load Packages

In [1]:
# Load the packages needed for Logistic Regression Modelling upfront
import pandas as pd
import numpy as np

import category_encoders as ce

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from joblib import dump

In [2]:
import platform; print(platform.platform())
import sys; print("Python", sys.version)
import pandas; print("Pandas", pandas.__version__)
import numpy; print("NumPy", numpy.__version__)
import sklearn; print("Scikit-Learn", sklearn.__version__)

macOS-12.3-arm64-arm-64bit
Python 3.8.12 | packaged by conda-forge | (default, Jan 30 2022, 23:13:24) 
[Clang 11.1.0 ]
Pandas 1.4.1
NumPy 1.22.3
Scikit-Learn 1.0.2


# 2. Load the data

In [4]:
# Load the training data set
data = pd.read_csv('../data/raw/beer_reviews.csv')

#data = pd.read_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data_folder/raw/beer_reviews.csv')

# 3. Explore the Data

In [5]:
# check rows and columns of the training set
data.shape

(1586614, 13)

In [6]:
data.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1586614 non-null  int64  
 1   brewery_name        1586599 non-null  object 
 2   review_time         1586614 non-null  int64  
 3   review_overall      1586614 non-null  float64
 4   review_aroma        1586614 non-null  float64
 5   review_appearance   1586614 non-null  float64
 6   review_profilename  1586266 non-null  object 
 7   beer_style          1586614 non-null  object 
 8   review_palate       1586614 non-null  float64
 9   review_taste        1586614 non-null  float64
 10  beer_name           1586614 non-null  object 
 11  beer_abv            1518829 non-null  float64
 12  beer_beerid         1586614 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [8]:
data.columns

Index(['brewery_id', 'brewery_name', 'review_time', 'review_overall',
       'review_aroma', 'review_appearance', 'review_profilename', 'beer_style',
       'review_palate', 'review_taste', 'beer_name', 'beer_abv',
       'beer_beerid'],
      dtype='object')

In [9]:
data.describe()

Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_beerid
count,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1518829.0,1586614.0
mean,3130.099,1224089000.0,3.815581,3.735636,3.841642,3.743701,3.79286,7.042387,21712.79
std,5578.104,76544270.0,0.7206219,0.6976167,0.6160928,0.6822184,0.7319696,2.322526,21818.34
min,1.0,840672000.0,0.0,1.0,0.0,1.0,1.0,0.01,3.0
25%,143.0,1173224000.0,3.5,3.5,3.5,3.5,3.5,5.2,1717.0
50%,429.0,1239203000.0,4.0,4.0,4.0,4.0,4.0,6.5,13906.0
75%,2372.0,1288568000.0,4.5,4.0,4.0,4.0,4.5,8.5,39441.0
max,28003.0,1326285000.0,5.0,5.0,5.0,5.0,5.0,57.7,77317.0


In [10]:
# count the beer types in the data set
data['beer_style'].nunique()

104

In [11]:
# check the spread of observations
data['beer_style'].value_counts()

American IPA                        117586
American Double / Imperial IPA       85977
American Pale Ale (APA)              63469
Russian Imperial Stout               54129
American Double / Imperial Stout     50705
                                     ...  
Gose                                   686
Faro                                   609
Roggenbier                             466
Kvass                                  297
Happoshu                               241
Name: beer_style, Length: 104, dtype: int64

In [12]:
# check for missing/null values in the dataset
data.isnull().sum()

brewery_id                0
brewery_name             15
review_time               0
review_overall            0
review_aroma              0
review_appearance         0
review_profilename      348
beer_style                0
review_palate             0
review_taste              0
beer_name                 0
beer_abv              67785
beer_beerid               0
dtype: int64

# 4. Transform the Data

## 4.1 Create a copy of the raw dataset

In [13]:
# create a copy of dataset for transformations
df = data.copy()

#df.head()

## 4.2 Drop all rows with missing values

In [14]:
# check for missing values in the dataset
df.isnull().sum()

brewery_id                0
brewery_name             15
review_time               0
review_overall            0
review_aroma              0
review_appearance         0
review_profilename      348
beer_style                0
review_palate             0
review_taste              0
beer_name                 0
beer_abv              67785
beer_beerid               0
dtype: int64

In [15]:
# Drop rows with any empty cells
df = df.dropna(
    axis=0,
    how='any',
    inplace=False
)

In [16]:
df.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [17]:
df.shape
# dropped 68,136 rows

(1518478, 13)

In [18]:
data.shape


(1586614, 13)

In [19]:
# confirm it worked
df.isnull().sum()

brewery_id            0
brewery_name          0
review_time           0
review_overall        0
review_aroma          0
review_appearance     0
review_profilename    0
beer_style            0
review_palate         0
review_taste          0
beer_name             0
beer_abv              0
beer_beerid           0
dtype: int64

In [20]:
#df.to_csv('../data/interim/1_beer_reviews_NO-NULL.csv', index=False)

## 4.3 Drop the 2 Identifier columns and save in the interim folder

In [21]:
df.drop(columns=['brewery_id', 'beer_beerid'], axis = 1, inplace=True)

df.head()

Unnamed: 0,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv
0,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0
1,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2
2,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5
3,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0
4,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7


In [22]:
#df.to_csv('../data/interim/2_beer_reviews_noIDcolumns.csv', index=False)

## 4.4 Drop columns that have no prediction impact like reviewer details and 'beer_name'

In [23]:
drop_col = ['review_time', 'review_overall', 'review_profilename', 'beer_name']

df.drop(drop_col, axis = 1, inplace=True)

df.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,Vecchio Birraio,2.0,2.5,Hefeweizen,1.5,1.5,5.0
1,Vecchio Birraio,2.5,3.0,English Strong Ale,3.0,3.0,6.2
2,Vecchio Birraio,2.5,3.0,Foreign / Export Stout,3.0,3.0,6.5
3,Vecchio Birraio,3.0,3.5,German Pilsener,2.5,3.0,5.0
4,Caldera Brewing Company,4.5,4.0,American Double / Imperial IPA,4.0,4.5,7.7


In [24]:
#df.to_csv('../data/interim/3_beer_reviews_APIpredictorsONLY.csv', index=False)

## 4.5 Remove SPECIAL CHARACTERS From Name Column - OPTIONAL

In [25]:
# this should remove everything but keep whitespace
# https://stackoverflow.com/questions/42676162/remove-symbols-from-string-but-keep-whitespaces
# "[^a-zA-Z\s]+" allows to recognise and keep alphabets and whitespace, hence removing everything else. 
# Similarly "[^a-zA-Z0-9\s]+" would also keep digits and remove all other characters/symbols

df['brewery_name'] = df['brewery_name'].str.replace("[^a-zA-Z\s]+", '', regex=True)

In [26]:
#df.to_csv('../data/interim/4_beer_reviews_APIpredictors_NoChar.csv', index=False)

## 4.6 Split up the Datasets

In [27]:
df_cleaned = df.copy()

In [28]:
# extract the target variable out as y = target
target = pd.DataFrame(df_cleaned.pop('beer_style'))
target.shape

(1518478, 1)

In [29]:
target.head()

Unnamed: 0,beer_style
0,Hefeweizen
1,English Strong Ale
2,Foreign / Export Stout
3,German Pilsener
4,American Double / Imperial IPA


In [30]:
X_train, X_test, y_train, y_test = train_test_split(df_cleaned, target, test_size=0.3, random_state = 42)

In [31]:
X_train.shape

(1062934, 6)

In [32]:
X_test.shape

(455544, 6)

In [33]:
y_train.shape

(1062934, 1)

In [34]:
y_test.shape

(455544, 1)

In [35]:
X_train.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste,beer_abv
115503,HackerPschorr Bru GmbH,3.0,3.0,3.0,3.0,5.0
398865,Brasserie Caracole,4.0,4.5,4.5,4.0,8.0
564379,Berkshire Brewing Company Inc,3.0,4.0,3.5,4.0,4.2
1162176,Dogfish Head Brewery,3.5,4.0,4.0,3.5,5.2
1452677,Kulmbacher Brauerei AG,3.0,4.0,4.0,4.0,5.4


## 4.7 Save the processed data files

In [36]:
# save the processed datasets
#np.save('../data/processed/X_train', X_train)
#np.save('../data/processed/X_test',  X_test)
#np.save('../data/processed/y_train', y_train)
#np.save('../data/processed/y_test',  y_test)

In [37]:
# save csv files too
#X_train.to_csv('../data/processed/X_train.csv', index=False)
#X_test.to_csv('../data/processed/X_test.csv', index=False)

In [38]:
#y_train.to_csv('../data/processed/y_train.csv', index=False)
#y_test.to_csv('../data/processed/y_test.csv', index=False)

## 4.8 Build Pipelines for Standard Scaler and One Hot Encoding

### ONLY FOR TRAINING SETS

In [39]:
from sklearn.pipeline import Pipeline

In [40]:
# scale the predictors
num_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler())
    ]
)

In [41]:
# for predictor only
cat_transformer = Pipeline(
    steps=[
        ('one_hot_encoder', ce.BinaryEncoder())
    ]
)

In [42]:
# https://stackoverflow.com/questions/50473381/scikit-learns-labelbinarizer-vs-onehotencoder

# integer encode the target variable now
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_train

  y = column_or_1d(y, warn=True)


array([61, 26, 73, ..., 68,  4, 58])

In [64]:
# integer encode the target variable now
y_test = encoder.fit_transform(y_test)
y_test

  y = column_or_1d(y, warn=True)


array([14, 24,  1, ..., 14, 19, 47])

In [43]:
# Create a list of numeric and categorical columns

num_cols = ['review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv']
cat_cols = ['brewery_name']

In [44]:
from sklearn.compose import ColumnTransformer

In [45]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num_cols', num_transformer, num_cols),
        ('cat_cols', cat_transformer, cat_cols)
    ]
)

# 5. Train a Multinomial Logistic Regression Model

In [46]:
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('mlr', LogisticRegression(multi_class="multinomial", solver='lbfgs'))
    ]
)

In [47]:
model.fit(X_train, y_train)
# takes 5min to run

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num_cols',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['review_aroma',
                                                   'review_appearance',
                                                   'review_palate',
                                                   'review_taste',
                                                   'beer_abv']),
                                                 ('cat_cols',
                                                  Pipeline(steps=[('one_hot_encoder',
                                                                   BinaryEncoder())]),
                                                  ['brewery_name'])])),
                ('mlr', LogisticRegression(multi_class='multinomial'))])

In [51]:
#dump(model, '../models/mlr_scaled_6Pred_Binary-Label_Pipeline.joblib')

#dump(model, '/Users/jasle1/Desktop/MDSI/ADSI/AT_2/api/models/mlr_scaled_6Pred_Binary-Label_Pipeline.joblib')

# 6. Evaluate Model Performance

In [48]:
test_pred = model.predict(X_test)
test_pred

array([12, 12, 65, ..., 12,  9, 14])

In [49]:
accuracy_score(y_train, model.predict(X_train))

0.1901801993350481

In [50]:
obs = pd.DataFrame(X_train.iloc[0]).transpose()
model.predict(obs)

array([60])

# 7. Revert to Beer_Type Labels

In [54]:
labels = encoder.inverse_transform(test_pred)
labels

array(['American IPA', 'American IPA', 'Hefeweizen', ..., 'American IPA',
       'American Double / Imperial IPA', 'American Pale Ale (APA)'],
      dtype=object)

In [66]:
test_pred = pd.DataFrame(test_pred)
test_pred = test_pred.rename(columns={0: 'Predictions'})
test_pred

Unnamed: 0,Predictions
0,12
1,12
2,65
3,12
4,14
...,...
455539,12
455540,98
455541,12
455542,9


In [61]:
# Transform Predictions back to names
# https://scikit-learn.org/stable/modules/preprocessing_targets.html
# https://discuss.analyticsvidhya.com/t/labelencoder-how-to-reverse-it/11497/2
list = encoder.inverse_transform(test_pred)
list = pd.DataFrame(list)
list = list.rename(columns={0: 'Predicted_beer_style'})
list.head()

  y = column_or_1d(y, warn=True)


Unnamed: 0,Predicted_beer_style
0,American IPA
1,American IPA
2,Hefeweizen
3,American IPA
4,American Pale Ale (APA)


In [65]:
# Transform y_test back to names
ylist = encoder.inverse_transform(y_test)
ylist = pd.DataFrame(ylist)
ylist = ylist.rename(columns={0: 'Beer_style'})
ylist.head()

Unnamed: 0,Beer_style
0,American Pale Ale (APA)
1,Belgian Pale Ale
2,American Adjunct Lager
3,Hefeweizen
4,American Wild Ale


In [67]:
# Now, merge the test data set with predictions data
df_final = pd.concat([ylist,test_pred, list], axis=1)
df_final.head(20)

Unnamed: 0,Beer_style,Predictions,Predicted_beer_style
0,American Pale Ale (APA),12,American IPA
1,Belgian Pale Ale,12,American IPA
2,American Adjunct Lager,65,Hefeweizen
3,Hefeweizen,12,American IPA
4,American Wild Ale,14,American Pale Ale (APA)
5,Munich Dunkel Lager,60,Fruit / Vegetable Beer
6,American Strong Ale,12,American IPA
7,Hefeweizen,16,American Pale Wheat Ale
8,Bière de Garde,9,American Double / Imperial IPA
9,Weizenbock,9,American Double / Imperial IPA


In [69]:
#df_final.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/predictions.csv', index=False)

# 8. Test the App

#### If Docker is Running,  http://localhost:8080/docs

#### otherwise, https://adsi-at2.herokuapp.com/docs

In [None]:
# either enter these values manually
brewery_name		            review_aroma	review_appearance	review_palate	review_taste	beer_abv       beer_style
Vecchio Birraio		                2	              2.5		          1.5	           1.5	        5	       Hefeweizen
Pacific Coast Brewing Company	    4	               4	               4	            4	       10	       American Double / Imperial Stout
Caldera Brewing Company		     	4	               4		           4	            4	        7         American Strong Ale
Fitgers Brewhouse	                4	               4	               4	            4	       5.9

#### OR Launch these links - for multiple predictors

https://adsi-at2.herokuapp.com/beer/type/multiple/predictors?name=Vecchio%20Birraio&aroma=2&appearance=2.5&palate=1.5&taste=1.5&volume=5

https://adsi-at2.herokuapp.com/beer/type/multiple/predictors?name=Pacific%20Coast%20Brewing%20Company&aroma=4&appearance=4&palate=4&taste=4&volume=10

https://adsi-at2.herokuapp.com/beer/type/multiple/predictors?name=Caldera%20Brewing%20Company&aroma=4&appearance=4&palate=4&taste=4&volume=7

https://adsi-at2.herokuapp.com/beer/type/multiple/predictors?name=%27t%20Hofbrouwerijke&aroma=4&appearance=3&palate=2.5&taste=4&volume=8.5

https://adsi-at2.herokuapp.com/beer/type/multiple/predictors?name=1648%20Brewing%20Company%20Ltd&aroma=4&appearance=3.5&palate=3.5&taste=4&volume=3.7

Example from test set - 

https://adsi-at2.herokuapp.com/beer/type/multiple/predictors?name=Fitgers%20Brewhouse&aroma=4&appearance=4&palate=4&taste=4&volume=5.9

#### for single predictor

https://adsi-at2.herokuapp.com/beer/type/single/predictor?name=Caldera%20Brewing%20Company