# BEER TYPE PREDICTION - ONLY API PREDICTORS USED

# 1. Load Packages

In [1]:
# Load the packages needed for Logistic Regression Modelling upfront
import pandas as pd
import numpy as np

import category_encoders as ce

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from joblib import dump

In [2]:
# Record the pandas version this notebook was run with
print(pd.__version__)

1.4.1


# 2. Load the data

In [3]:
# Load the full beer reviews data set.
# The path is relative instead of a user-specific absolute path, in line with the
# relative '../models/' path used when the model is saved.
# NOTE(review): assumes the notebook is run from a folder one level below the
# project root (with data/raw/ alongside it) — confirm.
data = pd.read_csv('../data/raw/beer_reviews.csv')

# 3. Explore the Data

In [4]:
# Check the number of rows and columns in the raw data set (before any split)
data.shape

(1586614, 13)

In [5]:
# Preview the first five rows of the raw data
data.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [6]:
# Inspect column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1586614 non-null  int64  
 1   brewery_name        1586599 non-null  object 
 2   review_time         1586614 non-null  int64  
 3   review_overall      1586614 non-null  float64
 4   review_aroma        1586614 non-null  float64
 5   review_appearance   1586614 non-null  float64
 6   review_profilename  1586266 non-null  object 
 7   beer_style          1586614 non-null  object 
 8   review_palate       1586614 non-null  float64
 9   review_taste        1586614 non-null  float64
 10  beer_name           1586614 non-null  object 
 11  beer_abv            1518829 non-null  float64
 12  beer_beerid         1586614 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [7]:
# List the column names
data.columns

Index(['brewery_id', 'brewery_name', 'review_time', 'review_overall',
       'review_aroma', 'review_appearance', 'review_profilename', 'beer_style',
       'review_palate', 'review_taste', 'beer_name', 'beer_abv',
       'beer_beerid'],
      dtype='object')

In [8]:
# Summary statistics for the numeric columns
data.describe()

Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_beerid
count,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1518829.0,1586614.0
mean,3130.099,1224089000.0,3.815581,3.735636,3.841642,3.743701,3.79286,7.042387,21712.79
std,5578.104,76544270.0,0.7206219,0.6976167,0.6160928,0.6822184,0.7319696,2.322526,21818.34
min,1.0,840672000.0,0.0,1.0,0.0,1.0,1.0,0.01,3.0
25%,143.0,1173224000.0,3.5,3.5,3.5,3.5,3.5,5.2,1717.0
50%,429.0,1239203000.0,4.0,4.0,4.0,4.0,4.0,6.5,13906.0
75%,2372.0,1288568000.0,4.5,4.0,4.0,4.0,4.5,8.5,39441.0
max,28003.0,1326285000.0,5.0,5.0,5.0,5.0,5.0,57.7,77317.0


In [9]:
# Count the distinct beer styles in the data set ('beer_style' is later used as the target)
data['beer_style'].nunique()

104

In [10]:
# Check how the observations are spread across beer styles (rows per style, most frequent first)
data['beer_style'].value_counts()

American IPA                        117586
American Double / Imperial IPA       85977
American Pale Ale (APA)              63469
Russian Imperial Stout               54129
American Double / Imperial Stout     50705
                                     ...  
Gose                                   686
Faro                                   609
Roggenbier                             466
Kvass                                  297
Happoshu                               241
Name: beer_style, Length: 104, dtype: int64

In [11]:
# Count missing/null values per column in the raw data set
data.isnull().sum()

brewery_id                0
brewery_name             15
review_time               0
review_overall            0
review_aroma              0
review_appearance         0
review_profilename      348
beer_style                0
review_palate             0
review_taste              0
beer_name                 0
beer_abv              67785
beer_beerid               0
dtype: int64

# 4. Transform the Data

## 4.1 Create a copy of the raw dataset

In [12]:
# Create a working copy so the transformations below leave the raw `data` frame untouched
df = data.copy()

#df.head()

## 4.2 Drop all rows with missing values

In [13]:
# Count missing values per column in the working copy before dropping rows
df.isnull().sum()

brewery_id                0
brewery_name             15
review_time               0
review_overall            0
review_aroma              0
review_appearance         0
review_profilename      348
beer_style                0
review_palate             0
review_taste              0
beer_name                 0
beer_abv              67785
beer_beerid               0
dtype: int64

In [14]:
# Keep only the rows in which every column is populated.
# Row-wise dropping that returns a new frame is the pandas default, so only
# the `how` rule needs to be spelled out.
df = df.dropna(how='any')

In [15]:
# Preview the working copy after dropping incomplete rows
df.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [16]:
# Shape of the working copy after dropping incomplete rows
df.shape
# dropped 68,136 rows (1,586,614 - 1,518,478)

(1518478, 13)

In [17]:
# Raw data shape for comparison; `df` is a copy, so `data` is not affected by the drop
data.shape


(1586614, 13)

In [18]:
# Confirm that no missing values remain in the working copy
df.isnull().sum()

brewery_id            0
brewery_name          0
review_time           0
review_overall        0
review_aroma          0
review_appearance     0
review_profilename    0
beer_style            0
review_palate         0
review_taste          0
beer_name             0
beer_abv              0
beer_beerid           0
dtype: int64

In [19]:
#df.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/interim/1_beer_reviews_APIpredictorsONLY_NO-NULL.csv', index=False)

## 4.3 Drop the 2 Identifier columns and save in the interim folder

In [20]:
# Remove the two identifier columns; `columns=` already names the axis,
# and rebinding `df` replaces the in-place mutation.
id_columns = ['brewery_id', 'beer_beerid']
df = df.drop(columns=id_columns)

df.head()

Unnamed: 0,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv
0,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0
1,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2
2,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5
3,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0
4,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7


In [21]:
#df.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/interim/2_beer_reviews_noIDcolumns.csv', index=False)

## 4.4 Drop columns that have no prediction impact like reviewer details and 'beer_name'

In [22]:
# Columns treated as having no predictive value for the beer style
drop_col = [
    'review_time',
    'review_overall',
    'review_profilename',
    'beer_name',
]

# Rebind `df` to the frame without those columns
df = df.drop(columns=drop_col)

df.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,Vecchio Birraio,2.0,2.5,Hefeweizen,1.5,1.5,5.0
1,Vecchio Birraio,2.5,3.0,English Strong Ale,3.0,3.0,6.2
2,Vecchio Birraio,2.5,3.0,Foreign / Export Stout,3.0,3.0,6.5
3,Vecchio Birraio,3.0,3.5,German Pilsener,2.5,3.0,5.0
4,Caldera Brewing Company,4.5,4.0,American Double / Imperial IPA,4.0,4.5,7.7


In [23]:
#df.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/interim/3_beer_reviews_APIpredictorsONLY.csv', index=False)

## 4.5 Remove SPECIAL CHARACTERS From Name Column

In [24]:
# Strip every character that is not an ASCII letter or whitespace from the brewery name.
# https://stackoverflow.com/questions/42676162/remove-symbols-from-string-but-keep-whitespaces
# "[^a-zA-Z\s]+" matches runs of anything other than letters and whitespace, so those runs are removed.
# Similarly "[^a-zA-Z0-9\s]+" would also keep digits and remove all other characters/symbols.
# The pattern is a raw string so that `\s` reaches the regex engine unchanged instead of
# being an invalid string escape (which triggers a DeprecationWarning/SyntaxWarning).

df['brewery_name'] = df['brewery_name'].str.replace(r"[^a-zA-Z\s]+", '', regex=True)

In [25]:
#df.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/interim/4_beer_reviews_APIpredictors_NoChar.csv', index=False)

## 4.6 Standardise the Numeric columns in the dataset

In [26]:
# Work on a separate copy so scaling and encoding do not modify `df`
df_cleaned = df.copy()

In [27]:
# List the columns remaining before scaling/encoding
df_cleaned.columns

Index(['brewery_name', 'review_aroma', 'review_appearance', 'beer_style',
       'review_palate', 'review_taste', 'beer_abv'],
      dtype='object')

In [28]:
# Numeric predictor columns that are standardised in the next cell

num_cols = ['review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv']

In [29]:
# Standardise the numeric columns of the whole cleaned dataset (not only a training split).
# NOTE(review): the scaler is fit here, before the train/test split in section 4.9, so
# test rows contribute to the scaling statistics — consider fitting on the training split only.
# NOTE(review): the cells visible here only dump the model; anything scoring raw inputs
# would presumably need this fitted scaler too — TODO confirm how the API scales its inputs.
scaler = StandardScaler()
df_cleaned[num_cols] = scaler.fit_transform(df_cleaned[num_cols])

In [30]:
# Preview the data after standardising the numeric columns
df_cleaned.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,Vecchio Birraio,-2.511302,-2.19821,Hefeweizen,-3.317561,-3.162309,-0.87941
1,Vecchio Birraio,-1.792233,-1.384289,English Strong Ale,-1.109519,-1.103587,-0.36274
2,Vecchio Birraio,-1.792233,-1.384289,Foreign / Export Stout,-1.109519,-1.103587,-0.233573
3,Vecchio Birraio,-1.073164,-0.570368,German Pilsener,-1.845533,-1.103587,-0.87941
4,Caldera Brewing Company,1.084042,0.243553,American Double / Imperial IPA,0.36251,0.955134,0.283097


## 4.7 Binary Encode the Categorical Variable (except target variable)

In [31]:
# Only the brewery name is encoded here; 'beer_style' (the target) is label-encoded separately
cat_cols = ['brewery_name'] #, 'beer_style']

# Create the binary encoder for the listed column(s); return_df=True asks for a DataFrame back
encoder= ce.BinaryEncoder(cols=cat_cols,return_df=True)

# Alternative kept for reference: ordinal encoding of the same column(s)
#encoder= ce.OrdinalEncoder(cols=cat_cols,return_df=True)

In [32]:
# Fit the encoder and replace 'brewery_name' with its binary-coded columns (brewery_name_0 ...).
# NOTE(review): as with the scaler, the encoder is fit on all rows before the train/test split.
# NOTE(review): `df_cleaned` is overwritten, so this cell is not safe to re-run on its own.
df_cleaned = encoder.fit_transform(df_cleaned) 
df_cleaned

Unnamed: 0,brewery_name_0,brewery_name_1,brewery_name_2,brewery_name_3,brewery_name_4,brewery_name_5,brewery_name_6,brewery_name_7,brewery_name_8,brewery_name_9,brewery_name_10,brewery_name_11,brewery_name_12,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,0,0,0,0,0,0,0,0,0,0,0,0,1,-2.511302,-2.198210,Hefeweizen,-3.317561,-3.162309,-0.879410
1,0,0,0,0,0,0,0,0,0,0,0,0,1,-1.792233,-1.384289,English Strong Ale,-1.109519,-1.103587,-0.362740
2,0,0,0,0,0,0,0,0,0,0,0,0,1,-1.792233,-1.384289,Foreign / Export Stout,-1.109519,-1.103587,-0.233573
3,0,0,0,0,0,0,0,0,0,0,0,0,1,-1.073164,-0.570368,German Pilsener,-1.845533,-1.103587,-0.879410
4,0,0,0,0,0,0,0,0,0,0,0,1,0,1.084042,0.243553,American Double / Imperial IPA,0.362510,0.955134,0.283097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586609,1,0,1,0,0,0,0,0,1,0,0,1,0,0.364974,-0.570368,Pumpkin Ale,0.362510,0.268894,-0.793298
1586610,1,0,1,0,0,0,0,0,1,0,0,1,0,1.803111,-2.198210,Pumpkin Ale,-2.581547,0.268894,-0.793298
1586611,1,0,1,0,0,0,0,0,1,0,0,1,0,-0.354095,-1.384289,Pumpkin Ale,-0.373505,0.268894,-0.793298
1586612,1,0,1,0,0,0,0,0,1,0,0,1,0,1.084042,1.057473,Pumpkin Ale,1.098524,0.955134,-0.793298


## 4.8 Now, Label Encode the Target Variable

In [33]:
# List the columns after binary encoding the brewery name
df_cleaned.columns

Index(['brewery_name_0', 'brewery_name_1', 'brewery_name_2', 'brewery_name_3',
       'brewery_name_4', 'brewery_name_5', 'brewery_name_6', 'brewery_name_7',
       'brewery_name_8', 'brewery_name_9', 'brewery_name_10',
       'brewery_name_11', 'brewery_name_12', 'review_aroma',
       'review_appearance', 'beer_style', 'review_palate', 'review_taste',
       'beer_abv'],
      dtype='object')

In [34]:
# https://stackoverflow.com/questions/50473381/scikit-learns-labelbinarizer-vs-onehotencoder

# Integer-encode the target variable (one integer code per beer style).
# `label_encoder` is kept because later cells use it to map codes back to style names.
label_encoder = LabelEncoder()
df_cleaned['beer_style'] = label_encoder.fit_transform(df_cleaned['beer_style'])
print(df_cleaned['beer_style'])

0          65
1          51
2          59
3          61
4           9
           ..
1586609    85
1586610    85
1586611    85
1586612    85
1586613    85
Name: beer_style, Length: 1518478, dtype: int64


In [35]:
# Preview the fully encoded data ('beer_style' now holds integer codes)
df_cleaned.head()

Unnamed: 0,brewery_name_0,brewery_name_1,brewery_name_2,brewery_name_3,brewery_name_4,brewery_name_5,brewery_name_6,brewery_name_7,brewery_name_8,brewery_name_9,brewery_name_10,brewery_name_11,brewery_name_12,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,0,0,0,0,0,0,0,0,0,0,0,0,1,-2.511302,-2.19821,65,-3.317561,-3.162309,-0.87941
1,0,0,0,0,0,0,0,0,0,0,0,0,1,-1.792233,-1.384289,51,-1.109519,-1.103587,-0.36274
2,0,0,0,0,0,0,0,0,0,0,0,0,1,-1.792233,-1.384289,59,-1.109519,-1.103587,-0.233573
3,0,0,0,0,0,0,0,0,0,0,0,0,1,-1.073164,-0.570368,61,-1.845533,-1.103587,-0.87941
4,0,0,0,0,0,0,0,0,0,0,0,1,0,1.084042,0.243553,9,0.36251,0.955134,0.283097


## 4.9 Split up the Datasets

In [36]:
# Extract the target variable as y = target.
# pop() removes 'beer_style' from df_cleaned in place, leaving only the predictors there,
# so this cell cannot be re-run without first re-creating df_cleaned.
target = pd.DataFrame(df_cleaned.pop('beer_style'))
target.shape

(1518478, 1)

In [37]:
# Preview the encoded target
target.head()

Unnamed: 0,beer_style
0,65
1,51
2,59
3,61
4,9


In [38]:
# Hold out 30% of the rows as the test set; random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(df_cleaned, target, test_size=0.3, random_state = 42)

In [39]:
# Make an independent copy of the test set to be used later.
# copy must be *called*: without the parentheses the name is bound to the
# DataFrame.copy method itself rather than to a copied frame.
X_test_copy = X_test.copy()

In [40]:
# Training predictors: rows x feature columns
X_train.shape

(1062934, 18)

In [41]:
# Test predictors: rows x feature columns
X_test.shape

(455544, 18)

In [42]:
# Training target: a single-column frame
y_train.shape

(1062934, 1)

In [43]:
# Test target: a single-column frame
y_test.shape

(455544, 1)

## 4.10 Save the processed data files

In [44]:
# save the processed datasets
#np.save('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/X_train', X_train)
#np.save('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/X_test',  X_test)
#np.save('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/y_train', y_train)
#np.save('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/y_test',  y_test)

In [45]:
# save csv files too
#X_train.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/X_train.csv', index=False)
#X_test.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/X_test.csv', index=False)

In [46]:
#y_train.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/y_train.csv', index=False)
#y_test.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/y_test.csv', index=False)

# 5. Train a Multinomial Logistic Regression Model

In [47]:
# Preview the test predictors (row labels are the original, shuffled index values)
X_test.head()

Unnamed: 0,brewery_name_0,brewery_name_1,brewery_name_2,brewery_name_3,brewery_name_4,brewery_name_5,brewery_name_6,brewery_name_7,brewery_name_8,brewery_name_9,brewery_name_10,brewery_name_11,brewery_name_12,review_aroma,review_appearance,review_palate,review_taste,beer_abv
1565772,1,0,0,1,1,1,0,0,1,1,0,0,0,0.364974,0.243553,0.36251,0.268894,-0.491907
772327,0,1,0,0,1,0,0,0,1,0,0,0,1,1.084042,1.057473,0.36251,0.955134,-0.233573
328681,0,0,0,1,1,1,0,1,0,0,1,0,1,0.364974,0.243553,0.36251,1.641374,-1.094689
810901,0,1,0,0,1,0,1,1,0,0,0,0,1,-0.354095,0.243553,-1.109519,-0.417347,-0.621075
454769,0,0,1,0,1,0,0,0,0,0,0,0,0,0.364974,1.057473,1.098524,0.955134,-0.965521


In [48]:
# Preview the encoded test target
y_test.head()

Unnamed: 0,beer_style
1565772,14
772327,24
328681,1
810901,65
454769,20


In [49]:
# https://machinelearningmastery.com/multinomial-logistic-regression-with-python/
# Define the multinomial logistic regression model with the lbfgs solver.
# No penalty, C, max_iter or class_weight is passed, so scikit-learn's defaults apply;
# the options that were experimented with are kept in the trailing comment.
model = LogisticRegression(multi_class="multinomial", solver='lbfgs') #, max_iter=1000, penalty='l2', C=1.0) #, class_weight='balanced')

# https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter
# The convergence warning raised by 'model.fit' below can be resolved by adding 'max_iter=1000' here, as above

# https://stackoverflow.com/questions/61814494/what-is-this-warning-convergencewarning-lbfgs-failed-to-converge-status-1
# default max_iter = 100

In [50]:
# https://stackoverflow.com/questions/58313842/a-column-vector-y-was-passed-when-a-1d-array-was-expected-error-message
# y_train is a single-column DataFrame; np.ravel() flattens it to the 1-D array the
# estimator expects, which avoids the column-vector warning.
model.fit(X_train, np.ravel(y_train))
#model.fit(X_train, y_train)

# Timing notes from earlier runs:
# ran for over 9min with default max_iter=100 and no class_weight
# ran for 60min with 1000 iterations and balanced class_weight, but the model performance decreased.
# 300 iterations ran for 33min, but that wasn't enough: accuracy was the same as with 100 iterations

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(multi_class='multinomial')

In [51]:
# *** Save the fitted model for API use (path is relative to the notebook's working directory)
# NOTE(review): only `model` is persisted in this cell; the fitted scaler and encoders are not dumped here.
dump(model, '../models/mlr_scaled_6Pred_NoChar_NADropped_Binary-LabelEncoded.joblib')

# Alternative absolute destination under the api folder, kept for reference
#dump(model, '/Users/jasle1/Desktop/MDSI/ADSI/AT_2/api/models/mlr_scaled_6Pred_NoChar_NADropped_Binary-LabelEncoded.joblib')

['/Users/jasle1/Desktop/MDSI/ADSI/AT_2/api/models/mlr_scaled_6Pred_NoChar_NADropped_Binary-LabelEncoded.joblib']

# 6. Evaluate Model Performance

## 6.1 Accuracy Score

In [52]:
# https://dataaspirant.com/implement-multinomial-logistic-regression-python/
# Score the fitted model on both splits first, then report the two accuracies.
train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, model.predict(X_test))

print("Multinomial Logistic regression Train Accuracy :: ", train_accuracy)
print("Multinomial Logistic regression Test Accuracy :: ", test_accuracy)

# Notes from earlier experiments:
# accuracy decreased from 25% to 18% when 4 additional parameters were added to the regressor
# when brewery name and target variables were both label encoded, and only 6 API predictors were kept, train and test accuracy dropped from 25% to 7%

Multinomial Logistic regression Train Accuracy ::  0.2057757113800104
Multinomial Logistic regression Test Accuracy ::  0.20614254605482676


In [53]:
# Sanity check: predict the class of a single training row.
# Selecting with a list (`iloc[[315]]`) returns a one-row DataFrame directly, keeping the
# column names and dtypes the model was fitted on; the previous Series -> DataFrame ->
# transpose round trip upcast the integer columns.
obs = X_train.iloc[[315]]
model.predict(obs)

array([19])

## 6.2 AUROC Score

In [54]:
# Predict class probabilities on the test dataset (one row per observation, one column per class)
test_prob = model.predict_proba(X_test)
test_prob

array([[1.04945195e-02, 1.25816277e-03, 9.59981472e-02, ...,
        6.43768377e-06, 1.26730531e-02, 5.29842257e-02],
       [4.80120172e-03, 1.21773229e-04, 3.54849611e-02, ...,
        3.05000299e-05, 1.45888530e-02, 1.34305066e-02],
       [1.43744152e-03, 3.81387442e-03, 1.24571185e-02, ...,
        2.99900019e-07, 2.44379358e-03, 2.00480372e-02],
       ...,
       [9.13422625e-03, 1.74993191e-03, 2.22397023e-02, ...,
        1.90646840e-04, 6.29011290e-02, 1.61056115e-02],
       [1.39795520e-04, 2.96809725e-06, 7.77603462e-03, ...,
        3.12287397e-03, 1.60424077e-02, 1.78489410e-04],
       [3.18474840e-03, 9.25689505e-03, 1.44729114e-02, ...,
        3.83030923e-07, 4.18073564e-03, 4.02678147e-02]])

In [55]:
# Calculate the AUC score for performance evaluation.
# AUROC is one of the preferred metrics for evaluating multi-class classification models;
# multi_class='ovr' scores each class one-vs-rest from the predicted probabilities.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
auc_score = roc_auc_score(y_test, test_prob, multi_class='ovr')

print(f'The Multinomial Logistic regression ROC AUC score is {auc_score}')

The Multinomial Logistic regression ROC AUC score is 0.8544320501377138


# 7. Test the App

#### If Docker is Running,  http://localhost:8080/docs

#### otherwise, https://adsi-at2.herokuapp.com/docs

In [None]:
# Either enter these values manually on the API docs page.
# The table was previously free text inside a code cell, which is not valid Python;
# it is now a small DataFrame so the cell runs and the values stay copyable.
# NOTE(review): 'beer_style' is the style listed next to each row in the original
# notes — presumably the expected answer; confirm for the last two rows.
sample_inputs = pd.DataFrame(
    [
        ("Vecchio Birraio", 2, 2.5, 1.5, 1.5, 5, "Hefeweizen"),
        ("Pacific Coast Brewing Company", 4, 4, 4, 4, 10, "American Double / Imperial Stout"),
        ("Caldera Brewing Company", 4, 4, 4, 4, 7, "American Strong Ale"),
    ],
    columns=[
        "brewery_name",
        "review_aroma",
        "review_appearance",
        "review_palate",
        "review_taste",
        "beer_abv",
        "beer_style",
    ],
)
sample_inputs

#### OR Launch these links - for multiple predictors

https://adsi-at2.herokuapp.com/beer/type/multiple/predictors?name=Vecchio%20Birraio&aroma=2&appearance=2.5&palate=1.5&taste=1.5&volume=5

https://adsi-at2.herokuapp.com/beer/type/multiple/predictors?name=Pacific%20Coast%20Brewing%20Company&aroma=4&appearance=4&palate=4&taste=4&volume=10

https://adsi-at2.herokuapp.com/beer/type/multiple/predictors?name=Caldera%20Brewing%20Company&aroma=4&appearance=4&palate=4&taste=4&volume=7

# 7. Transform the Target Variable and its Predictions back to String/Name

In [57]:
# Predict the encoded class label for every row of the test dataset
test_pred = model.predict(X_test)
test_pred

array([ 2, 12, 14, ..., 14,  9, 14])

In [58]:
# Wrap the predictions in a single-column DataFrame (column 0, fresh 0..n-1 index)
test_pred = pd.DataFrame(test_pred)
test_pred

Unnamed: 0,0
0,2
1,12
2,14
3,12
4,83
...,...
455539,12
455540,9
455541,14
455542,9


In [61]:
# Transform the predicted integer codes back to beer style names
# https://scikit-learn.org/stable/modules/preprocessing_targets.html
# test_pred is a single-column DataFrame at this point; np.ravel() flattens it to the
# 1-D array inverse_transform expects, avoiding the column-vector warning (same approach
# as used for y_train when fitting the model).
# NOTE(review): the name `list` shadows the Python builtin; it is kept only because the
# later cell that builds `df_final` reads it — rename both cells together.
list = label_encoder.inverse_transform(np.ravel(test_pred))
list = pd.DataFrame(list)
list = list.rename(columns={0: 'Predicted_beer_style'})
list.head()

Unnamed: 0,Predicted_beer_style
0,American Amber / Red Ale
1,American IPA
2,American Pale Ale (APA)
3,American IPA
4,Oatmeal Stout


In [62]:
# Transform the encoded y_test labels back to beer style names.
# y_test is a single-column DataFrame; np.ravel() flattens it to the 1-D array
# inverse_transform expects, which removes the column-vector DataConversionWarning.
ylist = label_encoder.inverse_transform(np.ravel(y_test))
ylist = pd.DataFrame(ylist)
ylist = ylist.rename(columns={0: 'Beer_style'})
ylist.head()

  y = column_or_1d(y, warn=True)


Unnamed: 0,Beer_style
0,American Pale Ale (APA)
1,Belgian Pale Ale
2,American Adjunct Lager
3,Hefeweizen
4,American Wild Ale


# 8. Extract Final Predictions DataFrame

In [63]:
# Give the encoded-prediction column a descriptive name
test_pred = test_pred.rename(columns={0: 'Predictions'})
test_pred.head()

Unnamed: 0,Predictions
0,2
1,12
2,14
3,12
4,83


In [66]:
# Combine the true style names, the encoded predictions and the predicted style names side by side.
# All three frames were built from plain arrays, so they share a fresh 0..n-1 index and
# line up row by row; `list` here is the predictions DataFrame from the earlier cell,
# not the Python builtin.
df_final = pd.concat([ylist, test_pred, list], axis=1)
df_final.head(20)


Unnamed: 0,Beer_style,Predictions,Predicted_beer_style
0,American Pale Ale (APA),2,American Amber / Red Ale
1,Belgian Pale Ale,12,American IPA
2,American Adjunct Lager,14,American Pale Ale (APA)
3,Hefeweizen,12,American IPA
4,American Wild Ale,83,Oatmeal Stout
5,Munich Dunkel Lager,14,American Pale Ale (APA)
6,American Strong Ale,12,American IPA
7,Hefeweizen,1,American Adjunct Lager
8,Bière de Garde,19,American Strong Ale
9,Weizenbock,9,American Double / Imperial IPA


In [None]:
# Save the predictions file to the current working directory.
# The file name now carries a .csv extension so the format is recognisable to other tools.
df_final.to_csv('jasleen_mlr_test_predictions.csv', index=False)