# BEER TYPE PREDICTION - ONLY API PREDICTORS USED
## Split datasets first then transform - with PIPELINES

# 1. Load Packages

In [127]:
# Load the packages needed for Logistic Regression Modelling upfront
import pandas as pd
import numpy as np

import category_encoders as ce

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from joblib import dump

In [1]:
import sklearn
print(sklearn.__version__)

1.0.2


# 2. Load the data

In [128]:
# Load the training data set
data = pd.read_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/raw/beer_reviews.csv')

# 3. Explore the Data

In [115]:
# check rows and columns of the training set
data.shape

(1586614, 13)

In [116]:
data.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [117]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1586614 non-null  int64  
 1   brewery_name        1586599 non-null  object 
 2   review_time         1586614 non-null  int64  
 3   review_overall      1586614 non-null  float64
 4   review_aroma        1586614 non-null  float64
 5   review_appearance   1586614 non-null  float64
 6   review_profilename  1586266 non-null  object 
 7   beer_style          1586614 non-null  object 
 8   review_palate       1586614 non-null  float64
 9   review_taste        1586614 non-null  float64
 10  beer_name           1586614 non-null  object 
 11  beer_abv            1518829 non-null  float64
 12  beer_beerid         1586614 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [118]:
data.columns

Index(['brewery_id', 'brewery_name', 'review_time', 'review_overall',
       'review_aroma', 'review_appearance', 'review_profilename', 'beer_style',
       'review_palate', 'review_taste', 'beer_name', 'beer_abv',
       'beer_beerid'],
      dtype='object')

In [119]:
data.describe()

Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_beerid
count,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1518829.0,1586614.0
mean,3130.099,1224089000.0,3.815581,3.735636,3.841642,3.743701,3.79286,7.042387,21712.79
std,5578.104,76544270.0,0.7206219,0.6976167,0.6160928,0.6822184,0.7319696,2.322526,21818.34
min,1.0,840672000.0,0.0,1.0,0.0,1.0,1.0,0.01,3.0
25%,143.0,1173224000.0,3.5,3.5,3.5,3.5,3.5,5.2,1717.0
50%,429.0,1239203000.0,4.0,4.0,4.0,4.0,4.0,6.5,13906.0
75%,2372.0,1288568000.0,4.5,4.0,4.0,4.0,4.5,8.5,39441.0
max,28003.0,1326285000.0,5.0,5.0,5.0,5.0,5.0,57.7,77317.0


In [120]:
# count the beer types in the data set
data['beer_style'].nunique()

104

In [121]:
# check the spread of observations
data['beer_style'].value_counts()

American IPA                        117586
American Double / Imperial IPA       85977
American Pale Ale (APA)              63469
Russian Imperial Stout               54129
American Double / Imperial Stout     50705
                                     ...  
Gose                                   686
Faro                                   609
Roggenbier                             466
Kvass                                  297
Happoshu                               241
Name: beer_style, Length: 104, dtype: int64

In [122]:
# check for missing/null values in the dataset
data.isnull().sum()

brewery_id                0
brewery_name             15
review_time               0
review_overall            0
review_aroma              0
review_appearance         0
review_profilename      348
beer_style                0
review_palate             0
review_taste              0
beer_name                 0
beer_abv              67785
beer_beerid               0
dtype: int64

# 4. Transform the Data

## 4.1 Create a copy of the raw dataset

In [129]:
# create a copy of dataset for transformations
df = data.copy()

#df.head()

## 4.2 Drop all rows with missing values

In [136]:
# check for missing values in the dataset
df.isnull().sum()

brewery_id            0
brewery_name          0
review_time           0
review_overall        0
review_aroma          0
review_appearance     0
review_profilename    0
beer_style            0
review_palate         0
review_taste          0
beer_name             0
beer_abv              0
beer_beerid           0
dtype: int64

In [132]:
# Drop rows with any empty cells
df = df.dropna(
    axis=0,
    how='any',
    inplace=False
)

In [133]:
df.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [137]:
df.shape
# dropped 68,136 rows

(1586614, 13)

In [135]:
data.shape


(1586614, 13)

In [None]:
# confirm it worked
df.isnull().sum()

## 4.3 Drop the 2 Identifier columns and save in the interim folder

In [138]:
df.drop(columns=['brewery_id', 'beer_beerid'], axis = 1, inplace=True)

df.head()

Unnamed: 0,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv
0,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0
1,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2
2,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5
3,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0
4,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7


In [7]:
df.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/interim/1_beer_reviews_noIDcolumns.csv', index=False)

## 4.4 Drop columns that have no prediction impact like reviewer details and 'beer_name'

In [139]:
drop_col = ['review_time', 'review_overall', 'review_profilename', 'beer_name']

df.drop(drop_col, axis = 1, inplace=True)

df.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,Vecchio Birraio,2.0,2.5,Hefeweizen,1.5,1.5,5.0
1,Vecchio Birraio,2.5,3.0,English Strong Ale,3.0,3.0,6.2
2,Vecchio Birraio,2.5,3.0,Foreign / Export Stout,3.0,3.0,6.5
3,Vecchio Birraio,3.0,3.5,German Pilsener,2.5,3.0,5.0
4,Caldera Brewing Company,4.5,4.0,American Double / Imperial IPA,4.0,4.5,7.7


In [130]:
df.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/interim/2_beer_reviews_APIpredictorsONLY.csv', index=False)

## 4.5 REMOVE SPECIAL CHARACTERS FROM NAME COLUMN

In [27]:
# this should remove everything but keep whitespace
# https://stackoverflow.com/questions/42676162/remove-symbols-from-string-but-keep-whitespaces
# "[^a-zA-Z\s]+" allows to recognise and keep alphabets and whitespace, hence removing everything else. 
# Similarly "[^a-zA-Z0-9\s]+" would also keep digits and remove all other characters/symbols

df['brewery_name'] = df['brewery_name'].str.replace("[^a-zA-Z\s]+", '', regex=True)

In [320]:
df.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/interim/4_beer_reviews_APIpredictors_NoChar.csv', index=False)

## 4.6 Split up the Datasets

In [140]:
df_cleaned = df.copy()

In [141]:
# extract the target variable out as y = target
target = pd.DataFrame(df_cleaned.pop('beer_style'))
target.shape

(1518478, 1)

In [142]:
target.head()

Unnamed: 0,beer_style
0,Hefeweizen
1,English Strong Ale
2,Foreign / Export Stout
3,German Pilsener
4,American Double / Imperial IPA


In [143]:
X_train, X_test, y_train, y_test = train_test_split(df_cleaned, target, test_size=0.3, random_state = 42)

In [106]:
X_train.shape

(1110629, 6)

In [20]:
X_test.shape

(475980, 6)

In [21]:
y_train.shape

(1110619, 1)

In [22]:
y_test.shape

(475980, 1)

In [108]:
X_train.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste,beer_abv
1173573,Dogfish Head Brewery,4.5,4.5,4.5,5.0,9.0
417229,Great Lakes Brewing Company,4.5,4.5,4.5,5.0,7.5
1494477,Bitburger Brauerei,3.5,3.5,3.5,4.0,4.8
41757,Bryggeriet Refsvindinge,3.5,3.5,3.5,3.5,5.7
700935,Big Boss Brewing,4.0,4.0,4.0,4.5,8.0


## 4.7 Build Pipelines for SC and OHE

In [144]:
# https://stackoverflow.com/questions/50473381/scikit-learns-labelbinarizer-vs-onehotencoder

# integer encode the target variable
label_encoder = ce.OrdinalEncoder()
y_train = label_encoder.fit_transform(y_train)
y_train

Unnamed: 0,beer_style
115503,1
398865,2
564379,3
1162176,4
1452677,5
...,...
268706,9
1479074,26
136336,49
698294,15


In [145]:
from sklearn.pipeline import Pipeline

In [146]:
num_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler())
    ]
)

In [147]:
cat_transformer = Pipeline(
    steps=[
        ('one_hot_encoder', ce.OrdinalEncoder())
    ]
)

In [148]:
# Create a list of numeric and categorical columns

num_cols = ['review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv']
cat_cols = ['brewery_name']

In [149]:
from sklearn.compose import ColumnTransformer

In [150]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num_cols', num_transformer, num_cols),
        ('cat_cols', cat_transformer, cat_cols)
    ]
)

In [151]:
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('mlr', LogisticRegression(multi_class="multinomial", solver='lbfgs'))
    ]
)

# 5. Train the Model

In [152]:
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num_cols',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['review_aroma',
                                                   'review_appearance',
                                                   'review_palate',
                                                   'review_taste',
                                                   'beer_abv']),
                                                 ('cat_cols',
                                                  Pipeline(steps=[('one_hot_encoder',
                                                                   OrdinalEncoder())]),
                                                  ['brewery_name'])])),
                ('mlr', LogisticRegression(multi_class='multinomial'))])

In [153]:
model.predict(X_train)

array([37, 13, 37, ...,  6, 13,  6])

In [154]:
obs = pd.DataFrame(X_train.iloc[0]).transpose()
model.predict(obs)

array([37])

In [155]:
accuracy_score(y_train, model.predict(X_train))

0.11618219005131081

In [156]:
test_pred = model.predict(X_test)
test_pred

array([ 6,  6,  6, ...,  6, 13,  6])

In [157]:
dump(model, '/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/models/mlr_scaled_6Pred_BothOrdinal_Pipeline.joblib')

dump(model, '/Users/jasle1/Desktop/MDSI/ADSI/AT_2/api/models/mlr_scaled_6Pred_BothOrdinal_Pipeline.joblib')

['/Users/jasle1/Desktop/MDSI/ADSI/AT_2/api/models/mlr_scaled_6Pred_BothOrdinal_Pipeline.joblib']

## 4.__ Standard Scaler the X_train and Y_train datasets

In [25]:
X_train.columns

Index(['brewery_name', 'review_aroma', 'review_appearance', 'review_palate',
       'review_taste', 'beer_abv'],
      dtype='object')

In [327]:
# Scale the numeric columns in the training dataset
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.fit_transform(X_test[num_cols])

In [328]:
X_train.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,Vecchio Birraio,-2.487964,-2.177682,Hefeweizen,-3.288863,-3.132476,-0.649385
1,Vecchio Birraio,-1.771235,-1.366111,English Strong Ale,-1.090136,-1.083199,-0.201924
2,Vecchio Birraio,-1.771235,-1.366111,Foreign / Export Stout,-1.090136,-1.083199,-0.090059
3,Vecchio Birraio,-1.054506,-0.554541,German Pilsener,-1.823045,-1.083199,-0.649385
4,Caldera Brewing Company,1.09568,0.257029,American Double / Imperial IPA,0.375682,0.966078,0.357402


## 4.8 One Hot Encode the Categorical Variable (except target variable)

In [329]:
cat_cols = ['brewery_name'] #, 'beer_style']

#Create object for binary encoding
encoder= ce.BinaryEncoder(cols=cat_cols,return_df=True)

#Create object for ordinal encoding
#encoder= ce.OrdinalEncoder(cols=cat_cols,return_df=True)

In [330]:
#Fit and Transform Data 
X_train = encoder.fit_transform(X_train) 
X_train

Unnamed: 0,brewery_name_0,brewery_name_1,brewery_name_2,brewery_name_3,brewery_name_4,brewery_name_5,brewery_name_6,brewery_name_7,brewery_name_8,brewery_name_9,brewery_name_10,brewery_name_11,brewery_name_12,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,0,0,0,0,0,0,0,0,0,0,0,0,1,-2.487964,-2.177682,Hefeweizen,-3.288863,-3.132476,-0.649385
1,0,0,0,0,0,0,0,0,0,0,0,0,1,-1.771235,-1.366111,English Strong Ale,-1.090136,-1.083199,-0.201924
2,0,0,0,0,0,0,0,0,0,0,0,0,1,-1.771235,-1.366111,Foreign / Export Stout,-1.090136,-1.083199,-0.090059
3,0,0,0,0,0,0,0,0,0,0,0,0,1,-1.054506,-0.554541,German Pilsener,-1.823045,-1.083199,-0.649385
4,0,0,0,0,0,0,0,0,0,0,0,1,0,1.095680,0.257029,American Double / Imperial IPA,0.375682,0.966078,0.357402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586609,1,0,1,1,0,0,1,0,1,1,0,1,1,0.378951,-0.554541,Pumpkin Ale,0.375682,0.282985,-0.574808
1586610,1,0,1,1,0,0,1,0,1,1,0,1,1,1.812409,-2.177682,Pumpkin Ale,-2.555954,0.282985,-0.574808
1586611,1,0,1,1,0,0,1,0,1,1,0,1,1,-0.337777,-1.366111,Pumpkin Ale,-0.357227,0.282985,-0.574808
1586612,1,0,1,1,0,0,1,0,1,1,0,1,1,1.095680,1.068599,Pumpkin Ale,1.108591,0.966078,-0.574808


## 4.9 Now, Label Encode the Target Variable

In [331]:
y_train.columns

Index(['brewery_name_0', 'brewery_name_1', 'brewery_name_2', 'brewery_name_3',
       'brewery_name_4', 'brewery_name_5', 'brewery_name_6', 'brewery_name_7',
       'brewery_name_8', 'brewery_name_9', 'brewery_name_10',
       'brewery_name_11', 'brewery_name_12', 'review_aroma',
       'review_appearance', 'beer_style', 'review_palate', 'review_taste',
       'beer_abv'],
      dtype='object')

In [45]:
# https://stackoverflow.com/questions/50473381/scikit-learns-labelbinarizer-vs-onehotencoder

# integer encode the target variable
label_encoder = ce.OrdinalEncoder()
y_train = label_encoder.fit_transform(y_train)
y_train = pd.DataFrame(y_train)
y_train

Unnamed: 0,beer_style
1539264,1
653707,2
213604,3
1495318,4
894759,5
...,...
259178,80
1414429,23
131932,39
671164,2


## 4.11 Save the processed data files

In [337]:
# save the processed datasets
np.save('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/X_train', X_train)
np.save('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/X_test',  X_test)
np.save('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/y_train', y_train)
np.save('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/y_test',  y_test)

In [338]:
# save csv files too
X_train.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/X_train.csv', index=False)
X_test.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/X_test.csv', index=False)

In [339]:
y_train.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/y_train.csv', index=False)
y_test.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/y_test.csv', index=False)

# 5. Train a Multinomial Logistic Regression Model

In [340]:
# https://machinelearningmastery.com/multinomial-logistic-regression-with-python/
# define the multinomial logistic regression model with a penalty
model = LogisticRegression(multi_class="multinomial", solver='lbfgs') #, max_iter=1000, penalty='l2', C=1.0) #, class_weight='balanced')

# https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter
# Convergence warning below in 'model.fit' can be resolved by adding 'max_iter=1000' here, as above

# https://stackoverflow.com/questions/61814494/what-is-this-warning-convergencewarning-lbfgs-failed-to-converge-status-1
# default max_iter = 100

In [32]:
# https://stackoverflow.com/questions/58313842/a-column-vector-y-was-passed-when-a-1d-array-was-expected-error-message
# Model was throwing an array warning so added 'np.ravel()' to the target dataset --> 
model.fit(X_train, np.ravel(y_train))
#model.fit(X_train, y_train)

# ran for over 9min with default max_iter=100 and no class_weight
# ran for 60min with 1000 iterations and balanced class_weight, but the model performance decreased.
# should have just tried 300 iters --> ran for 33min, but wasn't enough, accuracy same as 100 iterations

In [342]:
# *** save the models for API use
dump(model, '/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/models/mlr_scaled_6Pred_NoChar_Pipeline.joblib')

dump(model, '/Users/jasle1/Desktop/MDSI/ADSI/AT_2/api/models/mlr_scaled_6Pred_NoChar_Pipeline.joblib')

['/Users/jasle1/Desktop/MDSI/ADSI/AT_2/api/models/mlr_scaled_6Pred_NoChar.joblib']

# 6. Evaluate Model Performance

In [343]:
# https://dataaspirant.com/implement-multinomial-logistic-regression-python/
print ("Multinomial Logistic regression Train Accuracy :: ", accuracy_score(y_train, model.predict(X_train)))
print ("Multinomial Logistic regression Test Accuracy :: ", accuracy_score(y_test, model.predict(X_test)))

# accuracy decreased from 25% to 18% when 4 additional parameters were added to the regressor
# when brewery name and target variables were both label encoded, and only 6 API predictors were kept, train and test accuracy dropped from 25% to 7%

Multinomial Logistic regression Train Accuracy ::  0.17969708784020444
Multinomial Logistic regression Test Accuracy ::  0.1807638976427581


In [344]:
obs = pd.DataFrame(X_train.iloc[315]).transpose()
model.predict(obs)

array([89])

In [345]:
# Make predictions on the test dataset
test_prob = model.predict_proba(X_test)
test_prob

array([[7.42472645e-03, 4.43467315e-02, 3.40337400e-02, ...,
        8.99146522e-05, 6.91327183e-03, 1.88941501e-02],
       [7.82944357e-04, 3.85892568e-05, 6.89901846e-03, ...,
        1.59600972e-03, 9.75646331e-03, 2.63325735e-03],
       [4.07853547e-03, 2.17308239e-04, 1.72168082e-02, ...,
        1.08194172e-03, 1.51232607e-02, 2.53039288e-03],
       ...,
       [6.75667447e-03, 5.87051220e-03, 4.85066350e-02, ...,
        6.01103400e-05, 5.09730620e-03, 5.21296935e-02],
       [1.26374214e-03, 8.42372889e-03, 7.78741277e-02, ...,
        1.94462911e-05, 8.05403481e-03, 2.21443524e-02],
       [1.09994441e-02, 1.42779345e-03, 4.89550758e-02, ...,
        1.60815694e-04, 2.07220724e-02, 2.95796701e-02]])

In [346]:
# Calculate auc scores for performance evaluation
# AUROC score is one of the preferred metrics for multi-class classification models evaluation
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
auc_score = roc_auc_score(y_test, test_prob, multi_class='ovr')

print(f'The Multinomial Logistic regression ROC AUC score is {auc_score}')

The Multinomial Logistic regression ROC AUC score is 0.8328437642963085


# 7. Transform the Target Variable and its Predictions back to String/Name

In [None]:
# Transform Predictions back to names
# https://scikit-learn.org/stable/modules/preprocessing_targets.html
list = label_encoder.inverse_transform(test_pred)
list = pd.DataFrame(list)
list = list.rename(columns={0: 'Predicted_beer_style'})
list.head()

Unnamed: 0,Predicted_beer_style
0,American IPA
1,American IPA
2,American IPA
3,American IPA
4,American IPA


In [None]:
# Transform y_test back to names
ylist = label_encoder.inverse_transform(y_test)
ylist = pd.DataFrame(ylist)
ylist = ylist.rename(columns={0: 'Beer_style'})
ylist.head()

  y = column_or_1d(y, warn=True)


Unnamed: 0,Beer_style
0,American IPA
1,American Double / Imperial IPA
2,American Pale Ale (APA)
3,Extra Special / Strong Bitter (ESB)
4,American Porter


# 8. Extract Final Predictions DataFrame

In [None]:
test_pred = pd.DataFrame(test_pred)
test_pred = test_pred.rename(columns={0: 'Predictions'})
test_pred.head()

Unnamed: 0,Predictions
0,12
1,12
2,12
3,12
4,12


In [None]:
X_test = pd.DataFrame(X_test)

In [None]:
# Now, merge the test data set with predictions data
df_final = pd.concat([ylist, test_pred, list], axis=1)
df_final.head(20)


Unnamed: 0,Beer_style,Predictions,Predicted_beer_style
0,American IPA,12,American IPA
1,American Double / Imperial IPA,12,American IPA
2,American Pale Ale (APA),12,American IPA
3,Extra Special / Strong Bitter (ESB),12,American IPA
4,American Porter,12,American IPA
5,Extra Special / Strong Bitter (ESB),12,American IPA
6,English Brown Ale,12,American IPA
7,American IPA,12,American IPA
8,American IPA,12,American IPA
9,English India Pale Ale (IPA),12,American IPA


In [None]:
# save the predictions file
df_final.to_csv('jasleen_mlr_test_predictions', index=False)

# 9. Test the App

#### If Docker is Running,  http://localhost:8080/docs

https://adsi-at2.herokuapp.com/docs

In [None]:
# either enter these values manually
brewery_name		            review_aroma	review_appearance	review_palate	review_taste	beer_abv       beer_style
Vecchio Birraio		                2	              2.5		            1.5	           1.5	        5	       Hefeweizen
Pacific Coast Brewing Company	    4	               4	                  4	            4	        10	       American Double / Imperial Stout
Caldera Brewing Company		     	4	               4		              4	            4	         7         American Strong Ale

https://adsi-at2.herokuapp.com/beer/type/prediction?name=Vecchio%20Birraio&aroma=2&appearance=2.5&palate=1.5&taste=1.5&volume=5

https://adsi-at2.herokuapp.com/beer/type/prediction?name=Caldera%20Brewing%20Company%09&aroma=4&appearance=4&palate=4&taste=4&volume=7

https://adsi-at2.herokuapp.com/beer/type/prediction?name=Pacific%20Coast%20Brewing%20Company&aroma=4&appearance=4&palate=4&taste=4&volume=10