# BEER TYPE PREDICTION - ALL Variables

# 1. Load Packages

In [2]:
# Load the packages needed for Logistic Regression Modelling upfront
import pandas as pd
import numpy as np
#from chart_studio import plotly as plt
#import seaborn as sns
import category_encoders as ce
from numpy import array

#from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
#from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

#from sklearn.metrics import mean_squared_error as mse
#from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from joblib import dump

# 2. Load the data

In [3]:
# Load the training data set
data = pd.read_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/raw/beer_reviews.csv')

# 3. Explore the Data

In [32]:
# check rows and columns of the training set
data.shape

(1586614, 13)

In [33]:
data.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1586614 non-null  int64  
 1   brewery_name        1586599 non-null  object 
 2   review_time         1586614 non-null  int64  
 3   review_overall      1586614 non-null  float64
 4   review_aroma        1586614 non-null  float64
 5   review_appearance   1586614 non-null  float64
 6   review_profilename  1586266 non-null  object 
 7   beer_style          1586614 non-null  object 
 8   review_palate       1586614 non-null  float64
 9   review_taste        1586614 non-null  float64
 10  beer_name           1586614 non-null  object 
 11  beer_abv            1518829 non-null  float64
 12  beer_beerid         1586614 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [35]:
data.columns

Index(['brewery_id', 'brewery_name', 'review_time', 'review_overall',
       'review_aroma', 'review_appearance', 'review_profilename', 'beer_style',
       'review_palate', 'review_taste', 'beer_name', 'beer_abv',
       'beer_beerid'],
      dtype='object')

In [36]:
data.describe()

Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_beerid
count,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1518829.0,1586614.0
mean,3130.099,1224089000.0,3.815581,3.735636,3.841642,3.743701,3.79286,7.042387,21712.79
std,5578.104,76544270.0,0.7206219,0.6976167,0.6160928,0.6822184,0.7319696,2.322526,21818.34
min,1.0,840672000.0,0.0,1.0,0.0,1.0,1.0,0.01,3.0
25%,143.0,1173224000.0,3.5,3.5,3.5,3.5,3.5,5.2,1717.0
50%,429.0,1239203000.0,4.0,4.0,4.0,4.0,4.0,6.5,13906.0
75%,2372.0,1288568000.0,4.5,4.0,4.0,4.0,4.5,8.5,39441.0
max,28003.0,1326285000.0,5.0,5.0,5.0,5.0,5.0,57.7,77317.0


In [37]:
# count the beer types in the data set
data['beer_style'].nunique()

104

In [38]:
# check the spread of observations
data['beer_style'].value_counts()

American IPA                        117586
American Double / Imperial IPA       85977
American Pale Ale (APA)              63469
Russian Imperial Stout               54129
American Double / Imperial Stout     50705
                                     ...  
Gose                                   686
Faro                                   609
Roggenbier                             466
Kvass                                  297
Happoshu                               241
Name: beer_style, Length: 104, dtype: int64

# 4. Transform the Data

## 4.1 Create a copy of the raw dataset

In [9]:
# create a copy of dataset for transformations
df = data.copy()

df.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


## 4.2 Drop the 2 Identifier columns and save in the interim folder

In [11]:
df.drop('brewery_id', axis=1, inplace=True)

In [12]:
df.drop('beer_beerid', axis = 1, inplace=True)

In [13]:
df.head()

Unnamed: 0,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv
0,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0
1,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2
2,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5
3,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0
4,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7


In [14]:
df.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/interim/beer_reviews_noIDcolumns.csv', index=False)

## Drop columns that have no prediction impact like reviewer details and 'beer_name'

## 2. Convert float columns to Int that are actually numeric

In [15]:
# Change float columns to int
# https://stackoverflow.com/questions/21291259/convert-floats-to-ints-in-pandas
df = df.astype({
    'review_overall': 'int', 
    'review_aroma': 'int', 
    'review_appearance': 'int',
    'review_palate': 'int',
    'review_taste': 'int'
    })

In [16]:
# this float column has null values so they need to be dealt with as well
# https://stackoverflow.com/questions/21291259/convert-floats-to-ints-in-pandas

df[("beer_abv")] = df[("beer_abv")].fillna(0.0).astype(int)

In [17]:
# Check if it worked
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype 
---  ------              --------------    ----- 
 0   brewery_name        1586599 non-null  object
 1   review_time         1586614 non-null  int64 
 2   review_overall      1586614 non-null  int64 
 3   review_aroma        1586614 non-null  int64 
 4   review_appearance   1586614 non-null  int64 
 5   review_profilename  1586266 non-null  object
 6   beer_style          1586614 non-null  object
 7   review_palate       1586614 non-null  int64 
 8   review_taste        1586614 non-null  int64 
 9   beer_name           1586614 non-null  object
 10  beer_abv            1586614 non-null  int64 
dtypes: int64(7), object(4)
memory usage: 133.2+ MB


In [18]:
df.describe()

Unnamed: 0,review_time,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv
count,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0
mean,1224089000.0,3.59571,3.51011,3.636085,3.533955,3.558914,6.417812
std,76544270.0,0.7539576,0.7357839,0.6569639,0.7228691,0.7643163,2.699519
min,840672000.0,0.0,1.0,0.0,1.0,1.0,0.0
25%,1173224000.0,3.0,3.0,3.0,3.0,3.0,5.0
50%,1239203000.0,4.0,4.0,4.0,4.0,4.0,6.0
75%,1288568000.0,4.0,4.0,4.0,4.0,4.0,8.0
max,1326285000.0,5.0,5.0,5.0,5.0,5.0,57.0


## 3. Standardise the Numeric columns in the dataset

In [19]:
df.columns

Index(['brewery_name', 'review_time', 'review_overall', 'review_aroma',
       'review_appearance', 'review_profilename', 'beer_style',
       'review_palate', 'review_taste', 'beer_name', 'beer_abv'],
      dtype='object')

In [20]:
# Create a list of numeric columns

num_cols = ['review_time', 'review_overall',
       'review_aroma', 'review_appearance',
       'review_palate', 'review_taste', 'beer_abv']

In [21]:
# Scale the numeric columns in the training dataset
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [22]:
df.head()

Unnamed: 0,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv
0,Vecchio Birraio,0.140161,-3.442781,-2.052383,-2.490374,stcules,Hefeweizen,-3.505414,-3.347979,Sausa Weizen,-0.525209
1,Vecchio Birraio,0.154496,-0.790111,-2.052383,-0.96822,stcules,English Strong Ale,-0.738661,-0.731261,Red Moon,-0.154773
2,Vecchio Birraio,0.154516,-0.790111,-2.052383,-0.96822,stcules,Foreign / Export Stout,-0.738661,-0.731261,Black Horse Black Beer,-0.154773
3,Vecchio Birraio,0.138951,-0.790111,-0.693288,-0.96822,stcules,German Pilsener,-2.122037,-0.731261,Sausa Pils,-0.525209
4,Caldera Brewing Company,0.909878,0.536224,0.665807,0.553934,johnmichaelsen,American Double / Imperial IPA,0.644716,0.577099,Cauldron DIPA,0.215664


## 4. One Hot Encode the Categorical Variables (except target variable)

In [24]:
cat_cols = ['brewery_name', 'review_profilename', 'beer_name']

#Create object for binary encoding
encoder= ce.BinaryEncoder(cols=cat_cols,return_df=True)

In [25]:
#Fit and Transform Data 
df_cleaned = encoder.fit_transform(df) 
df_cleaned

Unnamed: 0,brewery_name_0,brewery_name_1,brewery_name_2,brewery_name_3,brewery_name_4,brewery_name_5,brewery_name_6,brewery_name_7,brewery_name_8,brewery_name_9,...,beer_name_7,beer_name_8,beer_name_9,beer_name_10,beer_name_11,beer_name_12,beer_name_13,beer_name_14,beer_name_15,beer_abv
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-0.525209
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,-0.154773
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,-0.154773
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,-0.525209
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0.215664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586609,1,0,1,1,0,0,1,1,0,0,...,0,0,0,0,1,1,0,0,1,-0.525209
1586610,1,0,1,1,0,0,1,1,0,0,...,0,0,0,0,1,1,0,0,1,-0.525209
1586611,1,0,1,1,0,0,1,1,0,0,...,0,0,0,0,1,1,0,0,1,-0.525209
1586612,1,0,1,1,0,0,1,1,0,0,...,0,0,0,0,1,1,0,0,1,-0.525209


## 5. Now, Label Encode the Target Variable

In [26]:
df_cleaned.columns

Index(['brewery_name_0', 'brewery_name_1', 'brewery_name_2', 'brewery_name_3',
       'brewery_name_4', 'brewery_name_5', 'brewery_name_6', 'brewery_name_7',
       'brewery_name_8', 'brewery_name_9', 'brewery_name_10',
       'brewery_name_11', 'brewery_name_12', 'review_time', 'review_overall',
       'review_aroma', 'review_appearance', 'review_profilename_0',
       'review_profilename_1', 'review_profilename_2', 'review_profilename_3',
       'review_profilename_4', 'review_profilename_5', 'review_profilename_6',
       'review_profilename_7', 'review_profilename_8', 'review_profilename_9',
       'review_profilename_10', 'review_profilename_11',
       'review_profilename_12', 'review_profilename_13',
       'review_profilename_14', 'review_profilename_15', 'beer_style',
       'review_palate', 'review_taste', 'beer_name_0', 'beer_name_1',
       'beer_name_2', 'beer_name_3', 'beer_name_4', 'beer_name_5',
       'beer_name_6', 'beer_name_7', 'beer_name_8', 'beer_name_9',
       '

In [27]:
# https://stackoverflow.com/questions/50473381/scikit-learns-labelbinarizer-vs-onehotencoder

# integer encode
label_encoder = LabelEncoder()
df_cleaned['beer_style'] = label_encoder.fit_transform(df_cleaned['beer_style'])
print(df_cleaned['beer_style'])

0          65
1          51
2          59
3          61
4           9
           ..
1586609    85
1586610    85
1586611    85
1586612    85
1586613    85
Name: beer_style, Length: 1586614, dtype: int64


In [28]:
df_cleaned.head()

Unnamed: 0,brewery_name_0,brewery_name_1,brewery_name_2,brewery_name_3,brewery_name_4,brewery_name_5,brewery_name_6,brewery_name_7,brewery_name_8,brewery_name_9,...,beer_name_7,beer_name_8,beer_name_9,beer_name_10,beer_name_11,beer_name_12,beer_name_13,beer_name_14,beer_name_15,beer_abv
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-0.525209
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,-0.154773
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,-0.154773
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,-0.525209
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0.215664


## 6. Split up the Datasets

In [29]:
# extract the target variable out as y = target
target = pd.DataFrame(df_cleaned.pop('beer_style'))
target.shape

(1586614, 1)

In [30]:
target.head()

Unnamed: 0,beer_style
0,65
1,51
2,59
3,61
4,9


In [31]:
X_train, X_test, y_train, y_test = train_test_split(df_cleaned, target, test_size=0.3, random_state = 42)

In [57]:
X_train.shape

(1110629, 54)

In [58]:
X_test.shape

(475985, 54)

In [59]:
y_train.shape

(1110629, 1)

In [60]:
y_test.shape

(475985, 1)

## 7. Save the processed data files

In [32]:
# save the processed datasets
np.save('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/X_train', X_train)
np.save('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/X_test',  X_test)
np.save('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/y_train', y_train)
np.save('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/y_test',  y_test)

In [33]:
# save csv files too
X_train.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/X_train.csv', index=False)
X_test.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/X_test.csv', index=False)

In [34]:
y_train.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/y_train.csv', index=False)
y_test.to_csv('/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/data/processed/y_test.csv', index=False)

## 8. Reformat the Target Variable datasets

In [64]:
# this will make it easier to pass target variable through the model
# https://stackoverflow.com/questions/50473381/scikit-learns-labelbinarizer-vs-onehotencoder
y_train = np.array(y_train).ravel()

In [104]:
y_test = np.array(y_test).ravel()
y_test

array([ 4, 35, 47, ..., 60, 45, 85])

# DO NOT RUN
# Run Grid Search - getting same results for shorter runtime so skip this

In [112]:
# if the solver is not specified, lr defaults to solver='lbfgs' and throws an error
mlr = LogisticRegression(multi_class="multinomial", solver='lbfgs') #,class_weight='balanced')

mlr_params = {
    'C': [.001, .01, 0.1, 0.5, 1, 5, 10],
    'penalty': ['l1', 'l2']
}

model = GridSearchCV(mlr, param_grid=mlr_params, cv=3, verbose = 1)

In [113]:
model.fit(X_train, y_train)
# ran for 90 min

Fitting 3 folds for each of 14 candidates, totalling 42 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(cv=3, estimator=LogisticRegression(multi_class='multinomial'),
             param_grid={'C': [0.001, 0.01, 0.1, 0.5, 1, 5, 10],
                         'penalty': ['l1', 'l2']},
             verbose=1)

In [115]:
print(f'scaled')
print(f'CrossVal Score: {model.best_score_}')
print(f'Training Score: {model.score(X_train, y_train)}')
print(f'test Score: {model.score(X_test, y_test)}')
print(model.best_params_)

scaled
CrossVal Score: 0.25407044123698586
Training Score: 0.25415057593489815
test Score: 0.2538903536876162
{'C': 0.5, 'penalty': 'l2'}


# Train a Multinomial Logistic Regression Model

In [35]:
X_train.head()

Unnamed: 0,brewery_name_0,brewery_name_1,brewery_name_2,brewery_name_3,brewery_name_4,brewery_name_5,brewery_name_6,brewery_name_7,brewery_name_8,brewery_name_9,...,beer_name_7,beer_name_8,beer_name_9,beer_name_10,beer_name_11,beer_name_12,beer_name_13,beer_name_14,beer_name_15,beer_abv
1173573,1,0,0,0,0,0,1,0,0,0,...,1,1,1,1,0,0,1,0,1,0.956537
417229,0,0,1,0,1,0,0,0,1,0,...,0,1,1,0,1,1,0,0,1,0.215664
1494477,1,0,1,0,1,0,0,1,1,1,...,0,1,0,1,1,0,1,0,0,-0.895646
41757,0,0,0,0,0,1,1,0,1,0,...,0,0,1,0,0,1,0,1,1,-0.525209
700935,0,1,0,0,1,0,1,1,0,1,...,0,0,1,0,0,0,1,1,1,0.5861


In [36]:
print(y_train)

         beer_style
1173573           9
417229          102
1494477          61
41757            50
700935           25
...             ...
259178           13
1414414          49
131932           89
671155            2
121958           44

[1110629 rows x 1 columns]


In [37]:
# https://machinelearningmastery.com/multinomial-logistic-regression-with-python/
# define the multinomial logistic regression model with a penalty
model = LogisticRegression(multi_class="multinomial", solver='lbfgs', max_iter=100, penalty='l2', C=1.0) #, class_weight='balanced')

# https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter
# Convergence warning below in 'model.fit' can be resolved by adding 'max_iter>100' here, as above

# https://stackoverflow.com/questions/61814494/what-is-this-warning-convergencewarning-lbfgs-failed-to-converge-status-1
# default max_iter = 100

In [38]:
# https://stackoverflow.com/questions/58313842/a-column-vector-y-was-passed-when-a-1d-array-was-expected-error-message
# Model was throwing a array warning so added 'np.ravel()' to the target dataset
model.fit(X_train, y_train)

# ran for over 10min with default max_iter=100 and no class_weight
# ran for 60min with 1000 iterations and balanced class_weight
# should have just tried 300 iters --> ran for 33min, but wasn't enough..

  y = column_or_1d(y, warn=True)


In [105]:
# Make predictions on the test dataset
test_pred = model.predict_proba(X_test)
test_pred

array([[3.01274807e-11, 3.15329283e-13, 7.28184671e-10, ...,
        7.98549740e-03, 1.17839012e-08, 5.44036728e-11],
       [1.86044397e-03, 5.31967708e-03, 2.03047002e-02, ...,
        5.57520141e-05, 1.80483620e-02, 1.42179381e-02],
       [3.35181594e-02, 3.92123721e-03, 3.92310877e-02, ...,
        7.35317376e-07, 2.64162896e-02, 1.74250516e-02],
       ...,
       [9.83324542e-04, 2.88604217e-04, 6.22734951e-03, ...,
        1.53775982e-05, 8.73679271e-03, 9.19605173e-04],
       [5.80935770e-03, 2.48869737e-03, 6.94655640e-02, ...,
        1.40329825e-07, 1.50137544e-02, 6.81565580e-02],
       [1.33512479e-03, 6.76439200e-05, 1.36810154e-02, ...,
        8.09292622e-05, 9.63843270e-03, 2.38736108e-02]])

# Evaluate Model Performance

In [None]:
# https://dataaspirant.com/implement-multinomial-logistic-regression-python/
print ("Multinomial Logistic regression Train Accuracy :: ", accuracy_score(y_train, model.predict(X_train)))
print ("Multinomial Logistic regression Test Accuracy :: ", accuracy_score(y_test, model.predict(X_test)))

# accuracy decreased from 25% to 18% when 4 additional parameters were added to the regressor

Multinomial Logistic regression Train Accuracy ::  0.25544893929476
Multinomial Logistic regression Test Accuracy ::  0.2551929157431432


In [94]:
y_true = np.argmax(y_test, axis=0)
y_true

54

In [None]:
obs = pd.DataFrame(X_train.iloc[0]).transpose()

In [None]:
model.predict(obs)

In [109]:
# Calculate auc scores for performance evaluation
# AUROC score is one of the preferred metrics for multi-class classification models evaluation
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
auc_score = roc_auc_score(y_test, test_pred, multi_class='ovr')

print(f'The Multinomial Logistic regression ROC AUC score is {auc_score}')

The Multinomial Logistic regression ROC AUC score is 0.8673030506928795


`First round Accuracy is 0.25 while AUROC is 0.86 -->`

`this suggests that the model is predciting positive classes correctly but not the negative classes(?)`

` it could be because `

`    1. the data wasn't balanced - tried to use 'class_weight' but it decreased accuracy to 18%`

`    2. the model isn't the right type `

`    3. flipping is required which would take accuracy to 75% `

`    4. Increasingly the max_iterations did not have any positive impact on the model accuracy`

`https://stackoverflow.com/questions/38387913/reason-of-having-high-auc-and-low-accuracy-in-a-balanced-dataset`

In [78]:
# *** save scaler in the models folder called "scaler.joblib" for use next time
dump(model, '/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/models/mlr_scaled_300iters.joblib')

['/Users/jasle1/Desktop/MDSI/ADSI/AT_2/adsi_at2/models/mlr_scaled_300iters.joblib']

# 7. Test the App

#### If Docker is Running,  http://localhost:8080/docs

#### otherwise, https://adsi-at2.herokuapp.com/docs

In [None]:
# either enter these values manually
brewery_name		            review_aroma	review_appearance	review_palate	review_taste	beer_abv       beer_style
Vecchio Birraio		                2	              2.5		            1.5	           1.5	        5	       Hefeweizen
Pacific Coast Brewing Company	    4	               4	                  4	            4	        10	       American Double / Imperial Stout
Caldera Brewing Company		     	4	               4		              4	            4	         7         American Strong Ale

#### OR Launch these links - for multiple predictors

https://adsi-at2.herokuapp.com/beer/type/multiple/predictors?name=Vecchio%20Birraio&aroma=2&appearance=2.5&palate=1.5&taste=1.5&volume=5

https://adsi-at2.herokuapp.com/beer/type/multiple/predictors?name=Pacific%20Coast%20Brewing%20Company&aroma=4&appearance=4&palate=4&taste=4&volume=10

https://adsi-at2.herokuapp.com/beer/type/multiple/predictors?name=Caldera%20Brewing%20Company&aroma=4&appearance=4&palate=4&taste=4&volume=7