In [1]:
import pandas as pd
import numpy as np

import os
import sys
sys.path.append(os.path.abspath('../'))

from src.data.utility import DataReader, BeerData

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

%load_ext autoreload
%autoreload 2

  import pandas.util.testing as tm


In [2]:
# Instantiate the custom data reader class
data_reader = DataReader()
# Load Raw Train Data
train_df = data_reader.read_data(BeerData.RAW)

### 1. View Original Data

In [152]:
train_df.info()
print("Raw Data Dimension", train_df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1586614 non-null  int64  
 1   brewery_name        1586599 non-null  object 
 2   review_time         1586614 non-null  int64  
 3   review_overall      1586614 non-null  float64
 4   review_aroma        1586614 non-null  float64
 5   review_appearance   1586614 non-null  float64
 6   review_profilename  1586266 non-null  object 
 7   beer_style          1586614 non-null  object 
 8   review_palate       1586614 non-null  float64
 9   review_taste        1586614 non-null  float64
 10  beer_name           1586614 non-null  object 
 11  beer_abv            1518829 non-null  float64
 12  beer_beerid         1586614 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB
Raw Data Dimension (1586614, 13)


In [3]:
brewey_name_list = pd.DataFrame(train_df['brewery_name'].unique(), columns=['Brewery_Name'])
#brewey_name_list.to_csv("../reports/brewery_name_list.csv")
brewey_name_list.shape

(5743, 1)

### 2. Exmine NULL values and Drop the rows with Null for brewery_name

In [15]:
train_df.isna().sum()

brewery_id                0
brewery_name             15
review_time               0
review_overall            0
review_aroma              0
review_appearance         0
review_profilename      348
beer_style                0
review_palate             0
review_taste              0
beer_name                 0
beer_abv              67785
beer_beerid               0
dtype: int64

In [16]:
### Exmine How many row has missing brewy names
brew_name_null = train_df[train_df['brewery_name'].isnull()]
brew_name_null.shape
## 15 rows missing for brewy-names  at brewery_id 1193 x 9 and 27 X 6

(15, 13)

In [17]:
### Drop the rows if "brewy_name" is NULL
### Select only non-null value of brewry_name
df_cleaned = train_df.copy()
df_cleaned = df_cleaned[df_cleaned['brewery_name'].notnull()]
# df_cleaned.head()

### 3. Select the features - Drop unpromising-looking features

In [18]:
# Follow the commonsense
col_to_drop = ['beer_beerid', 'brewery_id', "review_profilename", "beer_name", "review_time", "review_overall", "beer_abv"]
target_column = 'beer_style'

df_cleaned = df_cleaned.drop(col_to_drop, axis = 1)

#df_cleaned.head()
df_cleaned.isna().sum()

brewery_name         0
review_aroma         0
review_appearance    0
beer_style           0
review_palate        0
review_taste         0
dtype: int64

### 4. Data Pre-processing Pipeline 1 - Label Encoding to Target Column

In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from src.models.pytorch import New_LabelEncoder

pip = Pipeline(steps = [('l_encoder', New_LabelEncoder())])
df_cleaned['beer_style_encoded'] = pip.fit_transform(df_cleaned['beer_style'])

# Save the pipeline
from joblib import dump, load
dump(pip, '../models/pipeline/te_pipeline.sav')

# Drop the beer_style column
beer_style = df_cleaned.pop('beer_style')

df_cleaned['beer_style_encoded'].value_counts().reset_index()
#print(df_cleaned.isna().sum())

Unnamed: 0,index,beer_style_encoded
0,12,117584
1,9,85977
2,14,63469
3,89,54129
4,11,50705
...,...,...
99,62,686
100,56,609
101,88,466
102,72,297


In [50]:
df_cleaned.describe()

Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_beerid,beer_style_encoded
count,1586599.0,1586599.0,1586599.0,1586599.0,1586599.0,1586599.0,1586599.0,1518814.0,1586599.0,1586599.0
mean,3130.122,1224089000.0,3.815584,3.735638,3.841647,3.743705,3.792864,7.042396,21712.49,42.14239
std,5578.125,76544100.0,0.7206191,0.6976142,0.6160899,0.6822131,0.7319658,2.322532,21818.07,33.02083
min,1.0,840672000.0,0.0,1.0,0.0,1.0,1.0,0.01,3.0,0.0
25%,143.0,1173224000.0,3.5,3.5,3.5,3.5,3.5,5.2,1717.0,12.0
50%,429.0,1239202000.0,4.0,4.0,4.0,4.0,4.0,6.5,13906.0,31.0
75%,2372.0,1288568000.0,4.5,4.0,4.0,4.0,4.5,8.5,39441.0,74.0
max,28003.0,1326285000.0,5.0,5.0,5.0,5.0,5.0,57.7,77317.0,103.0


### 5. Data Processing Pipeline 2:  Perform scaling and categorical encoding

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

# encoder= ce.BaseNEncoder(cols=['brewery_name'],return_df=True,base=5)
# df_cleaned_encoded=encoder.fit_transform(df_cleaned)
# df_cleaned_encoded

obs  
all_features = ['brewery_name', 'review_aroma', 'review_appearance', 'review_palate', 'review_taste' ]
num_cols = ['review_aroma', 'review_appearance', 'review_palate', 'review_taste']
cat_cols = ['brewery_name']

num_transformer = Pipeline(steps = [('scaler', StandardScaler())])
cat_transformer = Pipeline(steps = [('base_n_encoder', ce.BaseNEncoder(cols=cat_cols,return_df=True,base=5) )])

pre_processor = ColumnTransformer(
    transformers = [  
       ('cat_cols', cat_transformer, cat_cols),
       ('num_cols', num_transformer, num_cols),
])

### Data Pre-processing pipe
dpp_pipe = Pipeline(
    steps =[
        ('pre_processor', pre_processor)
    ]
)

## Apply Pipeline Object
trained_encoder = dpp_pipe.fit(pipeline_test_data[all_features])
processed_data = trained_encoder.transform(pipeline_test_data[all_features])

##### Save the trained encoder to reuse in new data for future prediction
dump(trained_encoder, '../models/pipeline/trained_encoder.sav')

NameError: name 'obs' is not defined

In [52]:
features = ["review_aroma","review_appearance", "review_palate", "review_taste",
"brewery_name_0","brewery_name_1", "brewery_name_2","brewery_name_3", "brewery_name_4", "brewery_name_5","brewery_name_6",]

cleaned_processed_data = pd.DataFrame(data=processed_data, columns=features)
#cleaned_processed_data['beer_style'] = df_cleaned['beer_style_encoded']
cleaned_processed_data.head(5)

Unnamed: 0,review_aroma,review_appearance,review_palate,review_taste,brewery_name_0,brewery_name_1,brewery_name_2,brewery_name_3,brewery_name_4,brewery_name_5,brewery_name_6
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-2.487964,-2.177682,-3.288863,-3.132476
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.771235,-1.366111,-1.090136,-1.083199
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.771235,-1.366111,-1.090136,-1.083199
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.054506,-0.554541,-1.823045,-1.083199
4,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.09568,0.257029,0.375682,0.966078


### 6. Subset the dataset for experiment as the original dataset is huge in size ,1500K observations

In [53]:
from src.data.sets import split_sets_random, save_sets, load_sets, subset_x_y

features = ["review_aroma","review_appearance", "review_palate", "review_taste" ,
            "brewery_name_0","brewery_name_1", "brewery_name_2","brewery_name_3", "brewery_name_4", "brewery_name_5","brewery_name_6" ]

y_main, X_main = subset_x_y(cleaned_processed_data[features], df_cleaned['beer_style_encoded'], 0, 400000)

In [54]:
# Check the size of the Target Class Distribution in the subset Dataset
(unique, counts) = np.unique(y_main, return_counts=True)
frequencies = np.asarray((unique, counts)).T
# print(frequencies)
## Remark - the distribution is same as original

In [55]:
X_main['beer_style'] = df_cleaned['beer_style_encoded']
X_main.head()

Unnamed: 0,review_aroma,review_appearance,review_palate,review_taste,brewery_name_0,brewery_name_1,brewery_name_2,brewery_name_3,brewery_name_4,brewery_name_5,brewery_name_6,beer_style
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-2.487964,-2.177682,-3.288863,-3.132476,65
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.771235,-1.366111,-1.090136,-1.083199,51
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.771235,-1.366111,-1.090136,-1.083199,59
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.054506,-0.554541,-1.823045,-1.083199,61
4,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.09568,0.257029,0.375682,0.966078,9


### 7. Split the encoded and scaled 200k dataset into Train-Val-Test sets

In [57]:
pre_processed_df = cleaned_processed_data.copy()
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(X_main, "beer_style", test_ratio=0.2, to_numpy=True)

In [58]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(240000, 11)
(80000, 11)
(80000, 11)


###  8. Baseline Model

In [43]:
from src.models.null import NullModel

base_model = NullModel(target_type="classification")
y_base = base_model.fit_predict(y_train)

from src.models.performance import print_reg_perf
#print_reg_perf(y_base, y_train, set_name='Training')

### 9. Load Pytorch Dataset

In [59]:
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [60]:
(unique, counts) = np.unique(y_train, return_counts=True)
frequencies = np.asarray((unique, counts)).T
#print(frequencies)

### 10. Build Neural Net Model

In [124]:
import torch as torch
import torch.nn as nn
from src.models.pytorch import PytorchMultiClass
import torch.nn.functional as F
%load_ext autoreload
%autoreload 2

model = PytorchMultiClass(X_train.shape[1])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [104]:
from src.models.pytorch import get_device
device = get_device()
model.to(device)

PytorchMultiClass(
  (layer_1): Linear(in_features=11, out_features=80, bias=True)
  (layer_2): Linear(in_features=80, out_features=100, bias=True)
  (layer_out): Linear(in_features=100, out_features=104, bias=True)
  (softmax): Softmax(dim=1)
)

In [105]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [107]:
from src.models.pytorch import train_classification, test_classification

N_EPOCHS = 10
BATCH_SIZE = 150

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset, model=model, criterion=criterion, optimizer=optimizer, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(val_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

Epoch: 0
	(train)	|	Loss: 0.0305	|	Acc: 9.1%
	(valid)	|	Loss: 0.0305	|	Acc: 9.4%
Epoch: 1
	(train)	|	Loss: 0.0305	|	Acc: 9.1%
	(valid)	|	Loss: 0.0305	|	Acc: 9.4%
Epoch: 2
	(train)	|	Loss: 0.0305	|	Acc: 9.1%
	(valid)	|	Loss: 0.0305	|	Acc: 9.4%
Epoch: 3
	(train)	|	Loss: 0.0305	|	Acc: 9.1%
	(valid)	|	Loss: 0.0305	|	Acc: 9.4%
Epoch: 4
	(train)	|	Loss: 0.0305	|	Acc: 9.1%
	(valid)	|	Loss: 0.0305	|	Acc: 9.4%
Epoch: 5
	(train)	|	Loss: 0.0305	|	Acc: 9.1%
	(valid)	|	Loss: 0.0305	|	Acc: 9.4%
Epoch: 6
	(train)	|	Loss: 0.0305	|	Acc: 9.1%
	(valid)	|	Loss: 0.0305	|	Acc: 9.4%
Epoch: 7
	(train)	|	Loss: 0.0305	|	Acc: 9.1%
	(valid)	|	Loss: 0.0305	|	Acc: 9.4%
Epoch: 8
	(train)	|	Loss: 0.0305	|	Acc: 9.1%
	(valid)	|	Loss: 0.0305	|	Acc: 9.4%
Epoch: 9
	(train)	|	Loss: 0.0305	|	Acc: 9.1%
	(valid)	|	Loss: 0.0305	|	Acc: 9.4%


In [108]:
test_loss, test_acc =  test_classification(test_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)
print(f'\tLoss: {test_loss:4f}\t | \tAccuracy:{test_acc:.1f}')

	Loss: 0.030502	 | 	Accuracy:0.1


In [129]:
torch.save(model, "../models/pytorch_multi_beer_type_prediction_nn_pipeline.pt")

### 11. Unit Testing of API

In [122]:
import pandas as pd
import numpy as np
import torch as torch
import os
import sys
from joblib import dump, load
sys.path.append(os.path.abspath('../'))
from src.models.pytorch import get_device, predict

#### Load the saved Model
model = torch.load('../models/pytorch_multi_beer_type_prediction_nn_pipeline.pt', encoding='ascii')

### Unit Testing:  @app.post('/beer/type/many')

In [128]:
obs = pd.DataFrame({'brewery_name': ["Caldera Brewing Company", "Hello", "Piniavos Alutis", "Rio Salado Brewing Company"],
                       'review_aroma': [-1, 6, 7, 5],
                       'review_appearance': [3, 1, 4, 0],
                       'review_palate': [4,7, 2, 3],
                       'review_taste': [2,5, 1, 1]})
 
trained_encoder = load( '../models/pipeline/trained_encoder.sav')
obs_trans = trained_encoder.transform(obs)


device = get_device()
df_tensor= torch.Tensor(np.array(obs_trans)).to(device)

prediction = model(df_tensor).argmax(1)

le = load('../models/pipeline/te_pipeline.sav')
pred_name = le.inverse_transform(prediction.tolist())
pred_name

array(['Kölsch', 'Scottish Gruit / Ancient Herbed Ale', 'Weizenbock',
       'Chile Beer'], dtype=object)

### Unit Testing: @app.post('/beer/type/single')

In [117]:
from api.app.main import predict_single

result = predict_single("Yazoo Brewing Campany",  4,3 ,4,5)
print(result)

Lambic - Fruit


In [120]:
def convert_cr_to_dataframe(report_dict: {}) -> pd.DataFrame:
    """
    Converts the dictionary format of the Classification Report (CR) to a
    dataframe for easy of sorting
    :param report_dict: The dictionary returned by 
    sklearn.metrics.classification_report.
    :return: Returns a dataframe of the same information.
    """
    beer_style = list(report_dict.keys())
    beer_style.remove('accuracy')
    beer_style.remove('macro avg')
    beer_style.remove('weighted avg')
    precision = []
    recall = []
    f1 = []
    support = []
    for key, value in report_dict.items():
        if key not in ['accuracy', 'macro avg', 'weighted avg']:
            precision.append(value['precision'])
            recall.append(value['recall'])
            f1.append(value['f1-score'])
            support.append(value['support'])
    result = pd.DataFrame({'beer_style': beer_style,
                           'precision': precision,
                           'recall': recall,
                           'f1': f1,
                           'support': support})
    return result

### 12. Model Accuracy

In [125]:
from sklearn.metrics import classification_report


device = get_device()
test_df_tensor= torch.Tensor(np.array(X_test)).to(device)
test_df_tensor
    
# #  Make Prediction
prediction = model(test_df_tensor).argmax(1)
prediction

y_test
report_dict = classification_report(y_test,
                                    prediction,
                                    output_dict=True)
report_df = convert_cr_to_dataframe(report_dict)
report_df

Unnamed: 0,beer_style,precision,recall,f1,support
0,0,0.001435,0.002123,0.001712,471
1,1,0.010689,0.020702,0.014098,1739
2,2,0.000000,0.000000,0.000000,2074
3,3,0.000000,0.000000,0.000000,521
4,4,0.028182,0.036449,0.031787,1701
...,...,...,...,...,...
99,99,0.000000,0.000000,0.000000,676
100,100,0.002492,0.029091,0.004591,275
101,101,0.004644,0.023438,0.007752,128
102,102,0.000000,0.000000,0.000000,655


In [126]:
report_df.to_csv("../reports/nn_classification_report.csv")

In [138]:
print("Pandas version", pd.__version__)

Pandas version 1.1.5
