###  Create random set of Wine Features and run a Predictor Model

In [23]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

# scaler options
# https://scikit-learn.org/stable/modules/preprocessing.html
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler

# model options
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.gaussian_process import GaussianProcessRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from ml_metric_utils import  ordinal_predictions, regression_metrics

import keras
from keras import models
from keras import utils

import random
import joblib


### Determine Min Max of Features from the source data set

##### load White wine source data set

In [2]:
# white_datafile = os.path.join("..", "data", "sourcedata", "winequality-white.csv")
# print(white_datafile)

# white_df = pd.read_csv(white_datafile, delimiter=";")
# white_df.sort_values(by=['pH'], ascending=True)

##### load Red wine dataset

In [3]:
# Locate the red-wine CSV relative to the notebook's working directory.
dataset_dir = os.path.join("..", "..", "datasets")
datafile = os.path.join(dataset_dir, "winequality-red.csv")
print(datafile)

# The source file uses ';' as its field separator.
red_df = pd.read_csv(datafile, sep=";")
red_df

..\..\datasets\winequality-red.csv


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [4]:
# Show the wines ordered from highest to lowest quality.
# A bare `red_df.head()` used to precede this line; only a cell's last
# expression is displayed, so its result was silently discarded and the
# statement has been dropped.
red_df.sort_values(by=['quality'], ascending=False)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
495,10.7,0.350,0.53,2.60,0.070,5.0,16.0,0.99720,3.15,0.65,11.0,8
1403,7.2,0.330,0.33,1.70,0.061,3.0,13.0,0.99600,3.23,1.10,10.0,8
390,5.6,0.850,0.05,1.40,0.045,12.0,88.0,0.99240,3.56,0.82,12.9,8
1061,9.1,0.400,0.50,1.80,0.071,7.0,16.0,0.99462,3.21,0.69,12.5,8
1202,8.6,0.420,0.39,1.80,0.068,6.0,12.0,0.99516,3.35,0.69,11.7,8
...,...,...,...,...,...,...,...,...,...,...,...,...
690,7.4,1.185,0.00,4.25,0.097,5.0,14.0,0.99660,3.63,0.54,10.7,3
1478,7.1,0.875,0.05,5.70,0.082,3.0,14.0,0.99808,3.40,0.52,10.2,3
899,8.3,1.020,0.02,3.40,0.084,6.0,11.0,0.99892,3.48,0.49,11.0,3
1299,7.6,1.580,0.00,2.10,0.137,5.0,9.0,0.99476,3.50,0.40,10.9,3


In [5]:
# `wine_df` is the working data set; it is the same object as `red_df`, not a copy.
wine_df = red_df

# Feature frame: every column except the `quality` target.
wineX_df = wine_df.drop(columns=["quality"])

##### Determine Min-Max of Features

In [6]:
# wineX_df.min() 
#wineX_df.max()

In [7]:
# Per-feature summary (one row per feature) with its observed minimum,
# maximum and mean; built in a single constructor call.
wineX_minmax_df = pd.DataFrame(
    {
        'min': wineX_df.min(),
        'max': wineX_df.max(),
        'mean': wineX_df.mean(),
    },
    columns=['min', 'max', 'mean'],
)
wineX_minmax_df

Unnamed: 0,min,max,mean
fixed acidity,4.6,15.9,8.319637
volatile acidity,0.12,1.58,0.527821
citric acid,0.0,1.0,0.270976
residual sugar,0.9,15.5,2.538806
chlorides,0.012,0.611,0.087467
free sulfur dioxide,1.0,72.0,15.874922
total sulfur dioxide,6.0,289.0,46.467792
density,0.99007,1.00369,0.996747
pH,2.74,4.01,3.311113
sulphates,0.33,2.0,0.658149


###### Generate 100,000 random feature rows within the buffered min-max range (0.9 × min to 1.1 × max)

In [8]:
# Seed both random number generators. The feature sampler draws from NumPy's
# global generator (np.random.uniform), which `random.seed` does not affect,
# so NumPy must be seeded as well for the generated runs to be reproducible.
random.seed(778)
np.random.seed(778)

In [9]:
# min = 0.1200
# max = 1.5800

# buff_min = round(0.9 * min, 5)
# buff_max = round(1.1 * max, 5)

# gen_min = round(random.uniform(buff_min, buff_max), 5)
# gen_max = gen_min
# for i in np.arange(1000000) :
#     num = round(random.uniform(buff_min, buff_max), 5)
#     if num < gen_min : gen_min = num
#     if num > gen_max : gen_max = num
#     #print(num)

# print(f"(min, max) = ({min}, {max})")
# print(f"(buff_min, buff_max) = ({buff_min}, {buff_max})")
# print(f"(gen_min, gen_max) = ({gen_min}, {gen_max})")

In [10]:
def genRandomArray(min, max, size=100000, ndigits=5, rng=None) :
    """Draw uniform random values from a range padded 10% beyond [min, max].

    Parameters
    ----------
    min, max : float
        Nominal lower and upper bounds of the feature. (The names shadow the
        builtins inside this function; they are kept so existing keyword
        callers continue to work.)
    size : int
        Number of values to generate.
    ndigits : int
        Number of decimal places each generated value is rounded to.
    rng : object with a ``uniform(low, high, size=...)`` method, optional
        Random source, e.g. ``np.random.RandomState(seed)``. When omitted,
        NumPy's global generator is used, exactly as before.

    Returns
    -------
    numpy.ndarray
        1-D array of ``size`` values, rounded to ``ndigits`` decimals.
    """
    # Pad each bound outward by 10% of its magnitude. For non-negative bounds
    # this is the original 0.9 * min / 1.1 * max; for negative bounds the
    # factors are swapped, because 0.9 * min would otherwise move a negative
    # minimum toward zero and shrink the range instead of widening it.
    # A bound of exactly 0 receives no padding. The padded bounds themselves
    # are always rounded to 5 decimals, independent of `ndigits`.
    buff_min = round((0.9 if min >= 0 else 1.1) * min, 5)
    buff_max = round((1.1 if max >= 0 else 0.9) * max, 5)

    source = np.random if rng is None else rng
    values_array = np.round(source.uniform(buff_min, buff_max, size=(size,)), ndigits)
    return values_array
 

In [11]:
# Smoke-test the generator: 30 values, 3 decimals, nominal range 0.0 to 5.0.
arr = genRandomArray(min=0.0, max=5.0, size=30, ndigits=3)
arr

array([2.16 , 2.569, 1.595, 1.978, 1.651, 5.129, 5.092, 2.171, 1.749,
       5.246, 4.016, 1.02 , 4.04 , 0.993, 2.988, 0.726, 0.056, 1.788,
       0.678, 0.622, 2.346, 0.993, 2.994, 0.218, 1.751, 1.972, 0.725,
       3.916, 1.752, 0.796])

In [12]:
# run_df = pd.DataFrame(columns = wine_minmax_df.index.values)
# run_df

# run_df['fixed acidity'] = arr
# run_df.head()

In [13]:
# Sampling configuration: number of synthetic rows and decimals kept per value.
size = 100000
ndigits = 5

# Start from an empty frame whose columns are the feature names.
runX_df = pd.DataFrame(columns=wineX_minmax_df.index.values)

# Fill one column per feature with values sampled from that feature's range.
for index in wineX_minmax_df.index.values:
    print(index)
    feature_bounds = wineX_minmax_df.loc[index]
    rndmArray = genRandomArray(feature_bounds['min'], feature_bounds['max'], size, ndigits)
    runX_df[index] = rndmArray
    print("-----")

runX_df.head()

fixed acidity
-----
volatile acidity
-----
citric acid
-----
residual sugar
-----
chlorides
-----
free sulfur dioxide
-----
total sulfur dioxide
-----
density
-----
pH
-----
sulphates
-----
alcohol
-----


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,15.19508,1.10358,0.14102,12.24102,0.26431,10.55206,18.48759,1.04459,3.54733,1.35479,15.56626
1,7.28475,1.52646,0.48575,15.98331,0.55236,67.9251,80.19174,0.90016,3.272,0.66085,15.92233
2,7.65569,1.28692,0.89621,12.98432,0.39838,13.24929,24.79267,1.00191,3.13949,0.45378,11.5488
3,15.60801,1.21689,0.65451,6.80878,0.12413,45.585,231.50666,0.90559,3.41535,0.7812,15.6739
4,9.8569,1.32926,0.55036,6.94093,0.4089,4.52937,300.78027,1.08091,3.02943,0.42811,8.81274


In [14]:
# Confirm the generated frame's dimensions (rows, feature columns).
runX_df.shape

(100000, 11)

#### Load a predictor model

In [71]:
#ls saved_models

In [16]:
# model_filename = os.path.join(".", "saved_models", "GaussianProcessRegressor.sav")
# model_yscaler_filename = os.path.join(".", "saved_models", "GaussianProcessRegressor_yscaler.sav")
# model_Xscaler_filename = os.path.join(".", "saved_models", "GaussianProcessRegressor_xscaler.sav")

In [25]:
# All artifacts of the saved neural network share one directory and one stem.
saved_models_dir = os.path.join(".", "saved_models")

model_filename = os.path.join(saved_models_dir, "jc_redwine_first_NN.h5")
model_yscaler_filename = os.path.join(saved_models_dir, "jc_redwine_first_NN_yscaler.sav")
model_Xscaler_filename = os.path.join(saved_models_dir, "jc_redwine_first_NN_xscaler.sav")

print(model_filename)

.\saved_models\jc_redwine_first_NN.h5


In [26]:
# Load the saved Keras model and the two scaler objects stored next to it.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load scaler files that come from a trusted source.
loaded_model = models.load_model(model_filename)
loaded_yscaler = joblib.load(model_yscaler_filename)
loaded_Xscaler = joblib.load(model_Xscaler_filename)

In [27]:
# Display the loaded model object to confirm what was loaded.
loaded_model

<keras.engine.sequential.Sequential at 0x21a8ce131d0>

In [28]:
# Display the loaded y scaler and its settings.
loaded_yscaler

PowerTransformer(copy=True, method='yeo-johnson', standardize=True)

In [29]:
# Display the loaded X scaler and its settings.
loaded_Xscaler

PowerTransformer(copy=True, method='yeo-johnson', standardize=True)

#### Scale the generated run X features df

In [30]:
# Apply the loaded X scaler to the generated features before prediction.
# Despite the `_df` suffix, the displayed result is a NumPy array, not a DataFrame.
runX_scaled_df = loaded_Xscaler.transform(runX_df)
runX_scaled_df

array([[ 2.57442991,  2.6332635 , -0.61265408, ...,  1.49807093,
         2.43125817,  2.64847478],
       [-0.53100014,  3.83979922,  1.09981721, ..., -0.24320145,
         0.26599517,  2.71407206],
       [-0.25668806,  3.20406944,  2.52968591, ..., -1.13939031,
        -1.88518065,  1.15238424],
       ...,
       [ 0.42653853,  2.70183302,  0.79391841, ..., -0.34397732,
        -0.17109801, -3.24789859],
       [ 2.69721612,  0.72813759, -1.02023743, ..., -1.02118068,
        -1.47128191,  2.38704839],
       [ 2.48392341,  1.35607388, -0.85273838, ...,  2.16578088,
         2.82167914, -0.96512957]])

#### Generate predictions on the 100,000 runs

In [61]:
##  FOR REGRESSORS

# # make predictions from the model using the generated run data
# predictions = loaded_model.predict(runX_scaled_df)
# # round the prediction floats to ordinal values
# ordPredictions = ordinal_predictions(predictions, loaded_yscaler)

# # create a dataframe to hold the generated features and predicted values
# predict_df = runX_df.copy()
# predict_df["predicted quality"] = ordPredictions

# # sort by expected target value
# predict_df.sort_values(by='predicted quality', ascending=False)

In [65]:
##  FOR NN Classifier

# Predict one class index per generated row. `Sequential.predict_classes` has
# been removed from newer Keras/TensorFlow releases; taking the argmax over the
# per-class scores returned by `predict` is the replacement for a multi-class
# output layer and also works on the older Keras version.
predictions = np.argmax(loaded_model.predict(runX_scaled_df), axis=-1)

# No ordinal rounding or y-scaler inverse transform is applied on this path.
# NOTE(review): the class index is stored directly as the quality score, which
# assumes the network was trained with class index == quality - confirm
# against the training notebook.

# create a dataframe holding the generated features and the predicted values
predict_df = runX_df.copy()
predict_df["predicted quality"] = predictions

# sort by predicted quality, best first
predict_df.sort_values(by='predicted quality', ascending=False)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,predicted quality
0,15.19508,1.10358,0.14102,12.24102,0.26431,10.55206,18.48759,1.04459,3.54733,1.35479,15.56626,7
90176,13.49993,1.27806,0.64885,14.70866,0.11995,47.78011,162.00242,0.98262,3.28921,1.77841,14.62516,7
90181,5.97102,0.41715,0.33019,17.04567,0.18807,13.93287,17.25507,0.96762,3.74285,2.19580,10.22979,7
90189,14.32041,1.11615,0.88038,9.53528,0.09161,66.26594,287.77895,0.99497,3.42117,2.18604,11.50388,7
43424,15.67387,0.77116,0.64879,8.32759,0.32439,40.27974,229.79205,0.99731,3.48884,1.93425,14.70079,7
...,...,...,...,...,...,...,...,...,...,...,...,...
8772,14.38575,1.01724,0.35253,2.88891,0.44018,15.07970,235.87392,0.96442,4.11878,2.06826,11.16607,4
60599,16.37673,1.24967,0.67984,8.68626,0.24408,74.57658,228.75980,0.96065,3.77607,0.86087,14.18945,4
8774,16.35027,1.21685,0.99630,11.59849,0.46953,48.77357,75.15372,1.03293,4.30967,1.21039,9.59516,4
39005,12.49315,1.58823,0.25996,13.92245,0.27224,2.18454,259.75449,0.96144,4.02927,2.08232,11.37349,4


In [66]:
# Count how many generated rows fall into each predicted quality value.
predict_df['predicted quality'].value_counts()

6    47766
5    33490
4    15123
7     3621
Name: predicted quality, dtype: int64

In [67]:
# Confirm dimensions: the feature columns plus the added prediction column.
predict_df.shape

(100000, 12)

##### filter and sort the predictions (as needed)

In [70]:
#predict_df.sort_values(by='predicted quality', ascending=False).tail(10)

##### save the run file for visualization in Tableau

In [69]:
# Destination of the run file (features plus predicted quality) used for visualization.
predict_file_path = os.path.join("..", "data", "appdata", "redwine_monkey_predictions.csv")

# `to_csv` does not create missing directories, so make sure the target
# folder exists; `exist_ok=True` makes re-running this cell harmless.
os.makedirs(os.path.dirname(predict_file_path), exist_ok=True)

# Write the columns with a header row and without the positional index.
predict_df.to_csv(predict_file_path, index=False, header=True)