###  Create random set of Wine Features and run a Predictor Model

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

# scaler options
# https://scikit-learn.org/stable/modules/preprocessing.html
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler

# model options
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.gaussian_process import GaussianProcessRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from ml_metric_utils import  ordinal_predictions, regression_metrics

import random
import joblib


### Determine Min Max of Features from the source data set

##### load White wine source data set

In [2]:
# white_datafile = os.path.join("..", "data", "sourcedata", "winequality-white.csv")
# print(white_datafile)

# white_df = pd.read_csv(white_datafile, delimiter=";")
# white_df.sort_values(by=['pH'], ascending=True)

##### load Red wine dataset

In [3]:
# Location of the red-wine source data, relative to the notebook's working directory.
datafile = os.path.join("..", "data", "sourcedata", "winequality-red.csv")
print(datafile)

# The source file is read with ";" as the field delimiter.
red_df = pd.read_csv(datafile, delimiter=";")

# Preview the rows ordered from highest to lowest quality.
# Display only: the sorted result is not assigned, so red_df keeps its original order.
red_df.sort_values(by=['quality'], ascending=False)

..\data\sourcedata\winequality-red.csv


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
495,10.7,0.350,0.53,2.60,0.070,5.0,16.0,0.99720,3.15,0.65,11.0,8
1403,7.2,0.330,0.33,1.70,0.061,3.0,13.0,0.99600,3.23,1.10,10.0,8
390,5.6,0.850,0.05,1.40,0.045,12.0,88.0,0.99240,3.56,0.82,12.9,8
1061,9.1,0.400,0.50,1.80,0.071,7.0,16.0,0.99462,3.21,0.69,12.5,8
1202,8.6,0.420,0.39,1.80,0.068,6.0,12.0,0.99516,3.35,0.69,11.7,8
...,...,...,...,...,...,...,...,...,...,...,...,...
690,7.4,1.185,0.00,4.25,0.097,5.0,14.0,0.99660,3.63,0.54,10.7,3
1478,7.1,0.875,0.05,5.70,0.082,3.0,14.0,0.99808,3.40,0.52,10.2,3
899,8.3,1.020,0.02,3.40,0.084,6.0,11.0,0.99892,3.48,0.49,11.0,3
1299,7.6,1.580,0.00,2.10,0.137,5.0,9.0,0.99476,3.50,0.40,10.9,3


In [4]:
# Work on the red-wine data; the feature frame is everything except the "quality" target column.
wine_df = red_df
wineX_df = wine_df.drop(columns=["quality"])

##### Determine Min-Max of Features

In [5]:
# wineX_df.min() 
#wineX_df.max()

In [6]:
# Per-feature summary, one row per feature column: observed minimum, maximum and mean.
wineX_minmax_df = pd.DataFrame({
    'min': wineX_df.min(),
    'max': wineX_df.max(),
    'mean': wineX_df.mean(),
})
wineX_minmax_df

Unnamed: 0,min,max,mean
fixed acidity,4.6,15.9,8.319637
volatile acidity,0.12,1.58,0.527821
citric acid,0.0,1.0,0.270976
residual sugar,0.9,15.5,2.538806
chlorides,0.012,0.611,0.087467
free sulfur dioxide,1.0,72.0,15.874922
total sulfur dioxide,6.0,289.0,46.467792
density,0.99007,1.00369,0.996747
pH,2.74,4.01,3.311113
sulphates,0.33,2.0,0.658149


###### Generate 100,000 random runs of features within the (buffered) min-max range

In [7]:
# Seed both random sources: Python's `random` module and NumPy's global RNG.
# genRandomArray draws from np.random.uniform, so seeding `random` alone does not
# make the generated runs reproducible.
random.seed(778)
np.random.seed(778)

In [8]:
# min = 0.1200
# max = 1.5800

# buff_min = round(0.9 * min, 5)
# buff_max = round(1.1 * max, 5)

# gen_min = round(random.uniform(buff_min, buff_max), 5)
# gen_max = gen_min
# for i in np.arange(1000000) :
#     num = round(random.uniform(buff_min, buff_max), 5)
#     if num < gen_min : gen_min = num
#     if num > gen_max : gen_max = num
#     #print(num)

# print(f"(min, max) = ({min}, {max})")
# print(f"(buff_min, buff_max) = ({buff_min}, {buff_max})")
# print(f"(gen_min, gen_max) = ({gen_min}, {gen_max})")

In [9]:
def genRandomArray(min, max, size=100000, ndigits=5, rng=None) :
    """Draw `size` uniform random values from a range slightly wider than [min, max].

    The range is widened multiplicatively by 10% at each end, always away from the
    interval: a non-negative `min` is scaled by 0.9 and a negative one by 1.1, and
    the mirror image applies to `max`. Because the buffer is multiplicative, a
    bound of exactly 0 receives no buffer.

    Parameters
    ----------
    min, max : float
        Observed lower / upper bound of the feature. (The names shadow the
        built-ins inside this function; they are kept so existing callers that
        pass them by keyword continue to work.)
    size : int, default 100000
        Number of values to generate.
    ndigits : int, default 5
        Number of decimal places the generated values are rounded to.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global `np.random` state is
        used, so reproducibility then depends on `np.random.seed`.

    Returns
    -------
    numpy.ndarray
        1-D array of shape `(size,)`.
    """
    # Scaling a negative bound by 0.9 / 1.1 would move it *into* the interval,
    # so the factors are swapped for negative bounds to keep widening the range.
    low_factor = 0.9 if min >= 0 else 1.1
    high_factor = 1.1 if max >= 0 else 0.9

    # The buffered bounds are always rounded to 5 places, independent of ndigits.
    buff_min = round(low_factor * min, 5)
    buff_max = round(high_factor * max, 5)

    sampler = np.random if rng is None else rng
    values_array = np.round(sampler.uniform(buff_min, buff_max, size=(size,)), ndigits)
    return values_array
 

In [10]:
# Smoke test of the generator: 30 values for the bounds 0.0 and 5.0, rounded to 3 decimals.
arr = genRandomArray(0.0, 5.0, size=30, ndigits=3)
arr

array([1.472, 1.866, 0.08 , 3.546, 1.388, 1.969, 4.388, 1.534, 4.774,
       2.46 , 0.636, 0.624, 2.512, 1.038, 4.55 , 0.959, 4.138, 4.788,
       1.928, 4.349, 1.421, 4.03 , 2.862, 2.397, 4.316, 4.47 , 2.803,
       5.252, 3.748, 0.743])

In [11]:
# run_df = pd.DataFrame(columns = wine_minmax_df.index.values)
# run_df

# run_df['fixed acidity'] = arr
# run_df.head()

In [12]:
# Build the simulated feature matrix: one column of random values per source feature.
size = 100000
ndigits = 5
runX_df = pd.DataFrame(columns = wineX_minmax_df.index.values)

# Visit the features in index order and fill each column from that feature's min/max row.
for feature, bounds in wineX_minmax_df.iterrows() :
    print(feature)
    runX_df[feature] = genRandomArray(bounds['min'], bounds['max'], size, ndigits)
    print("-----")

runX_df.head()

fixed acidity
-----
volatile acidity
-----
citric acid
-----
residual sugar
-----
chlorides
-----
free sulfur dioxide
-----
total sulfur dioxide
-----
density
-----
pH
-----
sulphates
-----
alcohol
-----


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,6.16446,0.65057,0.20184,5.71892,0.38033,71.5932,19.84775,0.94029,2.62381,1.28577,7.96969
1,4.88336,1.47224,0.35281,8.56891,0.27744,6.41023,225.07456,0.89275,3.63091,0.43773,10.80377
2,15.60216,1.08783,0.24824,12.27159,0.32392,43.44617,303.75837,1.01926,2.97618,0.85063,14.93101
3,6.33225,1.57677,0.44057,1.38408,0.63231,69.70927,213.99015,1.09275,3.88098,1.96819,11.91176
4,6.39043,0.44239,0.22265,4.87775,0.62714,37.29608,219.34475,0.96288,2.5055,1.30681,16.37453


In [25]:
# Confirm the dimensions of the generated frame (rows, feature columns).
runX_df.shape

(100000, 11)

#### Load a predictor model

In [13]:
#ls saved_models

In [14]:
# Paths of the persisted estimator and its y / X scalers; the three files share the
# "GaussianProcessRegressor" stem and differ only in the suffix before ".sav".
model_filename, model_yscaler_filename, model_Xscaler_filename = (
    os.path.join(".", "saved_models", f"GaussianProcessRegressor{suffix}.sav")
    for suffix in ("", "_yscaler", "_xscaler")
)

In [15]:
# Restore the estimator, the y scaler and the X scaler, in that order.
# NOTE: joblib.load unpickles the file contents, so only load files from a trusted source.
loaded_model, loaded_yscaler, loaded_Xscaler = (
    joblib.load(saved_path)
    for saved_path in (model_filename, model_yscaler_filename, model_Xscaler_filename)
)

In [16]:
# Show the loaded estimator's repr to confirm which model and settings were restored.
loaded_model

GaussianProcessRegressor(alpha=0.1, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=False,
                         optimizer='fmin_l_bfgs_b', random_state=None)

In [17]:
# Show the restored target (y) scaler and its settings.
loaded_yscaler

PowerTransformer(copy=True, method='yeo-johnson', standardize=True)

In [18]:
# Show the restored feature (X) scaler and its settings.
loaded_Xscaler

PowerTransformer(copy=True, method='yeo-johnson', standardize=True)

#### Scale the generated run X features df

In [19]:
# Apply the restored X scaler to the generated feature rows.
# NOTE: despite the `_df` suffix, the recorded output shows the result displayed as a NumPy array.
runX_scaled_df = loaded_Xscaler.transform(runX_df)
runX_scaled_df

array([[-1.46201857,  0.74442055, -0.27455425, ..., -5.17708142,
         2.46095647, -4.2103555 ],
       [-2.86629119,  3.44735504,  0.50466585, ...,  1.95480323,
        -2.05647262,  0.55964717],
       [ 2.84362852,  2.43777235, -0.02203958, ..., -2.32257982,
         1.31503827,  2.40101006],
       ...,
       [ 3.06906893,  3.87969429,  0.65458158, ..., -0.83221591,
         3.02454006,  0.874866  ],
       [ 3.08842992, -0.07925824,  1.249671  , ..., -4.17593446,
         2.51959432, -2.47428056],
       [-2.46868151,  2.69642071, -0.11219397, ..., -1.0975553 ,
         1.80066184,  2.51093783]])

#### Generate predictions on the 100,000 runs

In [20]:
# Run the loaded model on the scaled, randomly generated feature rows.
predictions = loaded_model.predict(runX_scaled_df)

# Turn the raw model outputs into ordinal quality values via the project helper,
# which also receives the y scaler.
ordPredictions = ordinal_predictions(predictions, loaded_yscaler)

# Copy of the generated features with the predicted quality appended as an extra column.
predict_df = runX_df.assign(**{"predicted quality": ordPredictions})

# Display ordered by predicted quality, highest first (predict_df itself is not reordered).
predict_df.sort_values(by='predicted quality', ascending=False)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,predicted quality
0,6.16446,0.65057,0.20184,5.71892,0.38033,71.59320,19.84775,0.94029,2.62381,1.28577,7.96969,6.0
66650,9.20542,0.50569,0.49036,1.93484,0.08759,44.46255,293.67878,0.92492,3.51975,1.38972,15.57515,6.0
66672,7.28176,0.19249,0.56463,16.16263,0.07511,17.79724,171.47055,0.93078,3.77970,1.91978,13.21628,6.0
66671,5.97767,0.55343,1.06581,14.82509,0.50814,34.86926,251.20405,0.94735,3.25792,2.06155,9.71824,6.0
66670,15.44569,1.16019,0.83787,10.20677,0.12573,59.13374,54.35504,1.06117,2.52477,0.95682,10.80680,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
33331,11.06387,0.41212,0.88298,15.08449,0.24206,6.63087,185.94571,0.91086,3.36863,1.98959,11.72421,6.0
33330,5.68350,0.61647,0.67481,15.38767,0.61728,77.31754,67.98282,0.95932,4.06699,1.27597,10.07454,6.0
33329,12.98875,0.63694,0.54806,6.63009,0.25837,53.50361,282.68875,0.99568,3.45319,0.32144,8.24235,6.0
33328,17.42332,1.43246,0.24038,4.94440,0.10383,51.72253,27.69196,1.06884,2.77670,0.52039,12.94465,6.0


In [21]:
# Count how many generated rows fall into each predicted quality value.
# NOTE(review): the recorded output shows a single value (6.0) for all 100000 rows;
# worth checking whether the sampled points lie too far from the training data — TODO confirm.
predict_df['predicted quality'].value_counts()

6.0    100000
Name: predicted quality, dtype: int64

In [22]:
# Confirm the dimensions after adding the prediction column (rows, features + 1).
predict_df.shape

(100000, 12)

##### filter and sort the predictions (as needed)

In [23]:
# Last ten rows after ordering by predicted quality, highest first (display only).
predict_df.sort_values(by='predicted quality', ascending=False).iloc[-10:]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,predicted quality
33336,11.32689,0.28087,0.37046,6.00856,0.18114,29.67241,316.56785,0.90543,2.74282,0.9651,16.24764,6.0
33335,5.97083,0.649,0.71405,10.30306,0.18739,70.92884,43.06386,0.99513,4.00308,0.89005,9.03862,6.0
33334,17.10396,1.25254,1.00762,14.61034,0.44051,56.4285,291.01466,0.92383,3.12858,1.05987,13.23511,6.0
33333,4.36038,0.16801,0.91118,15.069,0.58303,28.32439,144.50934,0.93917,3.24321,1.4763,12.4814,6.0
33332,6.70243,1.26995,0.05268,3.85289,0.57318,27.14072,178.68016,0.9952,2.69552,0.51301,16.27988,6.0
33331,11.06387,0.41212,0.88298,15.08449,0.24206,6.63087,185.94571,0.91086,3.36863,1.98959,11.72421,6.0
33330,5.6835,0.61647,0.67481,15.38767,0.61728,77.31754,67.98282,0.95932,4.06699,1.27597,10.07454,6.0
33329,12.98875,0.63694,0.54806,6.63009,0.25837,53.50361,282.68875,0.99568,3.45319,0.32144,8.24235,6.0
33328,17.42332,1.43246,0.24038,4.9444,0.10383,51.72253,27.69196,1.06884,2.7767,0.52039,12.94465,6.0
99999,5.20502,1.17488,0.23141,1.54851,0.62364,16.27579,257.24686,1.03447,3.14632,0.98468,15.49174,6.0


##### save the run file for visualization in Tableau

In [24]:
# Destination of the run file that is visualized in Tableau.
predict_file_path = os.path.join("..", "data", "appdata", "redwine_monkey_predictions.csv")

# Make sure the output folder exists so the export does not fail on a fresh checkout.
os.makedirs(os.path.dirname(predict_file_path), exist_ok=True)

# Write the features and predictions with a header row and without the row index.
predict_df.to_csv(predict_file_path, index=False, header=True)