### Generate random ("monkey") chemical profiles and predict red wine quality with a previously saved regressor model.

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

# scaler options
# https://scikit-learn.org/stable/modules/preprocessing.html
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler

# model options
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.gaussian_process import GaussianProcessRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from ml_metric_utils import  ordinal_predictions, regression_metrics

import random
import joblib


### Determine Min Max of Features from the source data set

##### load White wine source data set

In [2]:
# white_datafile = os.path.join("..", "data", "sourcedata", "winequality-white.csv")
# print(white_datafile)

# white_df = pd.read_csv(white_datafile, delimiter=";")
# white_df.sort_values(by=['pH'], ascending=True)

##### load Red wine dataset

In [3]:
# Path to the red wine source data, relative to this notebook's directory.
datafile = os.path.join("..", "data", "sourcedata", "winequality-red.csv")
print(datafile)

# The source file is semicolon-delimited.
red_df = pd.read_csv(datafile, delimiter=";")

# Only a cell's last expression is rendered, so the former mid-cell `red_df.head()`
# was a no-op and has been removed. Show the rows ordered from best to worst quality.
red_df.sort_values(by=['quality'], ascending=False)

..\data\sourcedata\winequality-red.csv


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
495,10.7,0.350,0.53,2.60,0.070,5.0,16.0,0.99720,3.15,0.65,11.0,8
1403,7.2,0.330,0.33,1.70,0.061,3.0,13.0,0.99600,3.23,1.10,10.0,8
390,5.6,0.850,0.05,1.40,0.045,12.0,88.0,0.99240,3.56,0.82,12.9,8
1061,9.1,0.400,0.50,1.80,0.071,7.0,16.0,0.99462,3.21,0.69,12.5,8
1202,8.6,0.420,0.39,1.80,0.068,6.0,12.0,0.99516,3.35,0.69,11.7,8
...,...,...,...,...,...,...,...,...,...,...,...,...
690,7.4,1.185,0.00,4.25,0.097,5.0,14.0,0.99660,3.63,0.54,10.7,3
1478,7.1,0.875,0.05,5.70,0.082,3.0,14.0,0.99808,3.40,0.52,10.2,3
899,8.3,1.020,0.02,3.40,0.084,6.0,11.0,0.99892,3.48,0.49,11.0,3
1299,7.6,1.580,0.00,2.10,0.137,5.0,9.0,0.99476,3.50,0.40,10.9,3


In [4]:
# Work on the red wine data; the feature frame is every column except the target.
wine_df = red_df
wineX_df = wine_df.drop(columns=["quality"])

##### Determine Min-Max of Features

In [5]:
# wineX_df.min() 
#wineX_df.max()

In [6]:
# Per-feature summary (one row per feature) with the observed min, max and mean.
wineX_minmax_df = pd.DataFrame(
    {
        'min': wineX_df.min(),
        'max': wineX_df.max(),
        'mean': wineX_df.mean(),
    }
)
wineX_minmax_df

Unnamed: 0,min,max,mean
fixed acidity,4.6,15.9,8.319637
volatile acidity,0.12,1.58,0.527821
citric acid,0.0,1.0,0.270976
residual sugar,0.9,15.5,2.538806
chlorides,0.012,0.611,0.087467
free sulfur dioxide,1.0,72.0,15.874922
total sulfur dioxide,6.0,289.0,46.467792
density,0.99007,1.00369,0.996747
pH,2.74,4.01,3.311113
sulphates,0.33,2.0,0.658149


###### Generate 100,000 random runs of features within the min-max range (widened by a 10% buffer)

In [7]:
# Seed both random number generators so the generated runs are reproducible.
# The feature values are drawn with np.random.uniform, which is NOT affected by
# the standard-library random.seed(), so NumPy's global generator is seeded too.
random.seed(778)
np.random.seed(778)

In [8]:
# min = 0.1200
# max = 1.5800

# buff_min = round(0.9 * min, 5)
# buff_max = round(1.1 * max, 5)

# gen_min = round(random.uniform(buff_min, buff_max), 5)
# gen_max = gen_min
# for i in np.arange(1000000) :
#     num = round(random.uniform(buff_min, buff_max), 5)
#     if num < gen_min : gen_min = num
#     if num > gen_max : gen_max = num
#     #print(num)

# print(f"(min, max) = ({min}, {max})")
# print(f"(buff_min, buff_max) = ({buff_min}, {buff_max})")
# print(f"(gen_min, gen_max) = ({gen_min}, {gen_max})")

In [9]:
def genRandomArray(min, max, size=100000, ndigits=5) :
    """Draw uniformly distributed random values from a buffered [min, max] range.

    The range is widened by 10% of each bound's magnitude before sampling, so
    the generated values can fall slightly outside the observed min/max.

    Parameters
    ----------
    min, max : float
        Observed lower and upper bounds of the feature.
    size : int, default 100000
        Number of values to generate.
    ndigits : int, default 5
        Number of decimal places the generated values are rounded to.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``size`` drawn with ``np.random.uniform`` (NumPy's
        global generator; seed it with ``np.random.seed`` for reproducibility).
    """
    # Widen each bound AWAY from the interval regardless of its sign: a plain
    # 0.9 * min / 1.1 * max would shrink the range when a bound is negative.
    # For non-negative bounds this is exactly the original 0.9 / 1.1 scaling.
    lower_factor = 0.9 if min >= 0 else 1.1
    upper_factor = 1.1 if max >= 0 else 0.9
    buff_min = round(lower_factor * min, 5)
    buff_max = round(upper_factor * max, 5)

    values_array = np.round(np.random.uniform(buff_min, buff_max, size=(size,)), ndigits)
    return values_array
 

In [10]:
# Quick demo: 30 values rounded to 3 decimals for an observed range of 0.0 to 5.0.
arr = genRandomArray(min=0.0, max=5.0, size=30, ndigits=3)
arr

array([4.137e+00, 2.268e+00, 3.466e+00, 3.592e+00, 4.000e-03, 2.017e+00,
       3.658e+00, 4.782e+00, 1.497e+00, 1.405e+00, 4.023e+00, 2.130e+00,
       1.983e+00, 2.548e+00, 2.000e+00, 2.325e+00, 5.427e+00, 4.568e+00,
       2.393e+00, 2.238e+00, 4.580e+00, 3.200e-01, 4.276e+00, 2.703e+00,
       1.001e+00, 4.060e-01, 4.236e+00, 2.878e+00, 1.659e+00, 5.019e+00])

In [11]:
# run_df = pd.DataFrame(columns = wine_minmax_df.index.values)
# run_df

# run_df['fixed acidity'] = arr
# run_df.head()

In [12]:
# print(wine_minmax_df.index.values)
# print(list(wine_minmax_df.index))
# Number of random runs to generate and the decimal precision of each value.
size = 100000
ndigits = 5

# Generate one random column per feature, in the same order as the rows of
# wineX_minmax_df, then build the frame once instead of growing an empty one.
generated_columns = {}
for feature in wineX_minmax_df.index.values :
    print(feature)
    # Single .loc[row, column] lookup instead of chained indexing (.loc[row][column]).
    generated_columns[feature] = genRandomArray(
        wineX_minmax_df.loc[feature, 'min'],
        wineX_minmax_df.loc[feature, 'max'],
        size,
        ndigits,
    )
    print("-----")

runX_df = pd.DataFrame(generated_columns, columns=wineX_minmax_df.index.values)
runX_df.head()

fixed acidity
-----
volatile acidity
-----
citric acid
-----
residual sugar
-----
chlorides
-----
free sulfur dioxide
-----
total sulfur dioxide
-----
density
-----
pH
-----
sulphates
-----
alcohol
-----


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,16.88367,0.74275,0.59956,10.17939,0.28702,68.212,39.65329,0.89609,4.16375,0.49392,11.36459
1,10.38246,0.47589,0.04057,11.0212,0.5326,22.3314,134.16155,0.97684,4.24965,1.79703,12.06002
2,4.59328,0.70422,0.33878,14.18142,0.19556,39.07439,260.8167,0.89405,4.25018,1.5458,9.07117
3,11.71718,1.61685,0.33457,15.43051,0.4108,15.83092,80.32981,0.98984,4.25346,0.68553,11.69098
4,4.79794,1.54641,0.1753,7.65975,0.11071,67.40123,83.2166,1.00888,4.11079,1.49208,8.96391


#### Load a predictor model

In [13]:
#ls saved_models

In [14]:
# The model and its two scalers live in the same directory and share a name stem.
saved_models_dir = os.path.join(".", "saved_models")
model_stem = "GaussianProcessRegressor"

model_filename = os.path.join(saved_models_dir, model_stem + ".sav")
model_yscaler_filename = os.path.join(saved_models_dir, model_stem + "_yscaler.sav")
model_Xscaler_filename = os.path.join(saved_models_dir, model_stem + "_xscaler.sav")

In [15]:
# Load the fitted model, then its y scaler and X scaler, in that order.
# NOTE: joblib.load unpickles its input, so only load files from a trusted source.
loaded_model, loaded_yscaler, loaded_Xscaler = (
    joblib.load(saved_path)
    for saved_path in (model_filename, model_yscaler_filename, model_Xscaler_filename)
)

In [16]:
# Display the loaded model's repr to confirm which estimator and settings were saved.
loaded_model

GaussianProcessRegressor(alpha=0.1, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=False,
                         optimizer='fmin_l_bfgs_b', random_state=None)

In [17]:
# Display the loaded y scaler's repr to confirm the transformer and its settings.
loaded_yscaler

PowerTransformer(copy=True, method='yeo-johnson', standardize=True)

In [18]:
# Display the loaded X scaler's repr to confirm the transformer and its settings.
loaded_Xscaler

PowerTransformer(copy=True, method='yeo-johnson', standardize=True)

#### Scale the generated run X features df

In [19]:
# Apply the already-fitted X scaler to the generated runs (transform only, no refit).
# NOTE: despite the "_df" suffix, the displayed result is a NumPy array, not a DataFrame.
runX_scaled_df = loaded_Xscaler.transform(runX_df)
runX_scaled_df

array([[ 3.121768  ,  1.16927101,  1.56100123, ...,  4.72410743,
        -1.31322805,  0.99093896],
       [ 1.20058517, -0.20014352, -1.26210534, ...,  5.12418542,
         2.9559268 ,  1.41390309],
       [-3.25712598,  0.99706603,  0.4371379 , ...,  5.12661826,
         2.77183612, -1.61228029],
       ...,
       [-0.43815019, -0.94144521,  0.84800151, ..., -1.12367354,
         1.66863767,  0.72883357],
       [ 2.16453116,  0.758722  ,  2.56356305, ...,  5.115001  ,
         0.41877257, -1.29195381],
       [ 2.10775245,  3.74930513,  1.08801337, ...,  0.52985649,
         2.95464098,  0.80151576]])

#### Generate predictions on the 100,000 runs

In [20]:
# Predict quality for every generated run using the scaled features.
predictions = loaded_model.predict(runX_scaled_df)

# Convert the float predictions to ordinal quality values
# (presumably using the y scaler to undo the target scaling -- see ml_metric_utils).
ordPredictions = ordinal_predictions(predictions, loaded_yscaler)

# Build a frame holding the unscaled generated features plus the predicted quality.
predict_df = runX_df.assign(**{"predicted quality": ordPredictions})

# Show the runs ordered from highest to lowest predicted quality.
predict_df.sort_values(by="predicted quality", ascending=False)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,predicted quality
0,16.88367,0.74275,0.59956,10.17939,0.28702,68.21200,39.65329,0.89609,4.16375,0.49392,11.36459,6.0
66650,13.03925,0.10952,0.61511,12.88416,0.34516,19.35034,98.54569,1.07353,2.49382,2.11265,11.60189,6.0
66672,10.72284,1.62629,0.36210,13.53853,0.17017,49.91126,160.52223,1.06029,4.18656,1.49388,8.73792,6.0
66671,16.42691,1.71204,0.70622,14.22943,0.66001,3.10148,246.47987,1.09673,3.58928,1.07990,11.87830,6.0
66670,14.76127,0.11429,0.89458,10.01845,0.08725,9.44159,150.63057,0.94543,4.09096,2.16247,15.93727,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
33331,12.38302,1.11545,0.80968,14.21660,0.40821,28.76051,18.85405,0.93821,3.11836,1.09762,16.31352,6.0
33330,7.69600,1.25891,0.68897,1.17289,0.08699,13.69928,166.09371,1.01308,4.39723,0.96558,7.91104,6.0
33329,13.01052,1.31182,0.99282,11.23976,0.17896,48.85006,160.56204,0.96158,3.18085,1.85593,10.49312,6.0
33328,14.13985,0.77996,0.28447,9.93341,0.53587,70.86088,69.28076,0.96956,2.83216,2.11387,11.64182,6.0


In [21]:
# Count how many runs received each predicted quality value.
predict_df['predicted quality'].value_counts()

6.0    100000
Name: predicted quality, dtype: int64

In [24]:
# Sanity-check the frame dimensions: one row per run; feature columns plus the prediction column.
predict_df.shape

(100000, 12)

##### filter and sort the predictions (as needed)

In [22]:
# Sort by predicted quality (descending) and show the last 10 rows, i.e. the lowest predictions.
predict_df.sort_values(by='predicted quality', ascending=False).tail(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,predicted quality
33336,5.95855,1.55225,0.472,7.27771,0.49084,48.86732,160.89371,0.98291,3.84578,0.79207,12.3269,6.0
33335,9.34696,0.14227,0.63048,9.6166,0.3904,75.1702,198.26659,1.07804,3.6706,2.12712,14.33185,6.0
33334,9.39332,0.83248,0.46237,10.33133,0.1413,62.42028,125.6364,0.93642,4.18514,1.53253,15.83186,6.0
33333,4.18856,1.21424,0.47026,12.51988,0.52575,29.00759,34.14457,0.91599,3.05666,1.39189,7.67596,6.0
33332,15.30872,0.95337,0.3415,11.85315,0.3246,29.27972,250.33564,0.93388,3.24307,1.14312,14.76083,6.0
33331,12.38302,1.11545,0.80968,14.2166,0.40821,28.76051,18.85405,0.93821,3.11836,1.09762,16.31352,6.0
33330,7.696,1.25891,0.68897,1.17289,0.08699,13.69928,166.09371,1.01308,4.39723,0.96558,7.91104,6.0
33329,13.01052,1.31182,0.99282,11.23976,0.17896,48.85006,160.56204,0.96158,3.18085,1.85593,10.49312,6.0
33328,14.13985,0.77996,0.28447,9.93341,0.53587,70.86088,69.28076,0.96956,2.83216,2.11387,11.64182,6.0
99999,12.85688,1.61498,0.48238,9.45251,0.11704,38.22942,27.53648,0.97026,3.39298,1.79482,11.10328,6.0


##### save the run file for visualization in Tableau

In [23]:
# Destination for the generated runs and their predicted quality.
predict_file_path = os.path.join("..", "data", "appdata", "redwine_monkey_predictions.csv")

# Create the output directory if needed so the save does not fail
# after all the runs have already been generated and scored.
os.makedirs(os.path.dirname(predict_file_path), exist_ok=True)

# Write the column header but not the row index.
predict_df.to_csv(predict_file_path, index=False, header=True)