In [1]:
# DS Modules
import numpy as np
import pandas as pd

# Visualization modules
import matplotlib.pyplot as plt
import seaborn as sns

# Helpers
import helpers
import wrangle
import model

In this notebook we look at the wine-quality prediction problem through the lens of classification modeling. To tackle it we will take a few different approaches:

- Treating the quality score as a category
- Treating quality scores 7, 8, and 9 as a "good wine" category
- Treating quality scores less than 5 as a "bad wine" category


In [2]:
# Load the red-wine data and split it into three frames.
# tr is the frame the models are fit on and vr the one they are validated on (see the
# modeling cells); sr is presumably the held-out test split — TODO confirm the return
# order of train_test_validate_split.
r = wrangle.wrangle_data('red')
tr, sr, vr = helpers.prep.train_test_validate_split(r)

In [3]:
# Same load-and-split for the white-wine data; tw/sw/vw mirror the red tr/sr/vr frames
# (presumably train/test/validate in that order — TODO confirm against the helper).
w = wrangle.wrangle_data('white')
tw, sw, vw = helpers.prep.train_test_validate_split(w)

In [4]:
# Group the split frames so later cells can apply one transformation to several at once.
reds = [tr, sr, vr]
whites = [tw, sw, vw]
# All six frames, red splits first, then white splits.
frames = reds + whites

### Clusters

In [5]:
# Red wines: fit the flavor-profile model on the training frame only, then label all
# three red splits with it.
rclst = model.FlavorProfile()
rclst.fit(tr)
tr['flavor_profile'] = rclst.predict(tr)
sr['flavor_profile'] = rclst.predict(sr)
vr['flavor_profile'] = rclst.predict(vr)

# White wines: same procedure with a separate model. The white splits must be labelled
# by wclst (fitted on the white training frame), not by the red-fitted rclst.
wclst = model.FlavorProfile()
wclst.fit(tw)
tw['flavor_profile'] = wclst.predict(tw)
sw['flavor_profile'] = wclst.predict(sw)
vw['flavor_profile'] = wclst.predict(vw)

### Vinegar categories


In [6]:
# Bucket volatile acidity into three vinegar-flavor levels.
# pd.cut uses right-closed intervals by default: (-inf, 0.6], (0.6, 1.2], (1.2, inf).
bins = [-np.inf, 0.6, 1.2, np.inf]
labels = ['undetectable', 'noticeable', 'strong']

# Adds the column in place on every red and white split.
for df in frames:
    df['flavor_vinegar'] = pd.cut(df['volatile_acidity'], bins=bins, labels=labels)

### One hot encoding pipeline

In [7]:
# Column roles shared by the encoding and modeling cells.
target = 'quality'

# Engineered categorical columns; these are one-hot encoded.
cat_feats = ['flavor_profile', 'flavor_vinegar']

# Numeric measurement columns, used as features after scaling.
num_feats = [
    'fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
    'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
    'ph', 'sulphates', 'alcohol',
]

In [8]:
from sklearn.preprocessing import OneHotEncoder

# Learn the category levels from the red training frame only. With
# handle_unknown='ignore', a level not seen during fit is encoded as all zeros
# instead of raising an error.
encoder = OneHotEncoder(handle_unknown='ignore').fit(tr[cat_feats])

# Encoded categorical columns for the training frame (sparse; densified in later cells).
temp = encoder.transform(tr[cat_feats])

In [9]:
# Densify the encoded matrix just to eyeball the one-hot layout.
temp.todense()

matrix([[0., 0., 1., ..., 1., 0., 0.],
        [0., 1., 0., ..., 0., 0., 1.],
        [0., 1., 0., ..., 0., 0., 1.],
        ...,
        [0., 1., 0., ..., 0., 0., 1.],
        [1., 0., 0., ..., 0., 0., 1.],
        [1., 0., 0., ..., 0., 0., 1.]])

In [10]:
# Column names generated by the encoder; used below to label the encoded columns.
encoder.get_feature_names_out()

array(['flavor_profile_0', 'flavor_profile_1', 'flavor_profile_2',
       'flavor_profile_3', 'flavor_vinegar_noticeable',
       'flavor_vinegar_strong', 'flavor_vinegar_undetectable'],
      dtype=object)

In [11]:
# Wrap the dense one-hot matrix in a DataFrame labelled with the encoder's column names.
# The result carries a fresh 0..n-1 index, not tr's index.
cats = pd.DataFrame(temp.todense(),columns = encoder.get_feature_names_out())
cats

Unnamed: 0,flavor_profile_0,flavor_profile_1,flavor_profile_2,flavor_profile_3,flavor_vinegar_noticeable,flavor_vinegar_strong,flavor_vinegar_undetectable
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
810,0.0,0.0,1.0,0.0,0.0,0.0,1.0
811,1.0,0.0,0.0,0.0,0.0,0.0,1.0
812,0.0,1.0,0.0,0.0,0.0,0.0,1.0
813,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
# tr keeps its original, shuffled row labels, so it does not line up with cats' 0..n-1 index.
tr.index

Int64Index([  38,   54,  138,  779, 1559,  248,  261,  908,  640,  389,
            ...
             127,  970, 1302,   52, 1000,  152, 1156,  149,  413,  518],
           dtype='int64', length=815)

In [13]:
# Re-label cats with tr's index before concatenating: pd.concat(axis=1) aligns rows on
# index labels, so without this the numeric and one-hot rows would not match up.
xf = pd.concat([tr[num_feats], cats.set_index(tr.index)], axis=1)

In [14]:
# Train the scaler on the red training feature matrix only; the one-hot columns are
# part of that matrix, so they pass through the scaler as well.
scaler = helpers.prep.train_scaler(xf)

# NOTE(review): xf is overwritten with its scaled version, so re-running this cell on its
# own would train the scaler on already-scaled data. Re-run from the cell that builds xf.
xf = helpers.prep.scale_df(xf, scaler)

In [15]:
# Spot-check the scaled feature matrix.
xf.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,flavor_profile_0,flavor_profile_1,flavor_profile_2,flavor_profile_3,flavor_vinegar_noticeable,flavor_vinegar_strong,flavor_vinegar_undetectable
38,0.089286,0.82906,0.09,0.041096,0.239168,0.084507,0.045936,0.300459,0.598425,0.067485,0.215385,0.0,0.0,1.0,0.0,1.0,0.0,0.0
54,0.258929,0.299145,0.15,0.130137,0.131716,0.450704,0.236749,0.415138,0.338583,0.159509,0.276923,0.0,1.0,0.0,0.0,0.0,0.0,1.0
138,0.276786,0.34188,0.19,0.082192,0.081456,0.197183,0.349823,0.468654,0.464567,0.104294,0.169231,0.0,1.0,0.0,0.0,0.0,0.0,1.0
779,0.214286,0.307692,0.03,0.116438,0.07279,0.28169,0.303887,0.56422,0.598425,0.141104,0.215385,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1559,0.276786,0.376068,0.26,0.075342,0.079723,0.422535,0.441696,0.470183,0.370079,0.092025,0.230769,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [16]:
# Bundle the column roles into one config object that prep_frame() reads from.
config_obj = dict(
    target='quality',
    cat_feats=['flavor_profile', 'flavor_vinegar'],
    num_feats=[
        'fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
        'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
        'ph', 'sulphates', 'alcohol',
    ],
)

def prep_frame(df, encoder, scaler, config):
    '''Convert a raw dataframe into a dict with model-ready 'X' and 'y'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target column and every column named in ``config``.
    encoder : fitted encoder
        Must provide ``transform`` and ``get_feature_names_out`` (e.g. a fitted
        ``OneHotEncoder``). Its output may be sparse or already dense.
    scaler : object
        Passed through unchanged to ``helpers.prep.scale_df``.
    config : dict
        Needs the keys ``'target'``, ``'cat_feats'`` and ``'num_feats'``.

    Returns
    -------
    dict
        ``{'X': ..., 'y': ...}`` where ``X`` is whatever ``helpers.prep.scale_df``
        returns for the numeric columns followed by the encoded categorical columns,
        and ``y`` is ``df[config['target']]``.
    '''
    target = config['target']
    cat_feats = config['cat_feats']
    num_feats = config['num_feats']

    encoded = encoder.transform(df[cat_feats])
    # Sparse results expose toarray(); a dense ndarray (e.g. an encoder configured for
    # dense output) has no such method and is used as is.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()

    # Build the encoded block directly on df's index so the column-wise concat lines the
    # rows up by label rather than by position.
    cats = pd.DataFrame(encoded, index=df.index, columns=encoder.get_feature_names_out())
    features = pd.concat([df[num_feats], cats], axis=1)

    return {
        'X': helpers.prep.scale_df(features, scaler),
        'y': df[target],
    }

In [17]:
# Run the packaged pipeline on the red training frame; the X it returns should match the
# xf built step by step above.
d = prep_frame(tr, encoder, scaler, config_obj)
d['X']

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,flavor_profile_0,flavor_profile_1,flavor_profile_2,flavor_profile_3,flavor_vinegar_noticeable,flavor_vinegar_strong,flavor_vinegar_undetectable
38,0.089286,0.829060,0.09,0.041096,0.239168,0.084507,0.045936,0.300459,0.598425,0.067485,0.215385,0.0,0.0,1.0,0.0,1.0,0.0,0.0
54,0.258929,0.299145,0.15,0.130137,0.131716,0.450704,0.236749,0.415138,0.338583,0.159509,0.276923,0.0,1.0,0.0,0.0,0.0,0.0,1.0
138,0.276786,0.341880,0.19,0.082192,0.081456,0.197183,0.349823,0.468654,0.464567,0.104294,0.169231,0.0,1.0,0.0,0.0,0.0,0.0,1.0
779,0.214286,0.307692,0.03,0.116438,0.072790,0.281690,0.303887,0.564220,0.598425,0.141104,0.215385,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1559,0.276786,0.376068,0.26,0.075342,0.079723,0.422535,0.441696,0.470183,0.370079,0.092025,0.230769,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,0.250000,0.376068,0.03,0.061644,0.105719,0.338028,0.328622,0.376911,0.480315,0.104294,0.261538,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1156,0.339286,0.017094,0.51,0.058219,0.064125,0.619718,0.289753,0.395260,0.464567,0.239264,0.523077,1.0,0.0,0.0,0.0,0.0,0.0,1.0
149,0.312500,0.205128,0.44,0.130137,0.095321,0.140845,0.130742,0.568043,0.622047,0.147239,0.323077,0.0,1.0,0.0,0.0,0.0,0.0,1.0
413,0.464286,0.205128,0.53,0.397260,0.109185,0.070423,0.045936,0.652141,0.417323,0.276074,0.507692,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [18]:
# Prepare the red validation frame with the encoder and scaler trained on the training frame.
v = prep_frame(vr, encoder, scaler, config_obj)

# Red wine modeling

### Red Fitting pipeline

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from xgboost import XGBRFClassifier
from helpers.eval import BaselineModel

# Seed for the estimators below that use randomness, so a fresh Run All repeats the scores.
SEED = 42

# Candidate classifiers keyed by display name; each value is fitted and scored in the
# cells that follow. KNN, decision trees and random forests are added in the next cell
# as hyper-parameter sweeps. XGBClassifier, XGBRFClassifier and
# QuadraticDiscriminantAnalysis are imported above but currently left out.
models = {
    "Mode": BaselineModel(method='mode'),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear SVM": SVC(C=0.025, kernel="linear"),
    "RBF SVM": SVC(C=1, gamma=2),
    # `1**2 * RBF(length_scale=1)` is how scikit-learn prints this kernel; written as
    # code, the constant factor is simply 1.0.
    "Gaussian Process": GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0)),
    "Neural Net": MLPClassifier(alpha=1, max_iter=1000, random_state=SEED),
    "AdaBoost": AdaBoostClassifier(random_state=SEED),
    "Naive Bayes": GaussianNB(),
}

In [20]:
# Hyper-parameter sweeps added to the shared `models` dict: k = 1..15 neighbours for KNN
# and max_depth = 1..15 for the tree-based models. Insertion order (all KNN, then all
# trees, then all forests) is kept because later cells iterate the dict in order.
TREE_SEED = 42  # trees and forests use randomness; fix it so results repeat on re-run

for k in range(1, 16):
    models[f"KNN_{k}"] = KNeighborsClassifier(n_neighbors=k)
for k in range(1, 16):
    models[f'Decision Tree_{k}'] = DecisionTreeClassifier(max_depth=k, random_state=TREE_SEED)
for k in range(1, 16):
    models[f"Random Forest_{k}"] = RandomForestClassifier(max_depth=k, random_state=TREE_SEED)

In [21]:
import time

# Fit every candidate on the red training data and report how long each fit took.
# The loop variable is `clf`, not `model`: `model` is the project module imported at the
# top of the notebook (used as model.FlavorProfile), and rebinding it here would break
# the clustering cell on any later re-run.
for name, clf in models.items():
    # perf_counter is the high-resolution clock intended for measuring durations
    start_time = time.perf_counter()
    clf.fit(d['X'], d['y'])
    stop_time = time.perf_counter()
    print(f'Fit {name} in {stop_time-start_time}')

Fit Mode in 0.001001119613647461
Fit Logistic Regression in 0.05899691581726074
Fit Linear SVM in 0.031000375747680664
Fit RBF SVM in 0.05899763107299805
Fit Gaussian Process in 46.097307205200195
Fit Neural Net in 1.0374398231506348
Fit AdaBoost in 0.11599874496459961
Fit Naive Bayes in 0.003000020980834961
Fit KNN_1 in 0.0020034313201904297
Fit KNN_2 in 0.001996755599975586
Fit KNN_3 in 0.0020003318786621094
Fit KNN_4 in 0.0009999275207519531
Fit KNN_5 in 0.002000570297241211
Fit KNN_6 in 0.0009999275207519531
Fit KNN_7 in 0.001999378204345703
Fit KNN_8 in 0.001001596450805664
Fit KNN_9 in 0.001998424530029297
Fit KNN_10 in 0.0009999275207519531
Fit KNN_11 in 0.0020017623901367188
Fit KNN_12 in 0.0019981861114501953
Fit KNN_13 in 0.0010001659393310547
Fit KNN_14 in 0.002000570297241211
Fit KNN_15 in 0.0009996891021728516
Fit Decision Tree_1 in 0.0020003318786621094
Fit Decision Tree_2 in 0.0029993057250976562
Fit Decision Tree_3 in 0.002000093460083008
Fit Decision Tree_4 in 0.004000

### Red Scoring

In [22]:
import time
from sklearn.metrics import accuracy_score

# Accuracy of every fitted candidate on the red training and validation frames.
# The loop variable is `clf`, not `model`, so the project module `model` imported at the
# top of the notebook is not shadowed by an estimator.
scores = {}
for name, clf in models.items():
    # The timed span covers both predictions and the two accuracy computations.
    start_time = time.perf_counter()
    tp = clf.predict(d['X'])
    vp = clf.predict(v['X'])
    scores[name] = {
        'train': accuracy_score(d['y'], tp),
        'validate': accuracy_score(v['y'], vp)
    }
    stop_time = time.perf_counter()
    print(f'{name} predicted in {stop_time-start_time}')

Mode predicted in 0.001001596450805664
Logistic Regression predicted in 0.004998683929443359
Linear SVM predicted in 0.03099799156188965
RBF SVM predicted in 0.10199761390686035
Gaussian Process predicted in 0.7370045185089111
Neural Net predicted in 0.005999326705932617
AdaBoost predicted in 0.03599810600280762
Naive Bayes predicted in 0.005000114440917969
KNN_1 predicted in 0.059000253677368164
KNN_2 predicted in 0.05699920654296875
KNN_3 predicted in 0.05900216102600098
KNN_4 predicted in 0.05899834632873535
KNN_5 predicted in 0.05800199508666992
KNN_6 predicted in 0.05705380439758301
KNN_7 predicted in 0.06000256538391113
KNN_8 predicted in 0.05899977684020996
KNN_9 predicted in 0.0599977970123291
KNN_10 predicted in 0.059000492095947266
KNN_11 predicted in 0.06099963188171387
KNN_12 predicted in 0.06000471115112305
KNN_13 predicted in 0.0599973201751709
KNN_14 predicted in 0.059998512268066406
KNN_15 predicted in 0.06199932098388672
Decision Tree_1 predicted in 0.00299715995788574

In [23]:
# One row per model name, with its 'train' and 'validate' accuracy as columns.
scores_df = pd.DataFrame.from_dict(scores, orient='index')
scores_df.shape

(53, 2)

In [24]:
# Rank models by validation accuracy. Compare against the train column to spot
# overfitting (a train score near 1.0 with a much lower validate score).
scores_df.sort_values('validate', ascending=False)

Unnamed: 0,train,validate
Random Forest_5,0.680982,0.610294
Random Forest_12,0.997546,0.602941
Random Forest_11,0.992638,0.602941
Random Forest_10,0.97546,0.602941
Random Forest_14,1.0,0.595588
Random Forest_9,0.944785,0.591912
Random Forest_6,0.749693,0.591912
Random Forest_15,1.0,0.588235
Random Forest_4,0.638037,0.584559
Random Forest_13,1.0,0.580882
