## Execute this ONLY if you want Full Width on Jupyter Notebook's Cells

In [93]:
# Optional cosmetic tweak: stretch the notebook container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# the old `IPython.core.display` import path is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Importing Essential Libraries and Modules

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

### Defining Cache Directories

In [2]:
# Cache folders, both relative to the current working directory.
csv_cache_dir, np_cache_dir = (
    os.path.join('.', folder) for folder in ('csv_cache', 'numpy_cache')
)

### Reading - and Presenting - the Data
* Vinho Verde Red Wine; Normalized; Outlier-Free

In [3]:
# Load the tab-separated red-wine table from the CSV cache and drop the
# 'Unnamed: 0' column (presumably an index saved with the file), then preview it.
rn_wine = (
    pd.read_csv(os.path.join(csv_cache_dir, 'red_clean.csv'), sep='\t')
    .drop(columns=['Unnamed: 0'])
)
rn_wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
rn_wine.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
count,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0
mean,8.166583,0.522735,0.248124,2.185762,0.078477,15.005863,42.154941,0.996571,3.324012,0.632194,10.365271,5.640704
std,1.461136,0.164879,0.180208,0.44216,0.0143,8.819079,26.084786,0.001604,0.132131,0.116196,0.975622,0.766471
min,5.1,0.12,0.0,1.2,0.041,1.0,6.0,0.99235,2.94,0.33,8.7,3.0
25%,7.1,0.39,0.08,1.9,0.069,8.0,22.0,0.995503,3.23,0.55,9.5,5.0
50%,7.8,0.52,0.24,2.1,0.078,13.0,36.0,0.9966,3.325,0.61,10.1,6.0
75%,9.0,0.63,0.39,2.5,0.087,20.0,55.75,0.997595,3.4075,0.7,11.0,6.0
max,12.3,1.005,0.73,3.6,0.119,42.0,122.0,1.001,3.68,0.98,13.4,8.0


In [5]:
def quality_labels(y):
    """Collapse a raw quality score into one of three class labels.

    Parameters
    ----------
    y : number
        Raw quality score.

    Returns
    -------
    int
        -1 when ``y <= 4``, 0 when ``y <= 6`` (and above 4), 1 otherwise.
    """
    return -1 if y <= 4 else (0 if y <= 6 else 1)

In [6]:
# Replace the raw quality scores with the three class labels from quality_labels.
# NOTE: the column is overwritten in place, so run this cell once per data load;
# a second run would only see values <= 4 and label every row -1.
rn_wine['quality'] = np.array(
    [quality_labels(score) for score in rn_wine['quality']]
)

### Preprocessing
* Converting numerical features to categorical ones by binning each `float64` column into 7 equal-width buckets (the integer `quality` label is left untouched)

In [7]:
# Bin every float64 column into 7 equal-width intervals, turning it into a
# categorical feature. Non-float columns (e.g. the integer quality label) are skipped.
# `DataFrame.items()` is used because `iteritems()` no longer exists in current pandas.
for feature_name, feature_values in rn_wine.items():
    if feature_values.dtype == 'float64':
        rn_wine[feature_name] = pd.cut(feature_values, 7)

In [8]:
# Preview the frame after binning: the float features now hold interval categories.
rn_wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,"(7.157, 8.186]","(0.626, 0.752]","(-0.00073, 0.104]","(1.886, 2.229]","(0.0744, 0.0856]","(6.857, 12.714]","(22.571, 39.143]","(0.997, 0.999]","(3.469, 3.574]","(0.516, 0.609]","(9.371, 10.043]",0
1,"(7.157, 8.186]","(0.879, 1.005]","(-0.00073, 0.104]","(2.571, 2.914]","(0.0967, 0.108]","(24.429, 30.286]","(55.714, 72.286]","(0.996, 0.997]","(3.151, 3.257]","(0.609, 0.701]","(9.371, 10.043]",0
2,"(7.157, 8.186]","(0.752, 0.879]","(-0.00073, 0.104]","(2.229, 2.571]","(0.0856, 0.0967]","(12.714, 18.571]","(39.143, 55.714]","(0.996, 0.997]","(3.257, 3.363]","(0.609, 0.701]","(9.371, 10.043]",0
3,"(10.243, 11.271]","(0.246, 0.373]","(0.521, 0.626]","(1.886, 2.229]","(0.0744, 0.0856]","(12.714, 18.571]","(55.714, 72.286]","(0.997, 0.999]","(3.151, 3.257]","(0.516, 0.609]","(9.371, 10.043]",0
4,"(7.157, 8.186]","(0.626, 0.752]","(-0.00073, 0.104]","(1.886, 2.229]","(0.0744, 0.0856]","(6.857, 12.714]","(22.571, 39.143]","(0.997, 0.999]","(3.469, 3.574]","(0.516, 0.609]","(9.371, 10.043]",0


## Executing the Apriori Algorithm
* Mining association rules between the binned features and the wine-quality classes

In [9]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [10]:
# One dummy-encoded frame per column of rn_wine; every dummy column is named
# '<feature>_<value>' so the originating feature stays visible in the rules.
# Iterating over `.columns` avoids `iteritems()`, which current pandas no longer provides.
rn_wine_onehot = [
    pd.get_dummies(rn_wine[feature_name], prefix=feature_name, prefix_sep='_')
    for feature_name in rn_wine.columns
]

In [11]:
# Make every dummy-column label a plain string.
# The labels are taken from `.columns` itself: reading `.columns.categories`
# instead would yield the bare category values and drop the feature prefix.
for onehot_frame in rn_wine_onehot:
    onehot_frame.columns = [str(label) for label in onehot_frame.columns]

In [12]:
# Join the per-feature dummy frames side by side into a single wide frame.
# The name is rebound from a list of frames to one DataFrame here.
rn_wine_onehot = pd.concat(rn_wine_onehot, axis='columns')

In [13]:
# Preview only the first rows instead of rendering the whole one-hot frame.
rn_wine_onehot.head(10)

Unnamed: 0,"fixed_acidity_(5.093, 6.129]","fixed_acidity_(6.129, 7.157]","fixed_acidity_(7.157, 8.186]","fixed_acidity_(8.186, 9.214]","fixed_acidity_(9.214, 10.243]","fixed_acidity_(10.243, 11.271]","fixed_acidity_(11.271, 12.3]","volatile_acidity_(0.119, 0.246]","volatile_acidity_(0.246, 0.373]","volatile_acidity_(0.373, 0.499]",...,"alcohol_(8.695, 9.371]","alcohol_(9.371, 10.043]","alcohol_(10.043, 10.714]","alcohol_(10.714, 11.386]","alcohol_(11.386, 12.057]","alcohol_(12.057, 12.729]","alcohol_(12.729, 13.4]",quality_-1,quality_0,quality_1
0,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
5,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
6,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
7,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
8,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
9,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [69]:
# Frequent itemsets over the one-hot frame, with a minimum support of 0.01;
# itemsets are reported by column name rather than by column position.
frequent_itemsets = apriori(
    rn_wine_onehot,
    use_colnames=True,
    min_support=0.01,
)

In [70]:
# Derive association rules from the frequent itemsets, using lift as the
# selection metric with a threshold of 1.
rules = association_rules(
    frequent_itemsets,
    min_threshold=1,
    metric="lift",
)

In [87]:
# Show full cell contents so long itemsets are not truncated.
# `None` means "no limit"; the former value -1 is deprecated and rejected by current pandas.
pd.set_option('display.max_colwidth', None)
# Five strongest rules, ranked by lift and then by confidence.
rules.sort_values(by=['lift', 'confidence'], ascending=False).head(5)

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
92742,"(fixed_acidity_(7.157, 8.186], sulphates_(0.516, 0.609], ph_(3.469, 3.574])","(citric_acid_(-0.00073, 0.104], quality_0, alcohol_(9.371, 10.043], density_(0.997, 0.999], chlorides_(0.0744, 0.0856])",0.010888,0.022613,0.01005,0.923077,40.820513,0.009804,12.70603
92643,"(citric_acid_(-0.00073, 0.104], quality_0, alcohol_(9.371, 10.043], density_(0.997, 0.999], chlorides_(0.0744, 0.0856])","(fixed_acidity_(7.157, 8.186], sulphates_(0.516, 0.609], ph_(3.469, 3.574])",0.022613,0.010888,0.01005,0.444444,40.820513,0.009804,1.780402
41646,"(fixed_acidity_(7.157, 8.186], sulphates_(0.516, 0.609], ph_(3.469, 3.574])","(citric_acid_(-0.00073, 0.104], alcohol_(9.371, 10.043], density_(0.997, 0.999], chlorides_(0.0744, 0.0856])",0.010888,0.024288,0.01005,0.923077,38.005305,0.009786,12.684255
92688,"(fixed_acidity_(7.157, 8.186], quality_0, sulphates_(0.516, 0.609], ph_(3.469, 3.574])","(citric_acid_(-0.00073, 0.104], alcohol_(9.371, 10.043], density_(0.997, 0.999], chlorides_(0.0744, 0.0856])",0.010888,0.024288,0.01005,0.923077,38.005305,0.009786,12.684255
41623,"(citric_acid_(-0.00073, 0.104], alcohol_(9.371, 10.043], density_(0.997, 0.999], chlorides_(0.0744, 0.0856])","(fixed_acidity_(7.157, 8.186], sulphates_(0.516, 0.609], ph_(3.469, 3.574])",0.024288,0.010888,0.01005,0.413793,38.005305,0.009786,1.687309


In [88]:
# Rules whose consequent is exactly {'quality_-1'}, strongest (lift, then confidence) first.
(
    rules
    .loc[lambda frame: frame['consequents'] == frozenset({'quality_-1'})]
    .sort_values(by=['lift', 'confidence'], ascending=False)
)

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
10228,"(citric_acid_(-0.00073, 0.104], total_sulfur_dioxide_(5.884, 22.571], free_sulfur_dioxide_(0.959, 6.857])",(quality_-1),0.042714,0.030151,0.01005,0.235294,7.803922,0.008762,1.268264
64382,"(citric_acid_(-0.00073, 0.104], free_sulfur_dioxide_(0.959, 6.857])",(quality_-1),0.048576,0.030151,0.01005,0.206897,6.862069,0.008586,1.222853
149248,"(citric_acid_(-0.00073, 0.104], total_sulfur_dioxide_(5.884, 22.571])",(quality_-1),0.086265,0.030151,0.010888,0.126214,4.186084,0.008287,1.109939
122110,"(residual_sugar_(1.886, 2.229], sulphates_(0.516, 0.609])",(quality_-1),0.120603,0.030151,0.01005,0.083333,2.763889,0.006414,1.058017
71990,"(total_sulfur_dioxide_(5.884, 22.571], free_sulfur_dioxide_(0.959, 6.857])",(quality_-1),0.159129,0.030151,0.012563,0.078947,2.618421,0.007765,1.052979
32132,"(free_sulfur_dioxide_(0.959, 6.857])",(quality_-1),0.195142,0.030151,0.0134,0.06867,2.277539,0.007517,1.041359
221330,"(citric_acid_(-0.00073, 0.104])",(quality_-1),0.298157,0.030151,0.015913,0.053371,1.770131,0.006923,1.024529
172656,"(total_sulfur_dioxide_(5.884, 22.571])",(quality_-1),0.254606,0.030151,0.0134,0.052632,1.745614,0.005724,1.02373
144526,"(residual_sugar_(1.886, 2.229])",(quality_-1),0.376884,0.030151,0.017588,0.046667,1.547778,0.006225,1.017324
117366,"(ph_(3.257, 3.363])",(quality_-1),0.340871,0.030151,0.014238,0.041769,1.38534,0.00396,1.012125


In [89]:
# Rules whose consequent is exactly {'quality_0'}, strongest (lift, then confidence) first.
(
    rules
    .loc[lambda frame: frame['consequents'] == frozenset({'quality_0'})]
    .sort_values(by=['lift', 'confidence'], ascending=False)
)

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1,"(sulphates_(0.516, 0.609], total_sulfur_dioxide_(22.571, 39.143], ph_(3.257, 3.363], free_sulfur_dioxide_(12.714, 18.571], chlorides_(0.0744, 0.0856])",(quality_0),0.010050,0.846734,0.010050,1.000000,1.181009,0.001540,inf
183,"(volatile_acidity_(0.373, 0.499], sulphates_(0.516, 0.609], total_sulfur_dioxide_(55.714, 72.286], chlorides_(0.0744, 0.0856])",(quality_0),0.010050,0.846734,0.010050,1.000000,1.181009,0.001540,inf
210,"(citric_acid_(-0.00073, 0.104], volatile_acidity_(0.499, 0.626], ph_(3.151, 3.257])",(quality_0),0.010888,0.846734,0.010888,1.000000,1.181009,0.001669,inf
379,"(volatile_acidity_(0.499, 0.626], free_sulfur_dioxide_(24.429, 30.286], chlorides_(0.0744, 0.0856])",(quality_0),0.011725,0.846734,0.011725,1.000000,1.181009,0.001797,inf
409,"(density_(0.997, 0.999], alcohol_(9.371, 10.043], total_sulfur_dioxide_(72.286, 88.857], chlorides_(0.0744, 0.0856])",(quality_0),0.011725,0.846734,0.011725,1.000000,1.181009,0.001797,inf
492,"(volatile_acidity_(0.499, 0.626], total_sulfur_dioxide_(88.857, 105.429], ph_(3.257, 3.363])",(quality_0),0.011725,0.846734,0.011725,1.000000,1.181009,0.001797,inf
599,"(fixed_acidity_(7.157, 8.186], volatile_acidity_(0.499, 0.626], citric_acid_(0.209, 0.313], chlorides_(0.0744, 0.0856], residual_sugar_(1.886, 2.229])",(quality_0),0.012563,0.846734,0.012563,1.000000,1.181009,0.001925,inf
887,"(residual_sugar_(1.886, 2.229], free_sulfur_dioxide_(6.857, 12.714], volatile_acidity_(0.626, 0.752], density_(0.996, 0.997])",(quality_0),0.010888,0.846734,0.010888,1.000000,1.181009,0.001669,inf
956,"(residual_sugar_(1.886, 2.229], ph_(3.257, 3.363], chlorides_(0.0744, 0.0856], alcohol_(10.043, 10.714])",(quality_0),0.010888,0.846734,0.010888,1.000000,1.181009,0.001669,inf
984,"(density_(0.994, 0.995], fixed_acidity_(5.093, 6.129])",(quality_0),0.016750,0.846734,0.016750,1.000000,1.181009,0.002567,inf


In [90]:
# Rules whose consequent is exactly {'quality_1'}, strongest (lift, then confidence) first.
(
    rules
    .loc[lambda frame: frame['consequents'] == frozenset({'quality_1'})]
    .sort_values(by=['lift', 'confidence'], ascending=False)
)

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
204452,"(sulphates_(0.701, 0.794], alcohol_(12.057, 12.729])",(quality_1),0.012563,0.123116,0.010050,0.800000,6.497959,0.008504,4.384422
70155,"(total_sulfur_dioxide_(5.884, 22.571], volatile_acidity_(0.246, 0.373], sulphates_(0.701, 0.794])",(quality_1),0.015075,0.123116,0.010888,0.722222,5.866213,0.009032,3.156784
200166,"(volatile_acidity_(0.246, 0.373], alcohol_(12.057, 12.729])",(quality_1),0.015075,0.123116,0.010888,0.722222,5.866213,0.009032,3.156784
197812,"(total_sulfur_dioxide_(5.884, 22.571], volatile_acidity_(0.246, 0.373], alcohol_(10.714, 11.386])",(quality_1),0.014238,0.123116,0.010050,0.705882,5.733493,0.008297,2.981407
122210,"(density_(0.995, 0.996], sulphates_(0.794, 0.887], volatile_acidity_(0.246, 0.373])",(quality_1),0.016750,0.123116,0.011725,0.700000,5.685714,0.009663,2.922948
153933,"(total_sulfur_dioxide_(5.884, 22.571], volatile_acidity_(0.246, 0.373], citric_acid_(0.313, 0.417], free_sulfur_dioxide_(0.959, 6.857])",(quality_1),0.015913,0.123116,0.010888,0.684211,5.557465,0.008929,2.776801
188153,"(volatile_acidity_(0.246, 0.373], citric_acid_(0.313, 0.417], free_sulfur_dioxide_(0.959, 6.857])",(quality_1),0.016750,0.123116,0.010888,0.650000,5.279592,0.008826,2.505384
4528,"(residual_sugar_(1.886, 2.229], sulphates_(0.794, 0.887], volatile_acidity_(0.246, 0.373])",(quality_1),0.015913,0.123116,0.010050,0.631579,5.129968,0.008091,2.380115
38188,"(alcohol_(11.386, 12.057], citric_acid_(0.313, 0.417])",(quality_1),0.023451,0.123116,0.014238,0.607143,4.931487,0.011351,2.232069
64408,"(sulphates_(0.701, 0.794], density_(0.994, 0.995])",(quality_1),0.016750,0.123116,0.010050,0.600000,4.873469,0.007988,2.192211


In [22]:
# Number of rows per quality label, to show how the classes are balanced.
rn_wine['quality'].value_counts()

 0    1011
 1     147
-1      36
Name: quality, dtype: int64