## Importing Essential Libraries and Modules

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

### Defining Cache Directories

In [2]:
# Both cache folders live directly under the notebook's working directory.
np_cache_dir, csv_cache_dir = (
    os.path.join('.', folder) for folder in ('numpy_cache', 'csv_cache')
)

### Reading - and Presenting - the Data
* Vinho Verde Red Wine; Normalized; Outlier-Free

In [3]:
# Read the cleaned red-wine table (tab-separated) and discard the
# 'Unnamed: 0' column (presumably an index that was saved with the file).
red_clean_csv = os.path.join(csv_cache_dir, 'red_clean.csv')
rn_wine = pd.read_csv(red_clean_csv, sep='\t').drop('Unnamed: 0', axis='columns')
rn_wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
rn_wine.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
count,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0
mean,8.166583,0.522735,0.248124,2.185762,0.078477,15.005863,42.154941,0.996571,3.324012,0.632194,10.365271,5.640704
std,1.461136,0.164879,0.180208,0.44216,0.0143,8.819079,26.084786,0.001604,0.132131,0.116196,0.975622,0.766471
min,5.1,0.12,0.0,1.2,0.041,1.0,6.0,0.99235,2.94,0.33,8.7,3.0
25%,7.1,0.39,0.08,1.9,0.069,8.0,22.0,0.995503,3.23,0.55,9.5,5.0
50%,7.8,0.52,0.24,2.1,0.078,13.0,36.0,0.9966,3.325,0.61,10.1,6.0
75%,9.0,0.63,0.39,2.5,0.087,20.0,55.75,0.997595,3.4075,0.7,11.0,6.0
max,12.3,1.005,0.73,3.6,0.119,42.0,122.0,1.001,3.68,0.98,13.4,8.0


In [5]:
# Load the cached feature matrix and target vector from the NumPy cache folder.
rn_wine_X, rn_wine_y = (
    np.load(os.path.join(np_cache_dir, npy_name))
    for npy_name in ('red_clean_X_normed.npy', 'red_clean_y.npy')
)

In [None]:
def quality_labels(y):
    """Collapse a numeric quality score into one of three class labels.

    Returns -1 when ``y <= 4``, 0 when ``4 < y <= 6``, and 1 for anything
    else (including values that fail both comparisons, such as NaN).
    """
    # Lowest band first; everything that survives is either middle or top.
    if y <= 4:
        return -1
    return 0 if y <= 6 else 1

In [None]:
# Replace each raw quality score with its three-class label.
# NOTE: this overwrites `rn_wine_y` in place, so re-running the cell would
# relabel the already-collapsed values; run it once per kernel session.
rn_wine_y = np.array([quality_labels(score) for score in rn_wine_y])

### Preprocessing
* Converting Numerical Data to Categorical Data by Binning each float64 feature into 7 equal-width buckets

In [6]:
# Discretise every float64 feature into 7 equal-width intervals; columns of
# any other dtype (e.g. the integer `quality` column) are left untouched.
# `DataFrame.items()` is used because `iteritems()` was removed in pandas 2.0.
for feature_name, feature_values in rn_wine.items():
    if feature_values.dtype == 'float64':
        rn_wine[feature_name] = pd.cut(feature_values, 7)

In [7]:
# Preview the frame after binning: float features now hold interval categories.
rn_wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,"(7.157, 8.186]","(0.626, 0.752]","(-0.00073, 0.104]","(1.886, 2.229]","(0.0744, 0.0856]","(6.857, 12.714]","(22.571, 39.143]","(0.997, 0.999]","(3.469, 3.574]","(0.516, 0.609]","(9.371, 10.043]",5
1,"(7.157, 8.186]","(0.879, 1.005]","(-0.00073, 0.104]","(2.571, 2.914]","(0.0967, 0.108]","(24.429, 30.286]","(55.714, 72.286]","(0.996, 0.997]","(3.151, 3.257]","(0.609, 0.701]","(9.371, 10.043]",5
2,"(7.157, 8.186]","(0.752, 0.879]","(-0.00073, 0.104]","(2.229, 2.571]","(0.0856, 0.0967]","(12.714, 18.571]","(39.143, 55.714]","(0.996, 0.997]","(3.257, 3.363]","(0.609, 0.701]","(9.371, 10.043]",5
3,"(10.243, 11.271]","(0.246, 0.373]","(0.521, 0.626]","(1.886, 2.229]","(0.0744, 0.0856]","(12.714, 18.571]","(55.714, 72.286]","(0.997, 0.999]","(3.151, 3.257]","(0.516, 0.609]","(9.371, 10.043]",6
4,"(7.157, 8.186]","(0.626, 0.752]","(-0.00073, 0.104]","(1.886, 2.229]","(0.0744, 0.0856]","(6.857, 12.714]","(22.571, 39.143]","(0.997, 0.999]","(3.469, 3.574]","(0.516, 0.609]","(9.371, 10.043]",5


## Executing the Apriori Algorithm
* Mining frequent itemsets and association rules that relate the binned feature ranges to wine quality

In [23]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [18]:
# Build one indicator (one-hot) frame per column of `rn_wine`, naming each
# indicator column "<feature>_<value>".
# `DataFrame.items()` is used because `iteritems()` was removed in pandas 2.0.
rn_wine_onehot = [
    pd.get_dummies(feature_values, prefix=feature_name, prefix_sep='_')
    for feature_name, feature_values in rn_wine.items()
]

In [19]:
# Make sure every indicator frame carries plain string column labels.
# If the column index exposes `.categories`, those are used as the labels;
# otherwise the column labels themselves are converted.
for onehot_frame in rn_wine_onehot:
    label_source = getattr(onehot_frame.columns, 'categories', onehot_frame.columns)
    onehot_frame.columns = [str(label) for label in label_source]

In [13]:
# Peek at the indicator frame of the first feature (only valid while
# `rn_wine_onehot` is still a list); show a bounded preview instead of
# dumping the whole frame.
rn_wine_onehot[0].head(10)

Unnamed: 0,"fixed_acidity_(5.093, 6.129]","fixed_acidity_(6.129, 7.157]","fixed_acidity_(7.157, 8.186]","fixed_acidity_(8.186, 9.214]","fixed_acidity_(9.214, 10.243]","fixed_acidity_(10.243, 11.271]","fixed_acidity_(11.271, 12.3]"
0,0,0,1,0,0,0,0
1,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0
5,0,0,1,0,0,0,0
6,0,0,1,0,0,0,0
7,0,0,1,0,0,0,0
8,0,0,1,0,0,0,0
9,0,1,0,0,0,0,0


In [20]:
# Sanity check: before concatenation `rn_wine_onehot` is a plain Python list
# of per-feature indicator frames.
type(rn_wine_onehot)

list

In [21]:
# Stitch the per-feature indicator frames side by side into one wide table.
# NOTE: this rebinds `rn_wine_onehot` from a list to a DataFrame, so the cell
# is meant to be run once per kernel session.
rn_wine_onehot = pd.concat(rn_wine_onehot, axis='columns')

In [22]:
# Bounded preview of the combined one-hot table rather than the full frame.
rn_wine_onehot.head(10)

Unnamed: 0,"fixed_acidity_(5.093, 6.129]","fixed_acidity_(6.129, 7.157]","fixed_acidity_(7.157, 8.186]","fixed_acidity_(8.186, 9.214]","fixed_acidity_(9.214, 10.243]","fixed_acidity_(10.243, 11.271]","fixed_acidity_(11.271, 12.3]","volatile_acidity_(0.119, 0.246]","volatile_acidity_(0.246, 0.373]","volatile_acidity_(0.373, 0.499]",...,"alcohol_(10.714, 11.386]","alcohol_(11.386, 12.057]","alcohol_(12.057, 12.729]","alcohol_(12.729, 13.4]",quality_3,quality_4,quality_5,quality_6,quality_7,quality_8
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [107]:
# Mine frequent itemsets from the one-hot table. min_support=0.01 keeps
# itemsets present in at least 1% of the rows; use_colnames=True reports
# itemsets by column label rather than by column position.
frequent_itemsets = apriori(rn_wine_onehot, min_support=0.01, use_colnames=True)

In [108]:
# Derive association rules from the frequent itemsets. Lift is never negative,
# so metric="lift" with min_threshold=0 effectively keeps every candidate rule.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0)

In [121]:
# Rank rules by support, then confidence (both descending), and show only the
# top 10 instead of dumping the entire rule table.
rules.sort_values(by=['support', 'confidence'], ascending=False).head(10)

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
142685,"(alcohol_(9.371, 10.043])",(quality_5),0.370184,0.429648,0.232831,0.628959,1.463894,0.073782,1.537168
142684,(quality_5),"(alcohol_(9.371, 10.043])",0.429648,0.370184,0.232831,0.541910,1.463894,0.073782,1.374874
53417,"(sulphates_(0.516, 0.609])",(quality_5),0.346734,0.429648,0.177554,0.512077,1.191852,0.028581,1.168939
53416,(quality_5),"(sulphates_(0.516, 0.609])",0.429648,0.346734,0.177554,0.413255,1.191852,0.028581,1.113374
118118,"(residual_sugar_(1.886, 2.229])",(quality_6),0.376884,0.417085,0.175042,0.464444,1.113548,0.017849,1.088430
118119,(quality_6),"(residual_sugar_(1.886, 2.229])",0.417085,0.376884,0.175042,0.419679,1.113548,0.017849,1.073742
38788,"(density_(0.996, 0.997])",(quality_5),0.317420,0.429648,0.163317,0.514512,1.197519,0.026937,1.174801
96452,"(chlorides_(0.0744, 0.0856])","(alcohol_(9.371, 10.043])",0.346734,0.370184,0.163317,0.471014,1.272379,0.034961,1.190611
96453,"(alcohol_(9.371, 10.043])","(chlorides_(0.0744, 0.0856])",0.370184,0.346734,0.163317,0.441176,1.272379,0.034961,1.169003
38789,(quality_5),"(density_(0.996, 0.997])",0.429648,0.317420,0.163317,0.380117,1.197519,0.026937,1.101143


In [124]:
# Rules whose consequent is exactly {quality_7}, ranked by support and then
# confidence (both descending); only the top 10 are displayed.
(
    rules.loc[rules['consequents'] == frozenset({'quality_7'})]
    .sort_values(by=['support', 'confidence'], ascending=False)
    .head(10)
)

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
72960,"(volatile_acidity_(0.246, 0.373])",(quality_7),0.182580,0.113065,0.054439,0.298165,2.637105,0.033795,1.263737
67313,"(total_sulfur_dioxide_(5.884, 22.571])",(quality_7),0.254606,0.113065,0.045226,0.177632,1.571053,0.016439,1.078513
21742,"(ph_(3.257, 3.363])",(quality_7),0.340871,0.113065,0.043551,0.127764,1.130003,0.005010,1.016852
35237,"(citric_acid_(0.313, 0.417])",(quality_7),0.145729,0.113065,0.040201,0.275862,2.439847,0.023724,1.224815
22544,"(sulphates_(0.701, 0.794])",(quality_7),0.134841,0.113065,0.038526,0.285714,2.526984,0.023280,1.241709
146619,"(total_sulfur_dioxide_(22.571, 39.143])",(quality_7),0.305695,0.113065,0.037688,0.123288,1.090411,0.003125,1.011660
125320,"(density_(0.995, 0.996])",(quality_7),0.240369,0.113065,0.036851,0.153310,1.355943,0.009674,1.047532
120776,"(chlorides_(0.0633, 0.0744])",(quality_7),0.237018,0.113065,0.033501,0.141343,1.250098,0.006702,1.032932
28420,"(free_sulfur_dioxide_(6.857, 12.714])",(quality_7),0.269682,0.113065,0.033501,0.124224,1.098689,0.003009,1.012741
56307,"(residual_sugar_(1.886, 2.229])",(quality_7),0.376884,0.113065,0.033501,0.088889,0.786173,-0.009112,0.973465


In [129]:
# Number of wines per quality score, most frequent first (class balance check).
rn_wine['quality'].value_counts()

5    513
6    498
7    135
4     33
8     12
3      3
Name: quality, dtype: int64