In [47]:
import numpy as np
import matplotlib.pyplot as plt
from math import *
import pandas as pd
from csv import reader
from csv import writer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
# method for loading each row of data in .csv file into a list
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# load data for red wine
full_data_r = load_csv('winequality-red.csv')
full_data_r.pop(0)

# load data for white wine

full_data_w = load_csv('winequality-white.csv')
full_data_w.pop(0)

#print(full_data_r)
#print(full_data_w)

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [26]:
df_r = pd.read_csv('winequality-red.csv')
actual_r = df_r.quality #you can also use df['column_name']
actual_r = np.asarray(actual_r)
actual_r_test = actual_r[round(len(actual_r)*0.8):]
print(len(actual_r_test))
print(actual_r)

df_w = pd.read_csv('winequality-white.csv')
actual_w = df_w.quality
actual_w = np.asarray(actual_w)
actual_w_test = actual_w[round(len(actual_w)*0.8):]
print(len(actual_w_test))
print(actual_w)

320
[5 5 5 ... 6 5 6]
980
[6 6 6 ... 6 7 6]


In [27]:
# Convert all all string values to float values

# method for converting string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        r = row[column].strip()
        row[column] = float(r)

# red wine
for i in range(len(full_data_r[0])):
    str_column_to_float(full_data_r, i)
    
# white wine
for i in range(len(full_data_w[0])):
    str_column_to_float(full_data_w, i)

AttributeError: 'float' object has no attribute 'strip'

In [28]:
# Split data

# red wine
r_train = full_data_r[:round(len(full_data_r)*0.6)]
r_valid = full_data_r[round(len(full_data_r)*0.6):round(len(full_data_r)*0.8)]
r_test  = full_data_r[round(len(full_data_r)*0.8):]  

# white wine
w_train = full_data_w[:round(len(full_data_w)*0.6)]
w_valid = full_data_w[round(len(full_data_w)*0.6):round(len(full_data_w)*0.8)]
w_test  = full_data_w[round(len(full_data_w)*0.8):]  

In [29]:
# method to split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if (class_value not in separated):
            separated[class_value] = list()
        separated[class_value].append(vector)
    return separated


In [30]:
# 1. seperate the training data by class 
separated_r_train = separate_by_class(r_train)
separated_w_train = separate_by_class(w_train)

# print red keys
print('red wine keys: ', separated_r_train.keys())

# print white keys
print('white wine keys: ', separated_w_train.keys())

# red wine: print key, then all values associated with key
#for label in separated_r_train:
#    print(label)
#    for row in separated_r_train[label]:
#        print(row)
        
# white wine: print key, then all values associated with key
#for label in separated_w_train:
#    print(label)
#    for row in separated_w_train[label]:
#        print(row)

red wine keys:  dict_keys([5.0, 6.0, 7.0, 4.0, 8.0, 3.0])
white wine keys:  dict_keys([6.0, 5.0, 7.0, 8.0, 4.0, 3.0, 9.0])


In [31]:
# Calculate the mean of a list of numbers
mean = lambda x: sum(x)/float(len(x))
 
# Calculate the standard deviation of a list of numbers
def stdev(numbers):
	avg = mean(numbers)
	variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
	return sqrt(variance)
 
# Method for calculating the mean, stdev and count for each column in a dataset
# zip() function maps the similar index of multiple containes, so that they can be used as a single entity
# * operator unpacks dataset i.e. transforms list of lists --> separate lists for each row 
# we delete the last column, as this is the column we are trying to predict(do not need summary statistics for it)
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del(summaries[-1])
    return summaries

In [32]:
# 2. Summary of whole training data , printed : mean, standard deviation, count
# has 11 columns output as 11 rows repr. the input features

# red wine
print('Whole data summary Red Wine: ')
summaries_rTrain = summarize_dataset(r_train)
for col in summaries_rTrain:
    print(col)
    
print('\n')
    
# white wine
print('Whole data summary White Wine: ')
summaries_wTrain = summarize_dataset(w_train)
for col in summaries_wTrain:
    print(col)

Whole data summary Red Wine: 
(8.746819603753917, 1.8470163831475053, 959)
(0.5308915537017724, 0.17934877767009977, 959)
(0.295505735140772, 0.20172774009971367, 959)
(2.599635036496351, 1.252111941252337, 959)
(0.0908706986444212, 0.050816999097932886, 959)
(15.150156412930135, 9.885929641746575, 959)
(48.91240875912409, 33.35731527873884, 959)
(0.99740212721585, 0.001773835664600891, 959)
(3.298759124087591, 0.15820371950759535, 959)
(0.6709176225234631, 0.1849837562472551, 959)
(10.226068821689248, 1.025320500560836, 959)


Whole data summary White Wine: 
(7.0527560394692, 0.8598155239476267, 2939)
(0.27848247703300494, 0.1028092999708983, 2939)
(0.34727458319156046, 0.12787532939437987, 2939)
(6.3625042531473435, 5.18310957864615, 2939)
(0.04675263695134447, 0.022939076139044424, 2939)
(35.361007145287516, 16.766749359288823, 2939)
(144.6019054100034, 44.01587733109399, 2939)
(0.9945698843143869, 0.0029154281376069504, 2939)
(3.207553589656345, 0.1547946116207012, 2939)
(0.4915719

In [33]:
# 3. Summarize data according to class value

# Method for summarizing the columns in the dataset, organized by class values
# Utilises separate_by_class() and summarize_dataset() functions
# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = dict()
    for class_value, rows in separated.items():           # separated.item() returns key:value pairs from dictionary
        summaries[class_value] = summarize_dataset(rows)
    return summaries

In [34]:
# Summary of red wine training data according to class values
summary_rTrain = summarize_by_class(r_train)

for class_value in summary_rTrain:
    print(class_value)
    for row in summary_rTrain[class_value]:
        print(row)

5.0
(8.436888888888886, 1.6475450079825997, 450)
(0.574333333333333, 0.16680944144445528, 450)
(0.26382222222222185, 0.185862378007291, 450)
(2.56822222222222, 1.2360270949540404, 450)
(0.09362444444444448, 0.05415468094221833, 450)
(16.155555555555555, 10.323402081532791, 450)
(59.55111111111111, 38.28143712828427, 450)
(0.9975300444444442, 0.0014760189935396411, 450)
(3.2998444444444415, 0.15363950742894597, 450)
(0.6344000000000009, 0.18050713020114204, 450)
(9.803333333333338, 0.6747844661251698, 450)
6.0
(8.926436781609205, 1.897956417759708, 348)
(0.49590517241379395, 0.15974664623025606, 348)
(0.3038218390804595, 0.20069057257224265, 348)
(2.5722701149425298, 1.2853137481371693, 348)
(0.09036206896551728, 0.04487222908723698, 348)
(14.718390804597702, 9.052664087771323, 348)
(41.56896551724138, 25.021221242605847, 348)
(0.9974750287356317, 0.0018477724728883141, 348)
(3.298735632183912, 0.15931629568135208, 348)
(0.6910919540229886, 0.17954300824113026, 348)
(10.36781609195402, 

In [35]:
# Summary of white wine training data according to class values
summary_wTrain = summarize_by_class(w_train)

for class_value in summary_wTrain:
    print(class_value)
    for row in summary_wTrain[class_value]:
        print(row)

6.0
(7.062530612244898, 0.8585047207066043, 1225)
(0.25771428571428534, 0.08673451341302094, 1225)
(0.3526938775510212, 0.12515647136123706, 1225)
(6.231714285714291, 5.248373573155662, 1225)
(0.046645714285714034, 0.021176241465717602, 1225)
(36.11387755102041, 15.648726797841778, 1225)
(146.23142857142858, 42.7148370466465, 1225)
(0.9945495591836726, 0.0029693499324277162, 1225)
(3.215265306122445, 0.1547783012214937, 1225)
(0.4955020408163267, 0.11205127008733357, 1225)
(10.293306122448977, 1.0178412975653315, 1225)
5.0
(7.0679792746114085, 0.8633114137322545, 965)
(0.3026891191709849, 0.10442863083226436, 965)
(0.3479274611398963, 0.14811349985919314, 965)
(7.551813471502594, 5.418332214811795, 965)
(0.05115129533678733, 0.02730536385226524, 965)
(36.33782383419689, 18.239413726918773, 965)
(153.280310880829, 45.61756875088177, 965)
(0.9956229948186543, 0.0024766847748632676, 965)
(3.1760207253886015, 0.1451059120473438, 965)
(0.4763108808290157, 0.10116062335506026, 965)
(9.701347

In [36]:
# 4. Using the Gaussian Probability Density Function

In [37]:
# Method to calculate the Gaussian Probability Distribution Function
def calc_Gaussian_pdf(x, mean, std_dev):
    exponent = exp(-((x-mean)**2 / (2 * std_dev**2 )))
    pdf = (1 / (sqrt(2 * pi) * std_dev)) * exponent
    return pdf

In [38]:
# 5. Determining Class Probabilities
# will use statistics calculated from our training data to calculate probabilities for new data


In [39]:
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= calc_Gaussian_pdf(row[i], mean, stdev)
    return probabilities
 
# Predict the class for a given row
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label, probabilities
 
# Naive Bayes Algorithm
def naive_bayes(train, test):
    summarize = summarize_by_class(train)
    predictions = list()
    for row in test:
        output, prob = predict(summarize, row)
        predictions.append(output)
    return prob, predictions

In [40]:
# Naive Bayes on red wine

#some summary statistics for red wine on the training data
total_rows_red = len(r_train)
print('red wine summary statistics for r_train')
print('\n')
print('Number of rows in r_train: ', total_rows_red)

for class_value, class_summaries in summary_rTrain.items():
    print('\n')
    print('class_value: ', class_value)
    print('Number of different instances corresponding to class_value: ', summary_rTrain[class_value][0][2]) 
    
prob_red_train, predictions_red_test = naive_bayes(r_train, r_test)

print('\n')
print('Class Probabilites for Red Wine:', prob_red_train)
print('\n')
print(predictions_red_test)

red wine summary statistics for r_train


Number of rows in r_train:  959


class_value:  5.0
Number of different instances corresponding to class_value:  450


class_value:  6.0
Number of different instances corresponding to class_value:  348


class_value:  7.0
Number of different instances corresponding to class_value:  114


class_value:  4.0
Number of different instances corresponding to class_value:  32


class_value:  8.0
Number of different instances corresponding to class_value:  10


class_value:  3.0
Number of different instances corresponding to class_value:  5


Class Probabilites for Red Wine: {5.0: 0.0009741624777621151, 6.0: 0.008616264267641711, 7.0: 0.008304051339651133, 4.0: 4.644628686237231e-05, 8.0: 0.0009125084397441425, 3.0: 1.043519390626935e-06}


[7.0, 5.0, 5.0, 6.0, 5.0, 6.0, 6.0, 7.0, 7.0, 5.0, 5.0, 6.0, 6.0, 8.0, 5.0, 6.0, 5.0, 5.0, 7.0, 7.0, 4.0, 8.0, 6.0, 7.0, 6.0, 5.0, 5.0, 5.0, 6.0, 5.0, 5.0, 5.0, 7.0, 4.0, 5.0, 5.0, 5.0, 4.0, 7.0, 5.0, 4.0, 5.0, 4.0, 

In [41]:
# Naive Bayes on White Wine

#some summary statistics for white wine on the training data
total_rows_white = len(w_train)
print('red wine summary statistics for r_train')
print('\n')
print('Number of rows in r_train: ', total_rows_white)

for class_value, class_summaries in summary_wTrain.items():
    print('\n')
    print('class_value: ', class_value)
    print('Number of different instances corresponding to class_value: ', summary_wTrain[class_value][0][2]) 
    
prob_white_train, predictions_white_test = naive_bayes(w_train, w_test)

print('\n')
print('Class Probabilites for White Wine:', prob_white_train)
print('\n')
print(predictions_white_test)

red wine summary statistics for r_train


Number of rows in r_train:  2939


class_value:  6.0
Number of different instances corresponding to class_value:  1225


class_value:  5.0
Number of different instances corresponding to class_value:  965


class_value:  7.0
Number of different instances corresponding to class_value:  506


class_value:  8.0
Number of different instances corresponding to class_value:  107


class_value:  4.0
Number of different instances corresponding to class_value:  117


class_value:  3.0
Number of different instances corresponding to class_value:  14


class_value:  9.0
Number of different instances corresponding to class_value:  5


Class Probabilites for White Wine: {6.0: 0.0003584112022913747, 5.0: 2.080037746258919e-06, 7.0: 0.00240443459681223, 8.0: 0.0010825344099670768, 4.0: 2.280840102755608e-06, 3.0: 2.722165448262912e-07, 9.0: 0.0003160992980581143}


[7.0, 7.0, 7.0, 5.0, 7.0, 6.0, 6.0, 6.0, 6.0, 7.0, 5.0, 5.0, 7.0, 7.0, 7.0, 6.0, 7.0, 5.0, 7.0, 3.

In [48]:
# Confusion Matrices

# red wine
conf_mat_r = confusion_matrix(actual_r_test, predictions_red_test)
accuracy_r = accuracy_score(actual_r_test, predictions_red_test)
print('Confusion Matrix for Red Wine:')
print(conf_mat_r)
print('Accuracy of Red Wine Predictions: ', accuracy_r)

print('\n')

# white wine
conf_mat_w = confusion_matrix(actual_w_test, predictions_white_test)
accuracy_w = accuracy_score(actual_w_test, predictions_white_test)
print('Confusion Matrix for White Wine:')
print(conf_mat_w)
print('Accuracy of White Wine Predictions: ', accuracy_w)

Confusion Matrix for Red Wine:
[[ 0  2  3  0  0  0]
 [ 0  2  5  4  0  0]
 [ 0  3 97 38  5  0]
 [ 0  8 37 66 22  3]
 [ 0  0  3  8  9  2]
 [ 0  0  0  1  2  0]]
Accuracy of Red Wine Predictions:  0.54375


Confusion Matrix for White Wine:
[[  1   0   0   0   0   0   0]
 [  0   8   8   3   5   1   0]
 [  4   6 139  82  35   0   0]
 [  1   7 102 122 277   8   0]
 [  0   0  18  15 110   7   1]
 [  0   0   8   1  10   0   1]
 [  0   0   0   0   0   0   0]]
Accuracy of White Wine Predictions:  0.3877551020408163
