# Setup
  # P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. 
  # Modeling wine preferences by data mining from physicochemical properties.
  # In Decision Support Systems, Elsevier, 47(4):547-553. ISSN: 0167-9236.
***

In [1]:
# Numerical arrays.
import numpy as np

# Data frames.
import pandas as pd

# Plotting.
import matplotlib.pyplot as plt

# Logistic regression.
import sklearn.linear_model as lm

# K nearest neighbours.
import sklearn.neighbors as nei

# Helper functions.
import sklearn.model_selection as mod

# Fancier, statistical plots.
import seaborn as sns

In [2]:
# Default size (width, height) applied to every figure in the notebook.
plt.rc('figure', figsize=(15, 10))

# Use the ggplot look for all plots.
plt.style.use('ggplot')

In [3]:
# Read the red wine quality CSV straight from its GitHub URL.
wine_url = "https://raw.githubusercontent.com/John-Dubber/EmergingTechnologiesAssessment/main/winequality/winequality-red.csv"
df = pd.read_csv(wine_url)

In [4]:
# Check out what the data looks like: one row per wine, with the
# physicochemical measurements followed by the quality score.
df

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


# Visualisation
***

In [None]:
# Pairwise scatter plots of the columns, with KDEs, coloured by quality.
sns.pairplot(data=df, hue='quality');

# Two Dimensions
***

In [None]:
# Start a fresh figure with a single set of axes.
fig, ax = plt.subplots()

# Draw each wine as a dot: chlorides across, alcohol up.
ax.plot(df['chlorides'], df['alcohol'], '.')

# Name both axes in one call.
ax.set(xlabel='Chlorides', ylabel='Alcohol');

In [None]:
# A single seaborn call draws pH against alcohol, coloured by quality,
# with the regression fit switched off.
sns.lmplot(
    data=df,
    x="pH",
    y="alcohol",
    hue='quality',
    fit_reg=False,
    height=10,
    aspect=1.5,
);

# Using pyplot
***

In [None]:
# One sub-frame per quality score in the data (3 through 8).
quality3 = df.loc[df['quality'].eq(3)]
quality4 = df.loc[df['quality'].eq(4)]
quality5 = df.loc[df['quality'].eq(5)]
quality6 = df.loc[df['quality'].eq(6)]
quality7 = df.loc[df['quality'].eq(7)]
quality8 = df.loc[df['quality'].eq(8)]

# Fresh figure and axes.
fig, ax = plt.subplots()

# One scatter call per score, so each score gets its own legend entry.
subsets = (quality3, quality4, quality5, quality6, quality7, quality8)
for score, wines in zip(range(3, 9), subsets):
    ax.scatter(wines['alcohol'], wines['pH'], label=score)

# Label the axes and show the legend.
ax.set_xlabel('Alcohol')
ax.set_ylabel('pH')
ax.legend();


In [None]:
# How the segregation works: the comparison gives a boolean Series that is
# True on the rows whose quality is 3.
df['quality'].eq(3)

In [None]:
# Indexing with a boolean mask keeps only the rows where it is True.
df.loc[df['quality'].eq(3)].head()

# Using groupby()
***

In [None]:
# Fresh figure and axes.
fig, ax = plt.subplots()

# Iterating over groupby() gives (quality, sub-frame) pairs; plot each one.
for quality, data in df.groupby('quality'):
    ax.scatter(data['alcohol'], data['pH'], label=quality)

# Label the axes and show the legend.
ax.set(xlabel='alcohol', ylabel='pH')
ax.legend();

In [None]:
# groupby() is normally given a categorical column; here that is quality.
x = df.groupby(by='quality')
x

In [None]:
# Mean of every column within each quality group.
x.mean()

In [None]:
# Looping through groupby(): each iteration yields a pair, unpacked here as
# i (the group key, i.e. the quality value) and j (the rows in that group).
for i, j in x:
    print()
    print(f"i is: '{i}'")
    # Only the first three rows of the group are printed.
    print(f"j looks like:\n{j[:3]}")
    print()

# Test and Train Split
***

In [None]:
# Split the data frame in two. Fixing random_state makes the shuffle, and so
# every score computed from this split, the same on each run.
train, test = mod.train_test_split(df, random_state=0)

In [None]:
# Show the first few rows of the training data.
train.head()

In [None]:
# The index (row labels) of the train data frame.
train.index

In [None]:
# Number of rows held back for testing.
len(test.index)

# Two Dimensions: Test Train Split
***

In [None]:
# One training sub-frame per quality score (3 through 8).
quality3 = train.loc[train['quality'].eq(3)]
quality4 = train.loc[train['quality'].eq(4)]
quality5 = train.loc[train['quality'].eq(5)]
quality6 = train.loc[train['quality'].eq(6)]
quality7 = train.loc[train['quality'].eq(7)]
quality8 = train.loc[train['quality'].eq(8)]

# Fresh figure and axes.
fig, ax = plt.subplots()

# Training data: circles, one scatter call (and legend entry) per score.
subsets = (quality3, quality4, quality5, quality6, quality7, quality8)
for score, wines in zip(range(3, 9), subsets):
    ax.scatter(wines['alcohol'], wines['pH'], marker='o', label=score)

# Testing data: crosses, plotted as a single series.
ax.scatter(test['alcohol'], test['pH'], marker='x', label='Test data')

# Label the axes and show the legend.
ax.set_xlabel('Alcohol')
ax.set_ylabel('pH')
ax.legend();

# Two Dimensions: Inputs and Outputs
***

In [None]:
# Inputs are the alcohol and pH columns of the training data.
inputs = train[['alcohol', 'pH']]

# The output to predict is the quality score.
outputs = train['quality']

In [None]:
# Peek at the first few rows of the inputs (the alcohol and pH columns).
inputs.head()

In [None]:
# Peek at the first few outputs (the quality scores of the training rows).
outputs.head()

# Two Dimensions: Logistic regression
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
***

In [None]:
# Logistic-regression classifier with a fixed random_state.
lre = lm.LogisticRegression(random_state=0)

# Fit it to the training inputs and outputs.
lre.fit(X=inputs, y=outputs)

In [None]:
# Classify the test rows from the same two columns used for training.
predictions = lre.predict(X=test[['alcohol', 'pH']])
predictions

In [None]:
# Row by row: does the prediction match the true quality?
test['quality'].eq(predictions)

In [None]:
# Score the classifier on the test rows.
lre.score(X=test[['alcohol', 'pH']], y=test['quality'])

# Two Dimensions: Misclassified
***

In [None]:
# Add the predictions as a new column of the test data frame.
# assign() returns a new frame instead of writing into the object handed back
# by train_test_split, which avoids pandas' SettingWithCopyWarning and keeps
# the cell safe to re-run.
test = test.assign(predicted=predictions)
test.head()

In [None]:
# Keep only the test rows whose prediction differs from the true quality.
misclass = test.loc[test['predicted'].ne(test['quality'])]
misclass

In [None]:
# Eyeball the per-column means for each quality score in the training data.
train.groupby('quality').mean()

In [None]:
# Fresh figure and axes.
fig, ax = plt.subplots()

# Plot every wine in df (the full data set), one series per quality.
for quality, data in df.groupby('quality'):
    ax.scatter(data['alcohol'], data['pH'], label=quality)

# Ring the misclassified test rows with large hollow red circles.
ax.scatter(
    misclass['alcohol'],
    misclass['pH'],
    s=200,
    facecolor='none',
    edgecolor='r',
    label='Misclassified',
)

# Label the axes and show the legend.
ax.set(xlabel='Alcohol', ylabel='pH')
ax.legend();

# Using all Possible Inputs
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
***

In [None]:
# Read the red wine quality CSV again so this section starts from a fresh frame.
wine_url = "https://raw.githubusercontent.com/John-Dubber/EmergingTechnologiesAssessment/main/winequality/winequality-red.csv"
df = pd.read_csv(wine_url)

In [None]:
# Split the data frame in two. Fixing random_state makes the shuffle, and so
# the accuracy reported below, the same on each run.
train, test = mod.train_test_split(df, random_state=0)

In [None]:
# Physicochemical columns used as inputs; quality is the output.
# NOTE(review): 'total_sulfur_dioxide' is not in this list, so this is not
# every available column - confirm whether the omission is intended.
feature_columns = [
    'fixed_acidity',
    'volatile_acidity',
    'citric_acid',
    'residual_sugar',
    'chlorides',
    'free_sulfur_dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
inputs = train[feature_columns]
outputs = train['quality']

In [None]:
# Logistic-regression classifier with a fixed random_state.
lre = lm.LogisticRegression(random_state=0)

# Fit it to the red wine training inputs and outputs.
lre.fit(X=inputs, y=outputs)

In [None]:
# Classify the test rows from the same columns the classifier was trained on.
predictions = lre.predict(
    test[[
        'fixed_acidity',
        'volatile_acidity',
        'citric_acid',
        'residual_sugar',
        'chlorides',
        'free_sulfur_dioxide',
        'density',
        'pH',
        'sulphates',
        'alcohol',
    ]]
)
predictions

In [None]:
# Count how many predictions match the true quality (True) and how many do not (False).
test['quality'].eq(predictions).value_counts()

In [None]:
# Proportion of the test rows classified correctly.
lre.score(
    X=test[[
        'fixed_acidity',
        'volatile_acidity',
        'citric_acid',
        'residual_sugar',
        'chlorides',
        'free_sulfur_dioxide',
        'density',
        'pH',
        'sulphates',
        'alcohol',
    ]],
    y=test['quality'],
)

# K Nearest Neighbours Classifier
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
***

In [None]:
# Read the red wine quality CSV again so this section starts from a fresh frame.
wine_url = "https://raw.githubusercontent.com/John-Dubber/EmergingTechnologiesAssessment/main/winequality/winequality-red.csv"
df = pd.read_csv(wine_url)

In [None]:
# Split the data frame in two. Fixing random_state makes the shuffle, and so
# the accuracy reported below, the same on each run.
train, test = mod.train_test_split(df, random_state=0)

In [None]:
# Physicochemical columns used as inputs; quality is the output.
# NOTE(review): 'total_sulfur_dioxide' is not in this list, so this is not
# every available column - confirm whether the omission is intended.
feature_columns = [
    'fixed_acidity',
    'volatile_acidity',
    'citric_acid',
    'residual_sugar',
    'chlorides',
    'free_sulfur_dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
inputs = train[feature_columns]
outputs = train['quality']

In [None]:
# Create a k-nearest-neighbours classifier, leaving every setting at its default.
knn = nei.KNeighborsClassifier()

In [None]:
# Fit the classifier to the training inputs and outputs.
knn.fit(inputs, outputs)

In [None]:
# Score the fitted classifier on the test rows.
knn.score(
    X=test[[
        'fixed_acidity',
        'volatile_acidity',
        'citric_acid',
        'residual_sugar',
        'chlorides',
        'free_sulfur_dioxide',
        'density',
        'pH',
        'sulphates',
        'alcohol',
    ]],
    y=test['quality'],
)

In [None]:
# Predict with the k-nearest-neighbours classifier fitted in this section.
# This must be knn, not the logistic-regression model lre from the previous
# section, so that the counts below describe the classifier scored above.
predictions = knn.predict(test[['fixed_acidity','volatile_acidity','citric_acid','residual_sugar','chlorides','free_sulfur_dioxide','density','pH','sulphates','alcohol']])

# Count how many predictions match the true quality (True) and how many do not (False).
(predictions == test['quality']).value_counts()

In [None]:
# Normalising the counts gives proportions; the True share is the accuracy.
test['quality'].eq(predictions).value_counts(normalize=True)

# Conclusion
***

I believe that there are other factors at play interfering with the data to cause such deviation.
Perhaps it is because quality in this case is a perceived quality and is not scientifically measurable.
Another possibility is that further data is required to improve the accuracy. Considering the number of factors at play and the range of qualities, a 60% accuracy is probably better than I could manage at telling wine quality apart.

***
# End