In [None]:
%matplotlib inline 
# make your plot outputs appear and be stored within the notebook
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()  # for plot styling
import numpy as np
import pandas as pd

In [None]:
# Load the red-wine quality data; the file is semicolon-delimited.
dfred = pd.read_csv("winequality-red.csv", sep=';')
# Rich (HTML) preview of the first ten rows; `display` is provided by the
# IPython kernel, so no import is needed inside a notebook.
display(dfred.head(10))
# Column dtypes and non-null counts.
dfred.info()

In [None]:
# Standardize every column so that all variables carry equal weight.
# Clustering is distance-based, so unscaled variables with large ranges
# would otherwise dominate how clusters are formed.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_values = scaler.fit_transform(dfred)
df = pd.DataFrame(scaled_values, columns=dfred.columns, index=dfred.index)

df.head()

In [None]:
# Summary statistics of the raw (unscaled) quality column.
dfred['quality'].describe()

In [None]:
# Summary statistics of the standardized quality column.
# Author's note from the original run: the transformed quality ranged from -3.27 to 2.93.
df['quality'].describe()

In [None]:
## Pairwise correlation matrix of the standardized columns (input to the heatmap below)
corr = df.corr()

In [None]:
## Correlation heatmap, adapted from
## https://seaborn.pydata.org/examples/many_pairwise_correlations.html

sns.set(style='white')

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the replacement.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap so positive and negative correlations are told apart.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap on the axes created above, with the mask and a square aspect ratio.
sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, center=0, square=True,
            linewidths=.5, cbar_kws={"shrink": .5}, cmap=cmap, ax=ax)
ax.set_title('Pairwise correlations of the standardized wine variables');


From the correlation plot, we can see that quality is strongly positively correlated with alcohol, and negatively correlated with density and volatile acidity.

## 1. Use K Means Cluster Analysis to identify cluster(s) of observations that have high and low values of the wine quality. (Assume all variables are continuous.) Describe variables that cluster with higher values of wine quality. Describe variables that cluster with lower values of wine quality. If you want to make a good bottle of wine, then what characteristics are most important according to this analysis?

"https://towardsdatascience.com/clustering-with-k-means-1e07a8bfb7ca"

In [None]:
from sklearn.cluster import KMeans

In [None]:
## Elbow plot: within-cluster sum of squares (inertia) for each candidate
## number of clusters, used to choose k.

k_values = range(2, 12)
with_cluster_variation = []

for k in k_values:
    # A fixed seed and an explicit n_init make the curve identical on every
    # re-run; without them each run starts from different random centroids.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(df)
    with_cluster_variation.append(kmeans.inertia_)


fig = plt.figure(figsize=(15, 5))
plt.plot(k_values, with_cluster_variation, marker='o')
plt.grid(True)  # grid lines make the bend easier to read
plt.xlabel('Number of clusters')
# inertia_ is the within-cluster sum of squares itself, not a reduction of it.
plt.ylabel('Within-cluster sum of squares (inertia)')
plt.title('Elbow Plot');

There is no clear elbow in this plot, but 7 clusters seems reasonable: beyond 7 clusters, the reduction in within-cluster variation slows down.

In [None]:
def pd_centers(cols_of_interest, centers):
    """Return the cluster centres as a DataFrame with a cluster-id column.

    Parameters
    ----------
    cols_of_interest : iterable of str
        Names of the variables, in the same order as the coordinates of
        each centre. The iterable itself is not modified.
    centers : iterable of array-like
        One coordinate vector per cluster.

    Returns
    -------
    pandas.DataFrame
        One row per centre: the coordinate columns followed by an integer
        ``prediction`` column holding the centre's position in ``centers``.
    """
    column_labels = [*cols_of_interest, 'prediction']

    # Tag each centre with its positional index as a trailing value.
    rows = []
    for cluster_id, center in enumerate(centers):
        rows.append(np.append(center, cluster_id))

    centers_frame = pd.DataFrame(rows, columns=column_labels)
    # np.append promoted the index to float; restore it to an integer id.
    centers_frame['prediction'] = centers_frame['prediction'].astype(int)
    return centers_frame

In [None]:
## Fit the final model with 7 clusters.
# A fixed seed and an explicit n_init keep the cluster numbering and centres
# stable across re-runs, so the discussion of the centres table stays valid.
kmean_7 = KMeans(n_clusters=7, n_init=10, random_state=42).fit(df)
# Tabulate the centres with one row per cluster and a 'prediction' id column.
kmean_result = pd_centers(df.columns, kmean_7.cluster_centers_)

In [None]:
# Show the k-means cluster centres (one row per cluster, in standardized units).
kmean_result

In [None]:
# Cluster assignment for every row of the standardized data.
kmean_7_labels = kmean_7.predict(df)

In [None]:
# The unique labels run from 0 to 6, i.e. there are 7 clusters.
set(kmean_7_labels)

In [None]:
# Scatter of the first two standardized columns, coloured by k-means cluster.
# One loop replaces seven copy-pasted scatter calls; colours and legend
# entries are the same as before.
cluster_colors = ['red', 'blue', 'green', 'purple', 'orange', 'gray', 'yellow']

fig, ax = plt.subplots()
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = kmean_7_labels == cluster_id
    ax.scatter(df.values[in_cluster, 0], df.values[in_cluster, 1],
               s=50, marker='o', color=color, label=f'cluster {cluster_id + 1}')
# Name the plotted variables so the figure can be read on its own.
ax.set_xlabel(df.columns[0])
ax.set_ylabel(df.columns[1])
ax.legend()
ax.set_title('KMeans Clustering')
plt.show()

From the table above, we can see that:

1. high quality is positively associated with alcohol;
2. high density is associated with low quality, and low density with high quality;
3. high volatile acidity is associated with low quality, and low volatile acidity with high quality.

Thus, I think alcohol, density and volatile acidity may be associated with the quality of wine. 

## 2. Use Hierarchical Cluster Analysis to identify cluster(s) of observations that have high and low values of the wine quality. (Assume all variables are continuous.) Use complete linkage and the same number of groups that you found to be the most meaningful in question 1. Describe variables that cluster with higher values of wine quality. Describe variables that cluster with lower values of wine quality. If you want to make a good bottle of wine, then what characteristics are most important according to this analysis? Have your conclusions changed using Hierarchical clustering rather than k means clustering? Present any figures that assist you in your analysis.

https://towardsdatascience.com/machine-learning-algorithms-part-12-hierarchical-agglomerative-clustering-example-in-python-1e18e0075019

https://towardsdatascience.com/https-towardsdatascience-com-hierarchical-clustering-6f3c98c9d0ca

### two types of hierarchical clustering algorithms
- agglomerative: bottom-up approach; start with many small clusters and merge them together to create bigger clusters;
- divisive: top-down approach; start with a single cluster, then break it up into smaller clusters.


In [None]:
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch

In [None]:
# Hierarchical (agglomerative) clustering of the standardized observations.
#
# Step 1: pdist computes the Euclidean distance between every pair of rows.
# Step 2: the linkage criterion decides how the distance between two
#         clusters is derived from those pairwise distances:
#   - complete: the longest distance between a point in one cluster and a
#               point in the other;
#   - single:   the shortest such distance;
#   - average:  the average distance between every point in one cluster and
#               every point in the other;
#   - ward:     merges the pair of clusters whose union gives the smallest
#               increase in total within-cluster variance.
pairwise_distances = pdist(df, metric='euclidean')
row_clusters = sch.linkage(pairwise_distances, method='complete')
# row_clusters is the linkage matrix; it is what sch.dendrogram draws from.

In [None]:
# Dendrogram of the complete-linkage fit.
fig, ax = plt.subplots(figsize=(15, 5))
# no_labels=True: every observation is a leaf, so the tick labels would
# overlap into an unreadable band along the x-axis.
dendrogram_complete = sch.dendrogram(row_clusters, no_labels=True, ax=ax)
ax.set_xlabel('Observations')
ax.set_ylabel('Euclidean Distance')
ax.set_title('Dendrogram (complete linkage)')
# tight_layout goes last so it accounts for the labels and title set above.
fig.tight_layout()
plt.show()

In [None]:
# Dendrogram of a Ward-linkage fit, for comparison with complete linkage.
fig, ax = plt.subplots(figsize=(15, 5))
# no_labels=True: every observation is a leaf, so the tick labels would
# overlap into an unreadable band along the x-axis.
dendrogram_wald = sch.dendrogram(sch.linkage(df.values, method='ward'),
                                 no_labels=True, ax=ax)
ax.set_xlabel('Observations')
ax.set_ylabel('Merge distance')
ax.set_title('Dendrogram (Ward linkage)')
fig.tight_layout()
plt.show()

It is hard to determine the number of clusters using complete linkage. However, the Ward-linkage dendrogram suggests seven clusters, which is the same result as the k-means method. 

In [None]:
# Complete-linkage agglomerative clustering with the same number of groups
# (7) that was chosen for k-means.
# The `affinity` argument was deprecated in scikit-learn 1.2 and removed in
# 1.4. Euclidean distance is the default, so leaving it out gives the same
# model on both old and new versions.
hc_model = AgglomerativeClustering(n_clusters=7, linkage='complete')
hc_model.fit(df)
# Cluster id assigned to each observation.
labels = hc_model.labels_

In [None]:
# The unique labels run from 0 to 6, i.e. there are 7 clusters.
set(labels)

In [None]:
# Work on a copy. `df_hc = df` would only alias the same object, so adding
# the `labels` column would also add it to `df` and leak the cluster ids
# into every later use of `df` (for example the PCA fits).
df_hc = df.copy()
df_hc['labels'] = labels

In [None]:
# Mean of every column within each hierarchical cluster (one row per label).
hc_result = df_hc.groupby('labels').mean()
hc_result

In [None]:
# Scatter of the first two standardized columns, coloured by the
# complete-linkage hierarchical cluster. One loop replaces seven copy-pasted
# scatter calls; colours and legend entries are the same as before.
cluster_colors = ['red', 'blue', 'green', 'purple', 'orange', 'gray', 'yellow']

fig, ax = plt.subplots()
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = labels == cluster_id
    ax.scatter(df.values[in_cluster, 0], df.values[in_cluster, 1],
               s=50, marker='o', color=color, label=f'cluster {cluster_id + 1}')
# Name the plotted variables so the figure can be read on its own.
ax.set_xlabel(df.columns[0])
ax.set_ylabel(df.columns[1])
ax.legend()
ax.set_title('Hierarchical Clustering')  # spelling corrected from 'Hierachical'
plt.show()

In [None]:
# Side-by-side comparison of the k-means and hierarchical assignments on the
# first two standardized columns.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 15))

cluster_colors = ['red', 'blue', 'green', 'purple', 'orange', 'gray', 'yellow']
# (axes, cluster assignment per row, panel title)
panels = [
    (axs[0], kmean_7_labels, 'KMeans Clustering'),
    (axs[1], labels, 'Hierarchical Clustering'),  # spelling corrected from 'Hierachical'
]

for panel_ax, assignments, panel_title in panels:
    # Same colour for the same cluster id in both panels.
    for cluster_id, color in enumerate(cluster_colors):
        in_cluster = assignments == cluster_id
        panel_ax.scatter(df.values[in_cluster, 0], df.values[in_cluster, 1],
                         s=50, marker='o', color=color,
                         label=f'cluster {cluster_id + 1}')
    # Name the plotted variables so each panel can be read on its own.
    panel_ax.set_xlabel(df.columns[0])
    panel_ax.set_ylabel(df.columns[1])
    panel_ax.legend()
    panel_ax.set_title(panel_title)


The results from k-means clustering and hierarchical clustering (Ward) are almost the same. However, complete linkage leads to slightly different clusters compared with the k-means results. 

In [None]:
# Show the per-cluster means again, next to the interpretation that follows.
hc_result

From the table above, we can see that:

1. high quality is positively associated with alcohol;
2. high sulphates are associated with low quality, and low sulphates with high quality;
3. high chlorides are associated with low quality, and low chlorides with high quality.

Thus, I think alcohol, sulphates and chlorides may be associated with the quality of wine. 

## 3. Use Principal Components Analysis to reduce the dimensions of your data. How much of the variation in your data is explained by the first two principal components? How might you use the first two components to do supervised learning on some other variable tied to wine (e.g. - wine price)?

In [None]:
from sklearn.decomposition import PCA

In [None]:
## Cumulative explained variance, used to decide how many components to keep
pca = PCA().fit(df)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# The i-th entry is the variance explained by the first i components, so the
# x-axis must start at 1. Plotting against the default index (from 0) shifts
# every point one component to the left.
component_counts = np.arange(1, len(cumulative_variance) + 1)
plt.plot(component_counts, cumulative_variance, marker='o')
plt.grid(True)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.title('PCA: cumulative explained variance');

In [None]:
## Number of components needed to reach each explained-variance target.
# Passing a float in (0, 1) as n_components makes PCA keep the smallest
# number of components whose cumulative explained variance reaches it.
# The author's original run reported 6, 7, 9 and 10 components respectively.
for target_pct in (80, 85, 90, 95):
    n_needed = PCA(target_pct / 100).fit(df).n_components_
    print(f"To explain {target_pct}% of the variation of the data, we need {n_needed} components")

In [None]:
## Share of the total variance captured by the first two components.
two_component_pca = PCA(n_components=2).fit(df)
explained_pct = 100 * sum(two_component_pca.explained_variance_ratio_)
print(f"Variation explained by first two components: {explained_pct:.0f}%")
## The author's original run reported about 44%.


In [None]:
# Fit a two-component PCA on df; the fitted model is used to project the observations.
pca_2 = PCA(n_components=2).fit(df)


In [None]:
# Project every observation onto the first two principal components.
components_2 = pca_2.transform(df)
# Only the number of projected rows is shown. The bare `components_2` that
# used to sit here displayed nothing, because only the last expression of a
# cell is rendered.
len(components_2)

By reducing the 12 features to 2 principal components, we can then use the transformed components as predictors in a linear regression model to predict an outcome such as price. 