In [1]:
# Import Dependencies
import plotly.express as px
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

## Wine Quality

## Description
The combined dataset includes red and white variants of the Portuguese "Vinho Verde" wine. Only physicochemical (inputs) and sensory (the output) variables are available (e.g. there is no data about grape types, wine brand, wine selling price, etc.).

#### Attribute Information:

Continuous input variables (based on physicochemical tests):
- fixed acidity
- volatile acidity
- citric acid
- residual sugar
- chlorides
- free sulfur dioxide
- total sulfur dioxide
- density
- pH
- sulphates
- alcohol

Labeled input variables (based on sensory and category):
- quality (score between 0 and 10)
- color (White: 0; Red: 1)



## Source
https://archive.ics.uci.edu/ml/datasets/wine+quality

In [5]:
# Read the csv file into a pandas DataFrame called `wine_df`

### YOUR CODE HERE
wine_df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv')  
wine_df.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
1,7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5
2,7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...
3,11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...
4,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5


## Use K-Means to predict clusters

## Find the best value for _k_ using an Elbow Curve

In [None]:
# Setup for loop

### YOUR CODE HERE
inertia = []
k = list(range(1, 11))

In [None]:
# Look for the best k

### YOUR CODE HERE
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(wine_df)
    inertia.append(km.inertia_)

In [None]:
# Plot the Elbow Curve using plotly

### YOUR CODE HERE
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

px.scatter(x=k,y=inertia,labels={"x":"Number of clusters","y":"Inertia"}) 

In [None]:
# Initialize a model with K based on the Elbow plot above

### YOUR CODE HERE
model = KMeans(n_clusters=3, random_state=5)

In [None]:
# Fit the model

### YOUR CODE HERE
model.fit(wine_df)

In [None]:
# Add a new `class` column to wine_df

### YOUR CODE HERE
wine_df["class"] = model.labels_
wine_df.head()

In [None]:
# Create a 3D scatter plot to visualize our clustering 
# usings `total sulfur dioxide` and any two other features

### YOUR CODE HERE
px.scatter_3d(wine_df,x="total sulfur dioxide",y="fixed acidity",z="volatile acidity",color="class")

Notice how `total sulfur dioxide` basically drives the decision of `class`. Why might this be? Do you think the scale of values compared to all other features matters? Confirm this hypothesis by next performing scaling.  

## Standard Scalar

Documentation: [StandarScaler()](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)

In [None]:
# Drop the class from our wine data

### YOUR CODE HERE
wine_df = wine_df.drop("class",axis=1)

In [None]:
# Scale all fields with the StandardScaler() function based on the documentation above.
# Initialize a `StandardScaler` object

### YOUR CODE HERE
scaler = StandardScaler()

In [None]:
# Fit and Transform the `wine_df` with the StandardScaler 
# Store the results in the variable `scaled_features`

### YOUR CODE HERE
scaled_features = scaler.fit_transform(wine_df)

In [None]:
# Creates a new DataFrame based on the `scaled_featuers` and the original index and column values
wine_df_scaled = pd.DataFrame(scaled_features, index=wine_df.index, columns=wine_df.columns)
wine_df_scaled.head()

## Find a new best value for _k_ of scaled data using an Elbow Curve

In [None]:
# Setup for loop

### YOUR CODE HERE
inertia = []
k = list(range(1, 11))

In [None]:
# Look for the new best k

### YOUR CODE HERE
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(wine_df_scaled)
    inertia.append(km.inertia_)

In [None]:
# Plot the Elbow Curve using plotly

### YOUR CODE HERE
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

px.scatter(x=k,y=inertia,labels={"x":"Number of clusters","y":"Inertia"})

In [None]:
# Initialize a mew model with K based on the Elbow plot above

### YOUR CODE HERE
model_scaled = KMeans(n_clusters=4, random_state=5)

In [None]:
# Fit the new model

### YOUR CODE HERE
model_scaled.fit(wine_df_scaled)

In [None]:
# Add a new class column to wine_df_scaled

### YOUR CODE HERE
wine_df_scaled["class"] = model_scaled.labels_
wine_df_scaled.head()

In [None]:
# Create a 3D scatter plot to visualize our clustering 
# usings the same features as before

### YOUR CODE HERE
px.scatter_3d(wine_df_scaled,x="total sulfur dioxide",y="fixed acidity",z="volatile acidity",color="class")

Does `total sulfur dioxide` still drive the decision of `class` or do new features have an impact?