# The One Goal For Today

Understand how normalizing the data first can lead to better or more efficient clustering and classification models.

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import scipy

# Load and Look at Your Data

The data set we will be analyzing is our usual car dataset from Craigslist. 

First we load the data.

In [5]:
# Human-readable names for the nine columns we load.
# NOTE(review): presumably in the same order as usecols below; this list is not
# referenced by any other code visible in the notebook.
columns = ["price", "year", "manufacturer", "model", "condition", "fuel", "odometer", "title_status", "transmission"]
# Maps a column position in the loaded array -> list of the unique raw strings
# seen in that column; read by converter() during the second load.
colValues = {}

# First pass: load every selected column as a string so the unique values of
# each column can be collected before any numeric encoding happens.
data = np.array(np.genfromtxt('data/vehicles.csv', delimiter=',', usecols=(1,2,3,4,5,7,8,9,11), skip_header=1, dtype=str, encoding='utf-8'))  

# Record the unique values of each column. The values are strings at this
# point, so np.unique orders them lexicographically ('1000' before '2').
for colIndex in range(data.shape[1]):
    colValues[colIndex] = np.unique(data[:, colIndex]).tolist()
    # prints the complete list for every column, which is long for numeric ones
    print(colIndex, colValues[colIndex])

def converter(x, colIndex):
    """Encode a raw string as its position in that column's list of unique values.

    colIndex is a key of the module-level colValues dict. list.index raises
    ValueError when x is not one of the values recorded for that column.
    """
    knownValues = colValues[colIndex]
    return knownValues.index(x)
    
# Second pass: reload the same columns as integers. The converters dict is keyed
# by column number in the CSV file (the same numbering as usecols), while the
# second argument passed to converter() is that column's position in the loaded
# array, which is how colValues is keyed (file column 3 -> position 2, ...,
# file column 11 -> position 8). Columns without a converter (file columns
# 1, 2 and 8) are parsed directly as integers.
data = np.array(np.genfromtxt('data/vehicles.csv', delimiter=',', usecols=(1,2,3,4,5,7,8,9,11), converters={3: lambda x: converter(x, 2), 4: lambda x: converter(x, 3), 5: lambda x: converter(x, 4), 7: lambda x: converter(x,5), 9: lambda x: converter(x, 7), 11: lambda x: converter(x, 8)}, skip_header=1, dtype=int, encoding='utf-8'))  

0 ['0', '1', '1000', '10000', '10199', '10200', '10300', '10399', '10400', '10450', '10475', '10488', '10491', '10494', '10495', '10498', '10500', '10550', '10600', '10650', '10691', '10695', '10700', '10800', '10844', '10888', '10900', '10950', '10977', '10980', '10988', '10989', '10990', '10995', '10999', '11000', '11011', '11200', '11250', '11272', '11375', '11380', '11450', '11480', '11493', '11495', '11499', '1150', '11500', '11515', '11550', '11600', '11700', '11795', '11900', '11901', '1195', '11950', '11976', '11985', '11987', '11988', '11990', '11995', '11997', '11999', '1200', '12000', '12177', '12250', '12400', '12450', '12488', '12495', '12499', '1250', '12500', '12588', '12595', '1275', '12775', '12835', '12875', '12889', '12900', '1295', '12950', '12977', '12990', '12995', '12999', '1300', '13000', '13299', '13300', '13325', '13400', '13440', '13450', '13488', '13490', '13494', '13495', '13500', '13599', '13600', '13650', '13750', '13753', '13795', '13850', '13860', '1388

Then we get summary statistics.

In [6]:
def getSummaryStatistics(data):
    """Print a header line and return per-column summary statistics.

    Each statistic is computed along axis 0 and becomes one row of the
    returned DataFrame, in the order min, max, mean, std.
    """
    print("min, max, mean, std per variable")
    statisticNames = ("min", "max", "mean", "std")
    rows = [getattr(data, statisticName)(axis=0) for statisticName in statisticNames]
    return pd.DataFrame(rows)

def getShapeType(data):
    """Print a "shape" header and return the pair (data.shape, data.dtype)."""
    print("shape")
    shape, dtype = data.shape, data.dtype
    return shape, dtype

# Each helper prints its own header line before it returns, so the header
# appears directly above the value printed here.
print(getSummaryStatistics(data))
print(getShapeType(data))

min, max, mean, std per variable
              0            1         2           3         4         5  \
0      0.000000  1999.000000  0.000000    0.000000  0.000000  0.000000   
1  55000.000000  2021.000000  1.000000  340.000000  5.000000  4.000000   
2   7978.281507  2012.381887  0.445535  159.225142  1.061431  2.012350   
3   5542.906703     3.997048  0.497025   96.160745  1.225660  0.138432   

              6         7         8  
0  0.000000e+00  0.000000  0.000000  
1  9.999999e+06  5.000000  2.000000  
2  1.124405e+05  0.209943  0.886954  
3  2.546163e+05  0.910812  0.424340  
shape
((3158, 9), dtype('int64'))


# Split the data

If we are doing supervised machine learning, we split the data into train and test. 

However, here we are doing clustering, so we don't.

In [7]:
# No train/test split for clustering: train is the full dataset
# (the same array object as data, not a copy).
train = data

If we had a clear dependent variable (as we do with the car logo dataset) we'd strip it off. However, here we don't.


There is no dependent variable here.

In [8]:
# Supervised template, kept for reference: the last column would be the
# dependent variable and the remaining columns the features.
#y_train = train[:, -1]
#x_train = train[:, 0:-1]
#y_test = test[:, -1]
#x_test = test[:, 0:-1]
# Clustering has no dependent variable, so every column is used as a feature.
x_train = train

# Normalization Review

Here we implement max-min global, max-min local, z-score and center. This code comes from day 20.

This code you can use as a **tool**.

**If you are using separate training and test data, you want to normalize to the mean (min, max, std) of the _training data_.**

In [9]:
def normalize(data, min, max, mean, std, method='center'):
    """Normalize data using precomputed statistics.

    The statistics are passed in (rather than computed here) so that the same
    training-set statistics can be applied to other data.

    Parameters
    ----------
    data : array-like
        Values to normalize.
    min, max : scalar or per-column array
        Used by the max-min methods. Pass whole-array values for
        'max-min-global' and per-column values for 'max-min-local'; the
        formula is the same, only the statistics differ. (The names shadow
        the builtins inside this function; they are kept for compatibility.)
    mean : scalar or per-column array
        Used by 'center' and 'zscore'.
    std : scalar or per-column array
        Used by 'zscore'.
    method : str
        One of 'center', 'max-min-global', 'max-min-local', 'zscore'.

    Returns
    -------
    The normalized data. A column whose range (max - min) or std is zero is
    mapped to 0 instead of producing NaN/inf from a division by zero.

    Raises
    ------
    ValueError
        If method is not one of the supported names.
    """
    if method == 'center':
        return data - mean
    elif method in ('max-min-global', 'max-min-local'):
        value_range = np.asarray(max - min, dtype=float)
        # a constant column has max == min; its numerator is 0 everywhere, so
        # dividing by 1 maps it to 0 rather than 0/0 = NaN
        safe_range = np.where(value_range == 0, 1.0, value_range)
        return (data - min) / safe_range
    elif method == 'zscore':
        spread = np.asarray(std, dtype=float)
        # same guard for columns with zero standard deviation
        safe_spread = np.where(spread == 0, 1.0, spread)
        return (data - mean) / safe_spread
    else:
        # ValueError is a subclass of Exception, so existing handlers still work
        raise ValueError("I can't do " + method)

Let's try it!

**When you are doing supervised machine learning, you always want to normalize using statistics (mean, min, max) from your training data**.

In [10]:
# Global statistics: one value computed over the whole array.
min_g, max_g = np.min(x_train), np.max(x_train)
# Local statistics: one value per column.
min_l, max_l = np.min(x_train, axis=0), np.max(x_train, axis=0)
mean, std = np.mean(x_train, axis=0), np.std(x_train, axis=0)

normalized_train = normalize(x_train, min_l, max_l, mean, std, method='max-min-local')
# Alternatives to try instead of the line above:
# normalized_train = normalize(train, min_g, max_g, mean, std, method='center')
# normalized_train = normalize(train, min_g, max_g, mean, std, method='max-min-global')
# normalized_train = normalize(train, min_g, max_g, mean, std, method='zscore')

# K-means Clustering Review

The code below comes from day 22.

You can use this code as a **tool**.

In [11]:
def distance(a, b):
    """Return the Euclidean distance between the vectors a and b."""
    delta = a - b
    squared_length = np.dot(delta.T, delta)
    return np.sqrt(squared_length)

def get_distances(item, centroids):
    """Return a list holding the distance from item to each centroid, in order."""
    distances = []
    for centroid in centroids:
        distances.append(distance(item, centroid))
    return distances

def update_clusters(data, centroids):
    """Assign every data point to its nearest centroid.

    Parameters: data is an iterable of data points, centroids the current
    list of centroids.

    Returns (clusters, mappings): clusters maps each centroid index to the
    list of points assigned to it (possibly empty); mappings maps each data
    point's position in data to the index of its centroid.
    """
    # every centroid starts with an empty member list
    clusters = {index: [] for index in range(len(centroids))}
    mappings = {}
    for position, point in enumerate(data):
        # index of the centroid closest to this point
        nearest = np.argmin(get_distances(point, centroids))
        clusters[nearest].append(point)
        mappings[position] = nearest
    return clusters, mappings

def update_centroids(clusters, oldcentroids):
    """Return the new list of centroids, one per entry of clusters.

    A cluster's new centroid is the mean of its members; a cluster that
    received no data keeps its previous centroid from oldcentroids.
    """
    def _recompute(centroidid, members):
        # gracefully handle the case where no data ended up in the cluster
        if len(members) == 0:
            return oldcentroids[centroidid]
        return np.mean(members, axis=0)

    return [_recompute(centroidid, members) for centroidid, members in clusters.items()]

def inertia(data, centroids, clusters):
    """Return the mean squared distance between each point and its centroid.

    The squared distances are summed over the members of every cluster and
    the total is divided by len(data).
    """
    total = 0
    for cluster_id, members in clusters.items():
        for point in members:
            # squared distance between this point and its cluster's centroid
            total += distance(point, centroids[cluster_id]) ** 2
    # average over the data
    return total / len(data)

def fit_kmeans(data, k, cutoff=1):
    """Cluster data into k groups with k-means.

    Parameters
    ----------
    data : indexable sequence of data points (e.g. a 2-D array, one row per point)
    k : int
        Number of clusters; k distinct data points are drawn at random
        (np.random.choice without replacement) as the initial centroids.
    cutoff : number
        Iteration stops once the inertia changes by less than this amount
        between two consecutive passes.
        NOTE(review): this is an absolute threshold, so its effect depends on
        the scale of data; confirm the default of 1 is appropriate for data
        that has been normalized to a small range.

    Returns
    -------
    (centroids, clusters, mappings, inertia) for the final pass, where
    clusters and mappings are as returned by update_clusters.
    """
    # make some initial centroids from k distinct, randomly chosen data points
    initial_indices = np.random.choice(np.arange(len(data)), size=k, replace=False)
    centroids = np.array([data[x] for x in initial_indices])
    # None marks "no previous pass yet". A numeric sentinel combined with a
    # "> 0" test would never allow stopping once the inertia is exactly 0
    # (e.g. every point sits on its own centroid), looping forever.
    last_inertia = None
    while True:
        # get the clusters for these centroids
        clusters, mappings = update_clusters(data, centroids)
        # calculate the inertia for this clustering
        this_inertia = inertia(data, centroids, clusters)
        # stop when the inertia stops changing very much
        if last_inertia is not None and abs(last_inertia - this_inertia) < cutoff:
            break
        last_inertia = this_inertia
        # update the centroids
        centroids = update_centroids(clusters, centroids)
    return centroids, clusters, mappings, this_inertia

On Wednesday we talked about the Silhouette coefficient as a way to evaluate the goodness of a clustering. We will use the scikit-learn implementation today.

In [12]:
from sklearn.metrics import silhouette_score

# Usage template (left as a comment: no clustering has been fit at this point,
# so there are no labels to score yet). The second argument is one cluster
# label per row of the data, in row order:
#   labels = [mappings[i] for i in sorted(mappings)]
#   s = silhouette_score(data, labels, metric='euclidean')



# Impact of normalization on K-Means clustering

Fill in this table.
1. Try all the types of normalization plus k-means clustering. Use a reasonable value for $k$ in k-means clustering, like 6 (maybe it will cluster them by condition!).
2. Try at least one type of normalization (centering!) plus PCA plus k-means clustering. Use the same value of $k$ for k-means clustering as you have so far. Pick a number of principal components that lets you keep at least 80% of the cumulative sum of variance.

| Normalization | PCA (None or k) | K-means k | Silhouette score | Time |
| ------------- | --------------- | --------- | ---------------- | ---- |
| None | None | 7 | 0.49563619444703105| 6.2|
| Centering | None | 7 | 0.49563619443355067| 7.39|
| Max-min global | None | 7 | 0.44644257060903064 | 6.2 |
| Max-min local | None | 7 | 0.34687496989437483| 5.48 |
| Z-score | None | 7| 0.1566625885440842 | 5.48|
| ?? | ?? | 7 | | |



A higher k gives a lower silhouette coefficient (SC): more clusters means more overlap between them.




In [13]:
# number of clusters passed to every fit_kmeans call below
k = 7

In [14]:
# Global statistics: one value computed over the whole array.
min_g, max_g = np.min(x_train), np.max(x_train)
# Local statistics: one value per column.
min_l, max_l = np.min(x_train, axis=0), np.max(x_train, axis=0)
mean, std = np.mean(x_train, axis=0), np.std(x_train, axis=0)

In [15]:
%time 

normalized_train = normalize(x_train, min_g, max_g, mean, std, method='center')
centroids, clusters, mappings, _ = fit_kmeans(normalized_train, k)
silhouette_score(normalized_train, [x[1] for x in sorted(mappings.items())], metric='euclidean')

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs


0.49563619443355067

In [20]:
%time 

normalized_train = normalize(x_train, min_g, max_g, mean, std, method='max-min-global')
centroids, clusters, mappings, _ = fit_kmeans(normalized_train, k)
silhouette_score(normalized_train, [x[1] for x in sorted(mappings.items())], metric='euclidean')

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.2 µs


0.44644257060903064

In [17]:
%time 

normalized_train = normalize(x_train, min_g, max_g, mean, std, method='max-min-local')
centroids, clusters, mappings, _ = fit_kmeans(normalized_train, k)
silhouette_score(normalized_train, [x[1] for x in sorted(mappings.items())], metric='euclidean')

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.48 µs


0.34687496989437483

In [18]:
%time 

normalized_train = normalize(x_train, min_g, max_g, mean, std, method='zscore')
centroids, clusters, mappings, _ = fit_kmeans(normalized_train, k)
silhouette_score(normalized_train, [x[1] for x in sorted(mappings.items())], metric='euclidean')

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.48 µs


0.1566625885440842

In [19]:
%time 

normalized_train = train
centroids, clusters, mappings, _ = fit_kmeans(normalized_train, k)
silhouette_score(normalized_train, [x[1] for x in sorted(mappings.items())], metric='euclidean')

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.2 µs


0.49563619444703105

**Bonus**: Now think about PCA. If we had a dataset with 1000 independent variables (like our car logo data), what do you think might be the impact of PCA-first on silhouette coefficient, and on time?


PCA first should give better clustering.

It should also be more time efficient.


For k-nearest neighbors:
it would be more time efficient, but we cannot say anything about accuracy until we actually try it.

The parameters that k-nearest neighbors and k-means clustering have in common are k and the distance measure.
K-means clustering versus k-nearest neighbors:
k is how many clusters to form (k-means) or how many neighbors to look at when making a classification decision (k-nearest neighbors). In k-nearest neighbors, the dependent variable is given by the y variable.


For k-means clustering on this dataset, the less normalization the better (judging by the silhouette scores in the table above).





The silhouette coefficient ranges from -1 to 1.

Tight, well-separated clusters score closest to 1.

A value near 0 means the clusters overlap.

A negative value means points were assigned to the wrong cluster.

For k-nearest neighbors, the fewer neighbors the more time efficient. Use an elbow plot of accuracy against k to choose k.



Max-min local produces smaller distances, so the clusters overlap more.

Z-score pulls everything closer together, so the clusters overlap more.

Accuracy ranges from 0 to 1.
