# Using the Cluster Module in SciPy


In [4]:
import scipy

In [5]:
from pathlib import Path
import numpy as np
from scipy.cluster.vq import whiten, kmeans, vq

In [6]:
# Read the whole dataset, drop leading/trailing whitespace, and keep one entry per line.
data = Path("SMSSpamCollection").read_text().strip().split("\n")

In [8]:
# Pre-allocate an uninitialised integer array with one row per message and two
# columns. Allocating once up front avoids repeatedly growing an array while
# looping over the messages.

digit_counts = np.empty(shape=(len(data), 2), dtype=int)

In [9]:
# Fill one row per message: column 0 is the label, column 1 is the number of
# digit characters in the message text.

for i, line in enumerate(data):
    # Split on the first tab only, so a tab inside the message body cannot
    # break the two-way unpacking.
    case, message = line.split("\t", 1)
    num_digits = sum(c.isdigit() for c in message)
    digit_counts[i, 0] = 0 if case == "ham" else 1         # ham = 0, spam = 1
    digit_counts[i, 1] = num_digits

In [10]:
# Column 0: label (0 = ham, 1 = spam); column 1: number of digits in the message.
digit_counts

array([[ 0,  0],
       [ 0,  0],
       [ 1, 25],
       ...,
       [ 0,  0],
       [ 0,  0],
       [ 0,  0]])

In [11]:
# With return_counts=True, np.unique yields a pair of arrays: the distinct digit
# counts, and how many messages have each of them.
unique_counts = np.unique(digit_counts[:, 1], return_counts=True)

In [12]:
# unique_counts is a pair of arrays: the first holds each distinct number of digits found in a message, and the second holds the number of messages that have that many digits.

unique_counts

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 40, 41, 47]),
 array([4110,  486,  160,   78,   42,   39,   16,   14,   28,   17,   16,
          34,   30,   31,   37,   29,   35,   33,   41,   47,   18,   31,
          28,   36,   34,   16,   16,   13,   19,    9,    2,    6,    3,
           4,    3,    4,    1,    1,    4,    2,    1], dtype=int64))

In [13]:
# Transform shape into one suitable for clustering: one row per distinct digit
# count, laid out as [number of digits, number of messages].
# The table is rebuilt from digit_counts rather than from the previous value of
# unique_counts, so re-running this cell always produces the same array.

unique_counts = np.column_stack(np.unique(digit_counts[:, 1], return_counts=True))
unique_counts

array([[   0, 4110],
       [   1,  486],
       [   2,  160],
       [   3,   78],
       [   4,   42],
       [   5,   39],
       [   6,   16],
       [   7,   14],
       [   8,   28],
       [   9,   17],
       [  10,   16],
       [  11,   34],
       [  12,   30],
       [  13,   31],
       [  14,   37],
       [  15,   29],
       [  16,   35],
       [  17,   33],
       [  18,   41],
       [  19,   47],
       [  20,   18],
       [  21,   31],
       [  22,   28],
       [  23,   36],
       [  24,   34],
       [  25,   16],
       [  26,   16],
       [  27,   13],
       [  28,   19],
       [  29,    9],
       [  30,    2],
       [  31,    6],
       [  32,    3],
       [  33,    4],
       [  34,    3],
       [  35,    4],
       [  36,    1],
       [  37,    1],
       [  40,    4],
       [  41,    2],
       [  47,    1]], dtype=int64)

You use `whiten()` to normalize each feature to have unit variance, which improves the results from `kmeans()`. Then, `kmeans()` takes the whitened data and the number of clusters to create as arguments.

In [14]:
# Rescale the columns with whiten(), then ask k-means for three centroids.
whitened_counts = whiten(obs=unique_counts)
codebook, _ = kmeans(obs=whitened_counts, k_or_guess=3)

In [16]:
# kmeans returns the cluster centroids: one row per cluster and one column per feature of the whitened data (three rows and two columns here).

codebook

array([[2.52050073, 0.01840656],
       [0.85234324, 0.09724666],
       [0.        , 6.49364346]])

In [22]:
# vq labels each row of whitened_counts with the index of a centroid in codebook,
# i.e. it assigns every unique_counts row to a cluster.

codes, _ = vq(obs=whitened_counts, code_book=codebook)

In [18]:
# One cluster index per row of unique_counts, in the same row order.
codes

array([2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [23]:
# The first row (fewest digits) anchors the ham cluster and the last row (most
# digits) anchors the spam cluster; whichever of the three cluster indices is
# left over is treated as the "unknown" cluster.
ham_code, spam_code = codes[0], codes[-1]
unknown_code = next(iter(set(range(3)).symmetric_difference((ham_code, spam_code))))

In [24]:
# Show the last (highest digit count) row that falls in each cluster.
for label, code in (
    ("definitely ham:", ham_code),
    ("definitely spam:", spam_code),
    ("unknown:", unknown_code),
):
    print(label, unique_counts[codes == code][-1])

definitely ham: [   0 4110]
definitely spam: [47  1]
unknown: [20 18]


In [25]:
# Boolean masks over all messages, using the cut-offs read from the cluster
# edges printed above: no digits, more than 20 digits, and everything between.
digits = digit_counts[:, 1]
predicted_hams = np.equal(digits, 0)
predicted_spams = np.greater(digits, 20)
predicted_unknowns = (digits > 0) & (digits <= 20)

In [26]:
# Select the rows of digit_counts (label, digit count) that belong to each predicted group.
ham_cluster, spam_cluster, unk_cluster = (
    digit_counts[mask]
    for mask in (predicted_hams, predicted_spams, predicted_unknowns)
)

In [27]:
# For each predicted group, count how many messages carry each true label
# (column 0: 0 = ham, 1 = spam).
for label, cluster in (
    ("hams:", ham_cluster),
    ("spams:", spam_cluster),
    ("unknowns:", unk_cluster),
):
    print(label, np.unique(cluster[:, 0], return_counts=True))

hams: (array([0, 1]), array([4071,   39], dtype=int64))
spams: (array([0, 1]), array([  1, 232], dtype=int64))
unknowns: (array([0, 1]), array([755, 476], dtype=int64))


In [28]:
# From this output, you can see that 4110 messages fell into the "definitely ham" group, of which 4071 were actually ham and only 39 were spam.

# Using the Optimize Module in SciPy


In [29]:
from scipy.optimize import minimize_scalar

def objective_function(x):
    """Return the value of the quartic 3x^4 - 2x + 1 at ``x``."""
    quartic_term = 3 * x ** 4
    linear_term = 2 * x
    return quartic_term - linear_term + 1

In [31]:
# Minimise objective_function with minimize_scalar's default settings and display the result object.
res = minimize_scalar(objective_function)
res

     fun: 0.17451818777634331
    nfev: 16
     nit: 12
 success: True
       x: 0.5503212087491959

In [33]:
def objective_function(x):
    """Return the value of x^4 - x^2 at ``x``."""
    quartic_term = x ** 4
    quadratic_term = x ** 2
    return quartic_term - quadratic_term

In [34]:
# Default call again, now on the redefined objective_function; the result shown below lands on the positive minimiser (x ≈ 0.707).
res = minimize_scalar(objective_function)
res

     fun: -0.24999999999999994
    nfev: 15
     nit: 11
 success: True
       x: 0.7071067853059209

In [35]:
# Passing bracket=(-1, 0) does not keep the answer inside that interval: the result shown below (x ≈ 0.707) lies outside it.
res = minimize_scalar(objective_function, bracket=(-1, 0))
res

     fun: -0.24999999999999997
    nfev: 17
     nit: 13
 success: True
       x: 0.7071067809244586

In [36]:
# With method='bounded' and bounds=(-1, 0), the result shown below is the negative minimiser (x ≈ -0.707), inside the given interval.
res = minimize_scalar(objective_function, method='bounded', bounds=(-1, 0))
res

     fun: -0.24999999999998732
 message: 'Solution found.'
    nfev: 10
  status: 0
 success: True
       x: -0.707106701474177

### Minimizing a Function With Many Variables


In [37]:
import numpy as np
from scipy.optimize import minimize, LinearConstraint

# Number of buyers; sizes the price and budget arrays generated below.
n_buyers = 10
# Total number of shares; used below as both the lower and upper limit of the linear constraint.
n_shares = 15

In [38]:
# Seed the global NumPy generator so the prices and budgets below are reproducible.
np.random.seed(10)
# One random price in [0, 1) per buyer, then one whole-number budget of 1, 2 or 3 per buyer.
prices = np.random.random(size=n_buyers)
money_available = np.random.randint(low=1, high=4, size=n_buyers)

In [42]:
# Largest number of shares each buyer can afford: budget divided by price.
n_shares_per_buyer = np.divide(money_available, prices)
print("\n".join(str(values) for values in (money_available, prices, n_shares_per_buyer)))

[1 1 1 3 1 3 3 2 1 1]
[0.77132064 0.02075195 0.63364823 0.74880388 0.49850701 0.22479665
 0.19806286 0.76053071 0.16911084 0.08833981]
[ 1.29647768 48.18824404  1.57816269  4.00638948  2.00598984 13.34539487
 15.14670609  2.62974258  5.91328161 11.3199242 ]


In [43]:
# Equality constraint: with a row of ones as the coefficient matrix and lb == ub,
# the per-buyer allocations must sum to exactly n_shares.
constraint = LinearConstraint(A=np.ones(n_buyers), lb=n_shares, ub=n_shares)

In [46]:
# One (lower, upper) pair per buyer: from zero shares up to the most that buyer can afford.
bounds = [(0, max_shares) for max_shares in n_shares_per_buyer]
bounds

[(0, 1.296477682439221),
 (0, 48.18824403823818),
 (0, 1.5781626853523065),
 (0, 4.006389483224008),
 (0, 2.0059898362934296),
 (0, 13.3453948697305),
 (0, 15.146706090719757),
 (0, 2.629742583593113),
 (0, 5.913281610609325),
 (0, 11.31992419669592)]