In [4]:
import scipy as sp
from pathlib import Path 
from scipy.cluster.vq import whiten, kmeans, vq
import numpy as np


In [6]:
# Load the corpus as a list of raw lines, one record per line.
# The encoding is given explicitly: the platform default (e.g. cp1252 on
# Windows) can mis-decode non-ASCII characters or raise UnicodeDecodeError.
# NOTE(review): the file is assumed to be UTF-8 -- confirm against the dataset.
# strip() runs first so a trailing newline does not produce an empty record.
data = Path('SMSSpamCollection').read_text(encoding='utf-8').strip().split('\n')

In [28]:
# One row per message, two integer columns: [label, digit count].
# The array is left uninitialised here; the fill loop writes every row.
digit_counts = np.empty(shape=(len(data), 2), dtype=int)

In [35]:
# Fill one row per message: column 0 is the label (0 for 'ham', 1 for anything
# else), column 1 is the number of digit characters in the message text.
for i, line in enumerate(data):
    # Split on the first tab only, so a tab inside the message text cannot
    # break the two-way unpacking with a ValueError.
    case, message = line.split('\t', 1)
    num_digits = sum(c.isdigit() for c in message)
    digit_counts[i, 0] = 0 if case == 'ham' else 1
    digit_counts[i, 1] = num_digits

In [37]:
# Print the [label, digit count] table.
# NOTE(review): matprint is defined in a later cell (In [11]); on a fresh
# kernel that cell has to run before this one.
matprint(digit_counts)

0   0  
0   0  
1  25  
0   0  
0   0  
1   4  
0   0  
0   1  
1  19  
1  13  
0   0  
1  22  
1  20  
0   0  
0   0  
1   0  
0   0  
0   1  
0   0  
1  23  
0   0  
0   1  
0   0  
0   1  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   2  
0   0  
0   0  
1   1  
0   2  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
1  22  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
1   0  
0   0  
1  24  
0   0  
0   0  
0   0  
0   1  
0   0  
0   0  
0   0  
0   0  
1  15  
0   0  
1  21  
1   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   1  
0   0  
0   0  
0   0  
0   0  
0   2  
0   1  
1  22  
0   3  
1  18  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   1  
0   1  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   0  
0   1  
1  22  
0   0  
0   1  
1  22  
0   0  
0   0  
1  34  
1  26  
0   0  
1  22  
0   2  


In [11]:
def matprint(mat, fmt="g"):
    """Print a 2-D array as a table whose columns are padded to equal width.

    Parameters
    ----------
    mat : 2-D array exposing ``.T`` and iteration over rows (e.g. a NumPy array).
    fmt : format-spec suffix applied to every entry (default ``"g"``).

    Each cell is followed by two spaces and each row ends with a newline.
    An array with no rows prints nothing instead of raising ``ValueError``.
    """
    # Column width = longest formatted entry; default=0 covers empty columns.
    col_widths = [
        max((len(format(value, fmt)) for value in col), default=0)
        for col in mat.T
    ]
    for row in mat:
        cells = (
            format(value, str(width) + fmt) + "  "
            for value, width in zip(row, col_widths)
        )
        print("".join(cells))

In [38]:
# Tally how many messages share each distinct digit count (column 1).
# With return_counts=True, np.unique returns a (sorted values, counts) tuple.
unique_counts = np.unique(digit_counts[:, 1], return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 40, 41, 47]),
 array([4110,  486,  160,   78,   42,   39,   16,   14,   28,   17,   16,
          34,   30,   31,   37,   29,   35,   33,   41,   47,   18,   31,
          28,   36,   34,   16,   16,   13,   19,    9,    2,    6,    3,
           4,    3,    4,    1,    1,    4,    2,    1], dtype=int64))

In [17]:
# Build the [digit count, number of messages] table directly from digit_counts
# instead of from the previous value of unique_counts. Re-running this cell is
# then idempotent rather than transposing an already-transposed array.
unique_counts = np.column_stack(np.unique(digit_counts[:, 1], return_counts=True))

In [21]:
# Rescale each column by its standard deviation so both features carry
# comparable weight in the distance computation.
whitened_counts = whiten(unique_counts)
# k-means picks its initial centroids at random; a fixed seed keeps the
# codebook, and therefore the cluster codes, reproducible across runs.
codebook, _ = kmeans(whitened_counts, 3, seed=0)
# Assign every row of the table to its nearest centroid.
codes, _ = vq(whitened_counts, codebook)

In [42]:
# Display the code of the leftover ("unknown") cluster.
# NOTE(review): unknown_code is assigned in a later cell (In [22]); on a fresh
# kernel this cell raises NameError unless that cell has run first.
unknown_code

1

In [22]:
# Rows of unique_counts are ordered by digit count, so the first row (fewest
# digits) is taken as the "ham" cluster and the last row as the "spam" cluster.
ham_code = codes[0]
spam_code = codes[-1]
if ham_code == spam_code:
    # Two codes would be left over and one of them would be chosen arbitrarily.
    raise ValueError(
        "k-means placed the lowest and highest digit counts in the same cluster"
    )
# Whichever of the three cluster codes is left over is the "unknown" cluster.
unknown_code = (set(range(3)) - {ham_code, spam_code}).pop()

In [23]:
# Report the last row of each cluster (its highest digit count) as [digits, messages].
for cluster_label, cluster_code in (
    ("definitely ham:", ham_code),
    ("definitely spam:", spam_code),
    ("unknown:", unknown_code),
):
    print(cluster_label, unique_counts[codes == cluster_code][-1])

definitely ham: [   0 4110]
definitely spam: [47  1]
unknown: [20 18]


In [24]:
# Boolean masks over all messages, keyed on the digit count in column 1:
# no digits -> ham, more than 20 digits -> spam, 1..20 digits -> unknown.
digits = digit_counts[:, 1]
predicted_hams = np.equal(digits, 0)
predicted_spams = np.greater(digits, 20)
predicted_unknowns = (digits > 0) & (digits <= 20)

In [25]:
# Select the [label, digit count] rows that fall under each predicted class.
spam_cluster, ham_cluster, unk_cluster = (
    digit_counts[mask]
    for mask in (predicted_spams, predicted_hams, predicted_unknowns)
)

In [26]:
# For each predicted class, count the true labels (0 = ham, 1 = spam) it contains.
for cluster_name, cluster_rows in (
    ("hams:", ham_cluster),
    ("spams:", spam_cluster),
    ("unknowns:", unk_cluster),
):
    print(cluster_name, np.unique(cluster_rows[:, 0], return_counts=True))

hams: (array([0, 1]), array([4071,   39], dtype=int64))
spams: (array([0, 1]), array([  1, 232], dtype=int64))
unknowns: (array([0, 1]), array([755, 476], dtype=int64))
