In [1]:
import torch
import pickle
import numpy as np
from kmeans_pytorch import kmeans
from utils import read, to_int, to_float, process, normalize

In [2]:
# Number of clusters to fit; passed to kmeans() as `num_clusters` in the training cell.
n_clusters = 5

In [3]:
# Load the raw school records and convert them to a numeric array
# (both helpers come from the project's utils module).
school_list = read('./wa_secondary_schools.csv')
processed_array = process(school_list)

# Per-column means, ignoring every entry equal to -1: those entries are
# swapped for NaN so that nanmean skips them.
means = np.nanmean(
    np.where(processed_array != -1, processed_array, np.nan),
    axis=0,
)

# One scalar standard deviation over the whole flattened array (no axis given).
# NOTE(review): unlike `means`, this is not per-column and still includes the
# -1 entries -- confirm this is intended before relying on the scaling.
std_dev = np.std(processed_array)

# Scale with the statistics above; 'means_filling' selects a mode inside
# utils.normalize (presumably how the -1 entries are handled -- TODO confirm).
processed_array = normalize(processed_array, means, std_dev, 'means_filling')

# Copy the normalised array into a torch tensor for clustering and display it.
processed_tensor = torch.tensor(processed_array)
processed_tensor

[[-5.73868738e-01  8.18947295e-01  8.83832808e-01 ...  9.77000000e+02
   1.01000000e+03  7.28000000e+01]
 [-5.24882420e-01  8.51174744e-01  9.00999968e-01 ...  3.00000000e+02
   1.03500000e+03 -1.00000000e+00]
 [-5.30651283e-01  8.47590240e-01  8.99809034e-01 ...  1.35400000e+03
   1.15400000e+03  8.63500000e+01]
 ...
 [-5.28323686e-01  8.49043040e-01  8.98742801e-01 ...  8.60000000e+01
  -1.00000000e+00 -1.00000000e+00]
 [-5.25339485e-01  8.50892722e-01  9.00926725e-01 ...  2.20000000e+02
  -1.00000000e+00 -1.00000000e+00]
 [-5.30619586e-01  8.47610084e-01  8.98744602e-01 ...  2.89000000e+02
   8.65000000e+02 -1.00000000e+00]]


tensor([[-1.4547e-04, -8.7209e-05, -1.9491e-05,  ...,  1.2693e+00,
          4.8863e-02, -2.2339e-02],
        [-3.0860e-05, -1.1811e-05,  2.0673e-05,  ..., -1.8985e+00,
          1.6584e-01, -3.6766e-01],
        [-4.4356e-05, -2.0197e-05,  1.7887e-05,  ...,  3.0333e+00,
          7.2266e-01,  4.1064e-02],
        ...,
        [-3.8911e-05, -1.6798e-05,  1.5392e-05,  ..., -2.8998e+00,
         -4.6817e+00, -3.6766e-01],
        [-3.1929e-05, -1.2470e-05,  2.0502e-05,  ..., -2.2728e+00,
         -4.6817e+00, -3.6766e-01],
        [-4.4282e-05, -2.0150e-05,  1.5397e-05,  ..., -1.9500e+00,
         -6.2961e-01, -3.6766e-01]], dtype=torch.float64)

In [4]:
# Fix the RNG state so that Restart & Run All reproduces the same clustering.
# k-means starts from randomly chosen centres; kmeans_pytorch is understood to
# draw them through NumPy's global RNG (verify against the installed version),
# and torch is seeded as well in case any torch-side randomness is involved.
random_seed = 0
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Cluster the normalised feature tensor into `n_clusters` groups using cosine
# distance. Returns one label per row and the fitted cluster centres.
labels, cluster_centers = kmeans(
    X=processed_tensor, num_clusters=n_clusters, distance='cosine'
)

# Bundle everything needed to reuse the model later: the normalisation
# statistics computed during preprocessing plus the clustering result.
training_results = {
    'means': means,
    'std_dev': std_dev,
    'labels': labels,
    'centers': cluster_centers,
}

training_results

running k-means on cpu..


[running kmeans]: 7it [00:00, 398.04it/s, center_shift=0.000000, iteration=7, tol=0.000100]


{'means': array([-5.11692093e-01,  8.56222924e-01,  8.92163665e-01, -4.49097068e-01,
         7.05739812e+02,  9.99557252e+02,  7.75741007e+01]),
 'std_dev': 427.4300898319394,
 'labels': tensor([0, 2, 1, 3, 0, 0, 3, 3, 3, 2, 0, 2, 2, 0, 3, 0, 3, 0, 4, 0, 0, 0, 2, 2,
         2, 2, 2, 2, 2, 0, 2, 0, 2, 1, 4, 0, 4, 0, 0, 4, 0, 3, 0, 0, 0, 4, 2, 2,
         4, 3, 2, 3, 2, 0, 1, 4, 2, 1, 2, 0, 2, 2, 2, 2, 0, 2, 1, 3, 4, 0, 2, 3,
         2, 2, 2, 2, 0, 0, 2, 4, 4, 2, 2, 2, 2, 0, 3, 2, 0, 2, 4, 2, 0, 0, 2, 2,
         0, 2, 2, 4, 2, 2, 0, 2, 3, 1, 4, 2, 0, 3, 2, 2, 2, 2, 0, 2, 1, 0, 3, 1,
         4, 0, 0, 0, 2, 4, 0, 1, 2, 2, 0, 2, 2, 2, 1, 0, 2, 4, 2, 0, 0, 2, 0, 0,
         1, 1, 3, 2, 0, 2, 3, 2, 4, 0, 2, 2, 2, 0, 1, 0, 4, 3, 1, 2, 0, 2, 2, 0,
         2, 0, 2, 4, 1, 1, 2, 0, 4, 0, 4, 0, 3, 0, 2, 2, 0, 1, 1, 2, 0, 1, 0, 2,
         1, 1, 2, 0, 4, 0, 4, 2, 2, 2, 0, 2, 2, 2, 0, 2, 4, 2, 2, 0, 2, 4, 2, 2,
         0, 4, 4, 4, 2, 1, 1, 4, 1, 2, 1, 0, 3, 2, 1, 1, 1, 2, 0, 2, 4, 1, 2, 3,
   

In [5]:
# Persist the training_results dict (normalisation statistics, labels and
# cluster centres) to disk so it can be reloaded without retraining.
with open('kmeans_model.pkl', mode='wb') as model_file:
    pickle.dump(training_results, model_file)