In [1]:
import torch
import pickle
import numpy as np
from kmeans_pytorch import kmeans
from utils import read, to_int, to_float, process, normalize

In [2]:
# Number of clusters requested from k-means (passed as `num_clusters` in the
# clustering cell).
n_clusters = 5

In [3]:
# Read the raw school records and turn them into a numeric feature matrix
# using the project helpers from utils.
school_list = read('./wa_secondary_schools.csv')
processed_array = process(school_list)

# Column-wise means over observed entries only: every -1 is masked to NaN
# before averaging (-1 appears to be the missing-value sentinel produced by
# process() -- TODO confirm in utils).
observed = np.where(processed_array == -1, np.nan, processed_array)
means = np.nanmean(observed, axis=0)

# NOTE(review): np.std without `axis` yields ONE scalar over all elements of
# the matrix, and the -1 entries are still included here (unlike the means
# above). Whether normalize() expects this global scalar or a per-column,
# sentinel-free std should be confirmed against utils.normalize.
std_dev = np.std(processed_array)

# Normalise with the statistics above using the 'means_filling' strategy,
# then wrap the result in a tensor for kmeans_pytorch.
processed_array = normalize(processed_array, means, std_dev, 'means_filling')
processed_tensor = torch.tensor(processed_array)
processed_tensor

tensor([[-1.4547e-04, -8.7209e-05, -1.9491e-05,  ...,  1.2693e+00,
          9.7726e-02, -4.4677e-02],
        [-3.0860e-05, -1.1811e-05,  2.0673e-05,  ..., -1.8985e+00,
          3.3168e-01, -7.3532e-01],
        [-4.4356e-05, -2.0197e-05,  1.7887e-05,  ...,  3.0333e+00,
          1.4453e+00,  8.2127e-02],
        ...,
        [-3.8911e-05, -1.6798e-05,  1.5392e-05,  ..., -2.8998e+00,
         -9.3635e+00, -7.3532e-01],
        [-3.1929e-05, -1.2470e-05,  2.0502e-05,  ..., -2.2728e+00,
         -9.3635e+00, -7.3532e-01],
        [-4.4282e-05, -2.0150e-05,  1.5397e-05,  ..., -1.9500e+00,
         -1.2592e+00, -7.3532e-01]], dtype=torch.float64)

In [4]:
# Seed the RNGs right before clustering so that Restart & Run All produces the
# same labels/centers every time and the pickled model is reproducible.
# kmeans_pytorch draws its initial centers at random (presumably via NumPy's
# global RNG -- verify against the installed version); torch is seeded as well
# in case the library samples through torch.
KMEANS_SEED = 42
np.random.seed(KMEANS_SEED)
torch.manual_seed(KMEANS_SEED)

# Cluster the normalised feature tensor into `n_clusters` groups using
# Euclidean distance. Returns one cluster id per row and the cluster centers.
labels, cluster_centers = kmeans(
    X=processed_tensor, num_clusters=n_clusters, distance='euclidean'
)

# Bundle everything needed to reuse the model later: the normalisation
# statistics computed from the training data plus the clustering result.
training_results = {
    'means': means,
    'std_dev': std_dev,
    'labels': labels,
    'centers': cluster_centers,
}

training_results

running k-means on cpu..


[running kmeans]: 8it [00:00, 1016.28it/s, center_shift=0.000000, iteration=8, tol=0.000100]


{'means': array([-5.11692093e-01,  8.56222924e-01,  8.92163665e-01, -4.49097068e-01,
         7.05739812e+02,  9.99557252e+02,  7.75741007e+01]),
 'std_dev': 427.4300898319394,
 'labels': tensor([2, 1, 3, 0, 4, 3, 2, 2, 0, 1, 3, 2, 2, 4, 0, 3, 0, 4, 2, 3, 2, 3, 1, 1,
         1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 0, 2, 2, 4, 3, 0, 3, 2, 2, 3, 4, 0, 2, 1,
         1, 2, 1, 0, 1, 4, 4, 0, 1, 2, 1, 4, 1, 1, 1, 2, 4, 1, 2, 0, 0, 3, 1, 0,
         1, 1, 2, 1, 2, 2, 1, 1, 0, 1, 1, 1, 1, 4, 0, 1, 2, 2, 0, 2, 3, 3, 2, 1,
         2, 1, 1, 1, 2, 1, 4, 1, 0, 3, 0, 2, 2, 0, 1, 1, 1, 1, 2, 2, 2, 3, 0, 4,
         0, 3, 2, 3, 1, 0, 2, 2, 1, 1, 3, 1, 1, 1, 3, 2, 1, 0, 1, 4, 3, 2, 4, 2,
         2, 4, 0, 1, 3, 1, 2, 1, 0, 3, 1, 1, 1, 3, 3, 3, 1, 0, 2, 1, 3, 1, 1, 3,
         1, 3, 2, 0, 3, 3, 1, 3, 0, 2, 1, 3, 0, 4, 2, 2, 3, 2, 2, 1, 3, 2, 4, 2,
         3, 2, 1, 3, 2, 4, 0, 1, 1, 2, 3, 1, 1, 2, 4, 1, 2, 1, 1, 2, 1, 2, 1, 1,
         3, 0, 0, 0, 1, 3, 3, 0, 3, 2, 2, 3, 2, 1, 3, 3, 2, 1, 3, 1, 0, 2, 1, 0,
   

In [5]:
# Persist the normalisation statistics and clustering result to disk so they
# can be reloaded without re-running the notebook.
with open('kmeans_model.pkl', 'wb') as model_file:
    pickle.dump(training_results, model_file)