In [1]:
import csv
import torch
import numpy as np
import matplotlib.pyplot as plt
from kmeans_pytorch import kmeans, kmeans_predict

In [2]:
def to_int(value):
    try:
        return int(value)
    except ValueError:
        return -1
def to_float(value):
    try:
        return float(value)
    except ValueError:
        return -1

In [3]:
batch_size = 10
n_clusters = 9
def read(path):
    school_list = []
    with open(path, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            school = {}
            school['name'] = row['School Name']
            school['lat'] = to_float(row['Latitude'])
            school['lon'] = to_float(row['Longitude'])
            school['num_student'] = to_int(row['Total Students'])
            school['atar'] = to_float(row['Median ATAR'])
            school['icsea'] = to_int(row['ICSEA']) 
            school_list.append(school)
    return school_list

In [4]:
def process(school_list):
    school_ndarray = np.array(school_list)
    processed_list = []
    for i in range(len(school_ndarray)):
        school = school_ndarray[i]
        lat_rad = np.radians(school['lat'])
        lon_rad = np.radians(school['lon'])
        lat_sin = np.sin(lat_rad)
        lat_cos = np.cos(lat_rad)
        lon_sin = np.sin(lon_rad)
        lon_cos = np.cos(lon_rad)
        total_students = school['num_student']
        icsea = school['icsea']
        atar_percent = school['atar']
        data = [lat_sin, lat_cos, lon_sin, lon_cos, total_students, icsea, atar_percent];
        processed_list.append(data)
    processed_array = np.array(processed_list)
    return processed_array;

In [5]:
def fill_missing_with_mean(data):
    # Convert to numpy array for easier manipulation
    data = np.array(data, dtype=float)
    
    # Compute means ignoring -1
    means = np.where(data != -1, data, np.nan)
    column_means = np.nanmean(means, axis=0)
    
    # Fill -1 with column means
    for i in range(data.shape[1]):
        data[:, i] = np.where(data[:, i] == -1, column_means[i], data[:, i])
    
    return data

In [6]:
def normalize(processed_array, means, std_dev, miss_value_handling):
    weights = np.array([1, 1, 1, 1, 2, 4, 4])
    processed_array = (processed_array - means) / std_dev
    print(means)
    print(std_dev)
    processed_array = weights * processed_array
    if miss_value_handling == "means_filling":
        processed_array = np.where(processed_array == -1, means, processed_array)
    else:
        rows_to_remove = np.any(processed_array == -1, axis=1)
        processed_array = processed_array[~rows_to_remove]
        # school_ndarray = school_ndarray[~rows_to_remove]
    return processed_array

In [7]:
school_list = read('/Users/pt/Projects/Github/aibuild/hello-aibuild/research-test/wa_secondary_schools.csv')
processed_array = process(school_list)
means = np.where(processed_array != -1, processed_array, np.nan)
means = np.nanmean(means, axis=0)
std_dev = np.std(processed_array)
processed_array = normalize(processed_array, means, std_dev, 'means_filling')
processed_tensor = torch.tensor(processed_array)

[-5.11692093e-01  8.56222924e-01  8.92163665e-01 -4.49097068e-01
  7.05739812e+02  9.99557252e+02  7.75741007e+01]
427.4300898319394


In [8]:
labels, cluster_centers = kmeans(
    X=processed_tensor, num_clusters=n_clusters, distance='euclidean'
)

running k-means on cpu..


[running kmeans]: 32it [00:00, 591.31it/s, center_shift=0.000000, iteration=32, tol=0.000100]


In [21]:
training_results = {
    'means': means,
    'std_dev': std_dev,
    'labels': labels,
    'centers': cluster_centers,
}

# print(training_results)

school = {
    'name': 'null',
    'lat': -35,
    'lon': 117,
    'num_student': 977,
    'atar': 2000,
    'icsea': 1000,
}
schools = [school, school]
predict_array = process(schools)
predict_array = normalize(
    predict_array, 
    training_results['means'], 
    training_results['std_dev'], 
    'means_filling'
)

predict_tensor = torch.tensor(predict_array)

cluster_ids_y = kmeans_predict(predict_tensor, cluster_centers, 'euclidean')

print(cluster_ids_y)
for i in range(labels.shape[0]):
    if cluster_ids_y[0] == labels[i]:
        print(labels[i], school_list[i])

[-5.11692093e-01  8.56222924e-01  8.92163665e-01 -4.49097068e-01
  7.05739812e+02  9.99557252e+02  7.75741007e+01]
427.4300898319394
predicting on cpu..
tensor([0, 0])
tensor(0) {'name': 'ALBANY SENIOR HIGH SCHOOL', 'lat': -35.02044765, 'lon': 117.8917733, 'num_student': 977, 'atar': 72.8, 'icsea': 1010}
tensor(0) {'name': 'BELMONT CITY COLLEGE', 'lat': -31.95889136, 'lon': 115.9344344, 'num_student': 908, 'atar': 80.85, 'icsea': 968}
tensor(0) {'name': 'BELRIDGE SECONDARY COLLEGE', 'lat': -31.77038385, 'lon': 115.7631759, 'num_student': 1015, 'atar': 72.55, 'icsea': 1015}
tensor(0) {'name': 'BULLSBROOK COLLEGE', 'lat': -31.66171848, 'lon': 116.033179, 'num_student': 961, 'atar': -1, 'icsea': 979}
tensor(0) {'name': 'BUNBURY CATHOLIC COLLEGE', 'lat': -33.34342203, 'lon': 115.6614239, 'num_student': 826, 'atar': 78.95, 'icsea': 1035}
tensor(0) {'name': 'BUNBURY SENIOR HIGH SCHOOL', 'lat': -33.32746917, 'lon': 115.6326077, 'num_student': 979, 'atar': 77.3, 'icsea': 1019}
tensor(0) {'name