In [26]:
import numpy as np
import pandas as pd
import math

In [3]:
def pairwise_dist(x, y):
    """
    Euclidean distance between every row of x and every row of y.

    Args:
        x: N x D numpy array
        y: M x D numpy array
    Return:
        dist: N x M array, where dist[i, j] is the euclidean distance between
        x[i, :] and y[j, :]
    """
    # Insert a length-1 axis into each operand so that subtraction broadcasts
    # (N, 1, D) against (1, M, D) and yields all N * M row differences at once.
    x_rows = x[:, np.newaxis, :]
    y_rows = y[np.newaxis, :, :]
    row_differences = x_rows - y_rows
    # Collapsing the trailing (feature) axis with the 2-norm leaves N x M.
    return np.linalg.norm(row_differences, axis=-1)

In [4]:
# Load the full city table from disk. Later cells split it into the
# descriptive columns listed in meta_columns and the remaining feature columns.
cities = pd.read_csv("data/gdata_standardized.csv")

In [17]:
# Descriptive (non-feature) columns of the city table. 'Unnamed: 0' is
# presumably the row index written by an earlier to_csv call - TODO confirm.
meta_columns = [
    'Unnamed: 0',
    'State Abbreviation',
    'City',
    'County Name',
    'State',
    'County',
    'lat',
    'lng',
]
# Select the descriptive columns and expose 'Unnamed: 0' under the name city_id.
# NOTE(review): the preview shows several "Auburn, AL" rows with different
# lat/lng values - looks like an upstream join duplicated rows on city name;
# verify the source data.
city_metadata = cities[meta_columns].rename(columns={'Unnamed: 0': 'city_id'})
city_metadata.head()

Unnamed: 0,city_id,State Abbreviation,City,County Name,State,County,lat,lng
0,0,AL,Arab,Marshall County,Alabama,Marshall County,34.3309,-86.4991
1,1,AL,Attalla,Etowah County,Alabama,Etowah County,34.005,-86.1039
2,2,AL,Auburn,Lee County,Alabama,Lee County,47.3039,-122.2108
3,3,AL,Auburn,Lee County,Alabama,Lee County,32.6087,-85.4903
4,4,AL,Auburn,Lee County,Alabama,Lee County,42.9338,-76.5685


In [38]:
# One-off export of the metadata table. Disabled: this branch never runs.
# NOTE(review): the file on disk has a leading 'id' column, which this call
# would not produce (it would write an unnamed index column) - the file was
# presumably written by a different version of this export; confirm.
if False:
    export_columns = ['city_id', 'City', 'State', 'County', 'State Abbreviation', 'lat', 'lng']
    city_metadata[export_columns].to_csv('data/city_metadata.csv')

# Preview the metadata file that id_to_city() reads.
print(pd.read_csv('data/city_metadata.csv').head())

   id  city_id     City    State           County State Abbreviation      lat  \
0   0        0     Arab  Alabama  Marshall County                 AL  34.3309   
1   1        1  Attalla  Alabama    Etowah County                 AL  34.0050   
2   2        2   Auburn  Alabama       Lee County                 AL  47.3039   
3   3        3   Auburn  Alabama       Lee County                 AL  32.6087   
4   4        4   Auburn  Alabama       Lee County                 AL  42.9338   

        lng  
0  -86.4991  
1  -86.1039  
2 -122.2108  
3  -85.4903  
4  -76.5685  


In [43]:
def id_to_city(city_id):
    """
        Fetch the metadata row stored at position city_id.

        Args:
            city_id: int, positional row index into data/city_metadata.csv
        Return:
            one-row DataFrame holding that row of the metadata file
    """
    # The file is re-read on every call; the lookup is by row position, not by
    # the value of any column.
    metadata = pd.read_csv('data/city_metadata.csv')
    # A list indexer keeps the result a DataFrame instead of a Series.
    return metadata.iloc[[city_id]]
#id_to_city(5)

In [6]:
# Keep only the columns that are not listed in meta_columns.
city_df = cities.drop(meta_columns, axis='columns')
city_df.head()

Unnamed: 0,Average Household Income 2020/2021,"Percent Change in Household Income, 1984-2019","Average Yearly Percent Change in Household Income, 1984-2019",MinWage,"Percent Change in Housing Prices, 2000-2019","Average Yearly Percent Change in Housing Prices, 2000-2019",Average House Price 2020/2021,Violent Crime Total,Property Crime Total,Property Crime Per Capita,...,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,-1.244475,0.682348,0.682348,-1.13437,-1.424617,-1.424617,-0.530148,-0.106718,-0.127941,1.084368,...,-0.594826,-0.742922,-1.335937,-1.240936,-0.374992,-0.154439,0.004864,0.328125,0.04123,-0.018375
1,-1.244475,0.682348,0.682348,-1.13437,-0.357814,-0.357814,-0.754485,-0.125265,-0.149551,1.143645,...,-0.596985,-1.132559,-0.880312,-0.555329,-0.708592,0.379059,-0.150118,-0.545845,0.117471,0.980909
2,-1.244475,0.682348,0.682348,-1.13437,-0.324323,-0.324323,-0.237935,-0.070466,0.023579,-0.302775,...,-0.45878,0.425989,-0.465186,-0.974311,-0.977773,-0.483073,1.150718,-1.026759,-0.416213,-0.200487
3,-1.244475,0.682348,0.682348,-1.13437,-0.324323,-0.324323,-0.237935,-0.070466,0.023579,-0.302775,...,-0.45878,0.425989,-0.465186,-0.974311,-0.977773,-0.483073,1.150718,-1.026759,-0.416213,-0.200487
4,-1.244475,0.682348,0.682348,-1.13437,-0.324323,-0.324323,-0.237935,-0.070466,0.023579,-0.302775,...,-0.45878,0.425989,-0.465186,-0.974311,-0.977773,-0.483073,1.150718,-1.026759,-0.416213,-0.200487


In [7]:
# (position, column name) pairs for every column of city_df.
city_columns = [(position, name) for position, name in enumerate(city_df.columns)]
len(city_columns)

44

In [8]:
# Array copy of city_df; the distance computations below work on this array.
city_data = np.array(city_df)
# Number of rows, followed by a peek at the first row.
print(city_data.shape[0])
city_data[:1]

16584


array([[-1.24447506,  0.682348  ,  0.682348  , -1.13437038, -1.42461722,
        -1.42461722, -0.53014839, -0.1067177 , -0.12794116,  1.08436765,
         0.59758624,  1.07033325, -0.43784239, -0.43645572, -0.43915686,
        -0.46518461, -0.45130875, -0.08653261,  0.68502292, -0.66791656,
        -0.14496326, -0.67293528, -0.30617702, -1.1345026 , -0.9849128 ,
         1.12373331,  1.55060446, -0.86210883, -0.93632036, -0.71552672,
         1.0156942 ,  1.49140623,  0.94758088,  0.77191128, -0.59482555,
        -0.74292187, -1.33593746, -1.24093599, -0.37499169, -0.15443879,
         0.00486432,  0.32812539,  0.04123045, -0.01837476]])

In [35]:
def get_nn(city_id, n = 5, rows_per_file = 100):
    """
        Look up the precomputed nearest neighbours of one city.

        Reads the block file data/city_distances/city_<start>_<end>.npy whose
        row range covers city_id and returns the first n entries of that
        city's row.

        Args:
            city_id: int, row index of the query city
            n: int, number of entries to return
            rows_per_file: int, number of cities stored per block file; it must
                match the value used when the files were written (default 100)
        Return:
            nearest_neighbors: (n,) ndarray, where nearest_neighbors[i] is the
            city_id of the i'th most similar city. The files rank every city,
            including the query city itself, so the query city (and any exact
            duplicates of it) appears among the first entries. Fewer than n
            values are returned when the stored row is shorter than n.
    """
    # Block files are named after the nominal [start, end) row range they hold.
    this_index = city_id % rows_per_file
    start_index = city_id - this_index
    end_index = start_index + rows_per_file
    address = 'data/city_distances/city_' + str(start_index) + '_' + str(end_index) + '.npy'
    # NOTE(review): allow_pickle=True lets a tampered .npy file execute
    # arbitrary code when loaded. It is kept so existing files keep loading;
    # switch to False once all block files are confirmed to be plain integer
    # arrays.
    dta = np.load(address, allow_pickle = True)
    return dta[this_index, :n].astype(int)
#get_nn(15780, 5)

array([15781, 15780, 15784, 15782, 15783])

In [31]:
# Rebuild the nearest-neighbour lookup files that get_nn() reads from
# data/city_distances/. This is the slow step of the notebook: set the guard
# to False to reuse the .npy files that are already on disk.
if True:
    rows = city_data.shape[0]
    rows_per_file = 100  # must match the block size get_nn() assumes
    # Walk the rows in blocks of rows_per_file. The last block may be shorter:
    # slicing clamps it to the rows that exist, while its file name still uses
    # the nominal end index, because that is the name get_nn() rebuilds from a
    # city_id. Each block is sliced with its own start/end so the tail file
    # holds the tail rows.
    for start_index in range(0, rows, rows_per_file):
        end_index = start_index + rows_per_file
        address = 'data/city_distances/city_' + str(start_index) + '_' + str(end_index)
        one_city = city_data[start_index:end_index, :]
        d = pairwise_dist(one_city, city_data)
        # Row k ranks every city index from closest to farthest from city
        # start_index + k.
        smallest_to_largest = np.argsort(d)
        # np.save appends the '.npy' extension to the path.
        np.save(address, smallest_to_largest, allow_pickle = True)
        # Drop the large intermediates before the next block is built.
        del address, one_city, d, smallest_to_largest

In [28]:
# Number of rows in city_data.
len(city_data)

16584