In [3]:
import numpy as np
import pandas as pd
import math

In [4]:
def pairwise_dist(x, y):  # [5 pts]
    """
        Args:
            x: N x D numpy array
            y: M x D numpy array
        Return:
                dist: N x M array, where dist2[i, j] is the euclidean distance between
                x[i, :] and y[j, :]
    """
    #raise NotImplementedError
    return np.linalg.norm(x[:, None, :] - y[None, :, :], axis=-1)

In [5]:
cities = pd.read_csv("data/data_standardized.csv")

In [6]:
meta_columns = ['Unnamed: 0', 'State Abbreviation', 'City', 'County Name', 'State', 'County']
city_metadata = cities[meta_columns]
city_metadata = city_metadata.rename(columns = {'Unnamed: 0':'city_id'})
city_metadata.head()

Unnamed: 0,city_id,State Abbreviation,City,County Name,State,County
0,0,AL,Arab,Marshall County,Alabama,Marshall County
1,1,AL,Attalla,Etowah County,Alabama,Etowah County
2,2,AL,Auburn,Lee County,Alabama,Lee County
3,3,AL,Bay Minette,Baldwin County,Alabama,Baldwin County
4,4,AL,Chickasaw,Mobile County,Alabama,Mobile County


In [7]:
if False:
    city_metadata[['city_id', 'City', 'State', 'County', 'State Abbreviation']].to_csv('data/city_metadata.csv', 
        index = False)
if True:
    print(pd.read_csv('data/city_metadata.csv').head())

   city_id         City    State           County State Abbreviation
0        0         Arab  Alabama  Marshall County                 AL
1        1      Attalla  Alabama    Etowah County                 AL
2        2       Auburn  Alabama       Lee County                 AL
3        3  Bay Minette  Alabama   Baldwin County                 AL
4        4    Chickasaw  Alabama    Mobile County                 AL


In [8]:
def id_to_city(city_id):
    return pd.read_csv('data/city_metadata.csv').iloc[[city_id]]
id_to_city(5)

Unnamed: 0,city_id,City,State,County,State Abbreviation
5,5,Citronelle,Alabama,Mobile County,AL


In [9]:
city_df = cities.drop(columns = meta_columns)
city_df.head()

Unnamed: 0,Average Household Income 2020/2021,"Percent Change in Household Income, 1984-2019","Average Yearly Percent Change in Household Income, 1984-2019",MinWage,"Percent Change in Housing Prices, 2000-2019","Average Yearly Percent Change in Housing Prices, 2000-2019",Average House Price 2020/2021,Violent Crime Total,Property Crime Total,Property Crime Per Capita,...,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,-1.244475,0.682348,0.682348,-1.13437,-1.424617,-1.424617,-0.530148,-0.106718,-0.127941,1.084368,...,-0.594826,-0.742922,-1.335937,-1.240936,-0.374992,-0.154439,0.004864,0.328125,0.04123,-0.018375
1,-1.244475,0.682348,0.682348,-1.13437,-0.357814,-0.357814,-0.754485,-0.125265,-0.149551,1.143645,...,-0.596985,-1.132559,-0.880312,-0.555329,-0.708592,0.379059,-0.150118,-0.545845,0.117471,0.980909
2,-1.244475,0.682348,0.682348,-1.13437,-0.324323,-0.324323,-0.237935,-0.070466,0.023579,-0.302775,...,-0.45878,0.425989,-0.465186,-0.974311,-0.977773,-0.483073,1.150718,-1.026759,-0.416213,-0.200487
3,-1.244475,0.682348,0.682348,-1.13437,-0.168164,-0.168164,-0.518289,-0.106718,-0.179453,-0.5354,...,-0.640174,-1.049909,-0.384185,0.619997,0.255398,0.152856,-0.256827,0.184776,-0.797416,-0.256522
4,-1.244475,0.682348,0.682348,-1.13437,-1.612573,-1.612573,-0.756796,-0.119364,-0.17493,0.081101,...,-0.411272,-0.630754,-0.617061,-0.582536,-0.448614,0.639406,-0.254287,-0.943524,0.269952,1.209716


In [10]:
city_columns = list(enumerate(city_df.columns))
len(city_columns)

44

In [11]:
city_np = np.array(city_df)
city_np.shape

(4155, 44)

In [20]:
cities_dist = None
cities_nn = None
if True:
    cities_dist = pairwise_dist(city_np, city_np) #.tofile('data/pairwise_city_dist.csv', sep = ',')
    #np.savetxt("data/city_pairwise_dist.csv", cities_dist, delimiter=",")
    temp = pd.DataFrame(cities_dist)
    temp.to_csv("data/city_distances.csv", index = False)
    cities_nn = pd.DataFrame(np.argsort(cities_dist))
    #np.savetxt("data/city_nn_ranked.csv", cities_nn, delimiter=",")
    cities_nn.to_csv("data/city_nn.csv", index = False)
    cities_dist = temp
    del temp
else:
    cities_dist = np.array(pd.read_csv("data/city_distances.csv"))
    cities_nn = np.array(pd.read_csv("data/city_nn.csv"))

In [21]:
np.sum(cities_nn, axis = 1)

0       8629935
1       8629935
2       8629935
3       8629935
4       8629935
         ...   
4150    8629935
4151    8629935
4152    8629935
4153    8629935
4154    8629935
Length: 4155, dtype: int64

In [22]:
def get_nn(city_id, n = 5):
    """
        Args:
            city_id: int
            n: int
        Return:
            nearest_neighbors: (n,) ndarray, where nearest_neighbors[i] is the city_id of the i'th most similar city
    """

    #Don't include the first column since it will always be itself (distances between a city and itself is 0 miles)
    dta = cities_nn.iloc[city_id, 1:n]

    return dta
    
get_nn(0, 5)

1    1344
2    2172
3    2089
4      15
Name: 0, dtype: int64

In [30]:
cities_nn.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4145,4146,4147,4148,4149,4150,4151,4152,4153,4154
0,0,1344,2172,2089,15,25,2154,3529,20,1320,...,399,2843,225,743,499,507,1168,3709,395,2733
1,1,3369,2149,2193,3346,3357,2156,3380,3340,2182,...,399,2843,743,499,225,507,1168,3709,395,2733
2,2,18,17,3441,1278,3590,3588,1371,3523,3572,...,359,743,499,225,2843,507,1168,3709,395,2733
3,3,12,11,8,19,3543,3544,3569,3535,3478,...,359,225,499,743,507,2843,1168,3709,395,2733
4,4,23,6,5,24,9,22,3413,3481,3442,...,399,743,2843,499,225,507,1168,3709,395,2733


In [35]:
city_metadata.columns

Index(['city_id', 'State Abbreviation', 'City', 'County Name', 'State',
       'County'],
      dtype='object')

In [38]:
nearest_neighbors = pd.concat([city_metadata, cities_nn.iloc[:, 1:11]], axis = 1)

nearest_neighbors['city_name'] = nearest_neighbors.apply(lambda x: x['City'] + ', ' + x['County'], axis = 1)

nearest_neighbors.to_csv('data/nearest_neighbors.csv', index = False)

In [42]:
# pd.DataFrame(pd.unique(city_metadata['State'])).to_csv('data/cities.csv', index = False)