### This file computes the distance between each urban cluster (loaded from 'urban_clusters_2022.txt') and each house. The operation is performed for both the training set and the test set. Once all the distances are computed, we create two new CSV files (one for the training set and one for the test set) containing the id of each house and a binary outcome. The threshold is chosen arbitrarily: the outcome is 1 if a house is located less than 10 km from an urban cluster and 0 otherwise.

### Unfortunately, given the size of the cluster file (3600 clusters), the number of computations is too high. For the training set alone (20000 observations), the loop has to run more than 60'000'000 times. We tried several times, but our computers crashed after roughly 2'000'000 computations (memory usage was skyrocketing). We could have used a paid cloud platform, but we did not.


In [1]:
import pandas as pd
import numpy as np
from geopy import distance

In [3]:
# Read both house datasets, using each file's 'Uniq Id' column as the index.
train, test = (
    pd.read_csv(csv_path, index_col='Uniq Id')
    for csv_path in ('train.csv', 'test.csv')
)

In [4]:
# Load the tab-separated urban cluster table.
urban_clusters = pd.read_csv('urban_clusters_2022.txt', delimiter='\t')

In [5]:
# Display the loaded cluster table for a quick visual check.
urban_clusters

Unnamed: 0,GEOID,NAME,UATYPE,ALAND,AWATER,ALAND_SQMI,AWATER_SQMI,INTPTLAT,INTPTLONG
0,37,"Abbeville, LA Urban Cluster",C,29057340,426405,11.219,0.165,29.967156,-92.095966
1,64,"Abbeville, SC Urban Cluster",C,11271136,19786,4.352,0.008,34.179273,-82.379776
2,91,"Abbotsford, WI Urban Cluster",C,5426586,13221,2.095,0.005,44.948612,-90.315875
3,118,"Aberdeen, MS Urban Cluster",C,7416338,52820,2.863,0.020,33.824742,-88.554591
4,145,"Aberdeen, SD Urban Cluster",C,33032902,120864,12.754,0.047,45.463186,-98.471033
...,...,...,...,...,...,...,...,...,...
3596,98101,"Zapata--Medina, TX Urban Cluster",C,13451264,0,5.194,0.000,26.889081,-99.266192
3597,98182,"Zephyrhills, FL Urbanized Area",U,112593842,1615599,43.473,0.624,28.285373,-82.198969
3598,98209,"Zimmerman, MN Urban Cluster",C,24456008,2495147,9.443,0.963,45.455850,-93.606705
3599,98236,"Zumbrota, MN Urban Cluster",C,4829469,0,1.865,0.000,44.292793,-92.670931


In [6]:
# Pair each house's coordinates into a (latitude, longitude) tuple column.
train['lat_long'] = list(zip(train['Latitude'], train['Longitude']))
test['lat_long'] = list(zip(test['Latitude'], test['Longitude']))

In [7]:
# List the column labels; the output shows that the INTPTLONG label carries trailing whitespace.
urban_clusters.columns

Index(['GEOID', 'NAME', 'UATYPE', 'ALAND', 'AWATER', 'ALAND_SQMI',
       'AWATER_SQMI', 'INTPTLAT',
       'INTPTLONG                                                                                                                      '],
      dtype='object')

In [8]:
# Pair each cluster's coordinates into a (latitude, longitude) tuple column.
# The file header pads the 'INTPTLONG' label with trailing whitespace, so the
# longitude column is looked up on a copy whose labels have been stripped
# rather than by a literal that must reproduce the exact amount of padding.
urban_clusters['lat_long'] = list(zip(
    urban_clusters['INTPTLAT'],
    urban_clusters.rename(columns=str.strip)['INTPTLONG'],
))

In [9]:
# Display the table again to check the 'lat_long' column.
urban_clusters

Unnamed: 0,GEOID,NAME,UATYPE,ALAND,AWATER,ALAND_SQMI,AWATER_SQMI,INTPTLAT,INTPTLONG,lat_long
0,37,"Abbeville, LA Urban Cluster",C,29057340,426405,11.219,0.165,29.967156,-92.095966,"(29.967156, -92.095966)"
1,64,"Abbeville, SC Urban Cluster",C,11271136,19786,4.352,0.008,34.179273,-82.379776,"(34.179273, -82.379776)"
2,91,"Abbotsford, WI Urban Cluster",C,5426586,13221,2.095,0.005,44.948612,-90.315875,"(44.948612, -90.315875)"
3,118,"Aberdeen, MS Urban Cluster",C,7416338,52820,2.863,0.020,33.824742,-88.554591,"(33.824742, -88.554591)"
4,145,"Aberdeen, SD Urban Cluster",C,33032902,120864,12.754,0.047,45.463186,-98.471033,"(45.463186, -98.471033)"
...,...,...,...,...,...,...,...,...,...,...
3596,98101,"Zapata--Medina, TX Urban Cluster",C,13451264,0,5.194,0.000,26.889081,-99.266192,"(26.889081, -99.266192)"
3597,98182,"Zephyrhills, FL Urbanized Area",U,112593842,1615599,43.473,0.624,28.285373,-82.198969,"(28.285373, -82.198969)"
3598,98209,"Zimmerman, MN Urban Cluster",C,24456008,2495147,9.443,0.963,45.455850,-93.606705,"(45.45585, -93.606705)"
3599,98236,"Zumbrota, MN Urban Cluster",C,4829469,0,1.865,0.000,44.292793,-92.670931,"(44.292793, -92.670931)"


In [10]:
# Re-read the raw inputs for the cross join; here 'Uniq Id' stays a regular
# column instead of becoming the index.
urban_clusters_test = pd.read_csv('urban_clusters_2022.txt', delimiter='\t')
train_test = pd.read_csv('train.csv')

In [11]:
# Keep only the identifier and coordinates of each house.
df1 = train_test[['Uniq Id', 'Latitude', 'Longitude']]
# Same for the clusters. The file header pads the 'INTPTLONG' label with
# trailing whitespace; strip the labels before selecting so the column is
# named plain 'INTPTLONG' from here on (a lookup by that plain name fails
# with KeyError while the label is still padded).
df2 = urban_clusters_test.rename(columns=str.strip)[['GEOID', 'INTPTLAT', 'INTPTLONG']]

In [12]:
# Give both frames a common 'id' label for their identifier column.
df1 = df1.rename({'Uniq Id': 'id'}, axis='columns')
df2 = df2.rename({'GEOID': 'id'}, axis='columns')

In [13]:
# Cross join: pair every house with every cluster (len(df1) * len(df2) rows).
# Both frames have an 'id' column; the cluster side gets the '_2' suffix.
dfm = df1.merge(df2, how='cross', suffixes=('', '_2'))

In [14]:
# Preview the first five house/cluster pairs of the cross join.
dfm.head(5)

Unnamed: 0,id,Latitude,Longitude,id_2,INTPTLAT,INTPTLONG
0,d1051058987a9318579c5f4800b11ec9,29.775803,-95.56353,37,29.967156,-92.095966
1,d1051058987a9318579c5f4800b11ec9,29.775803,-95.56353,64,34.179273,-82.379776
2,d1051058987a9318579c5f4800b11ec9,29.775803,-95.56353,91,44.948612,-90.315875
3,d1051058987a9318579c5f4800b11ec9,29.775803,-95.56353,118,33.824742,-88.554591
4,d1051058987a9318579c5f4800b11ec9,29.775803,-95.56353,145,45.463186,-98.471033


In [None]:
def _haversine_m(lat1, lon1, lat2, lon2):
    """Return great-circle distances in metres between coordinate arrays.

    Each argument is an array-like of decimal degrees; the four inputs must
    have matching (or broadcastable) shapes. The Earth is modelled as a
    sphere, so results differ slightly from geopy's default ellipsoidal
    geodesic (geopy documents the spherical error as up to about 0.5%).
    """
    earth_radius_m = 6371009.0  # mean Earth radius, in metres
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(v, dtype=float)) for v in (lat1, lon1, lat2, lon2)
    )
    a = (
        np.sin((lat2 - lat1) / 2.0) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2
    )
    # Clip guards against rounding pushing `a` marginally outside [0, 1].
    return 2.0 * earth_radius_m * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))


# Distance in metres between each house and each cluster, one value per row.
#
# Two changes from a row-wise `DataFrame.apply` over geopy:
# * the longitude column is resolved by its stripped label, because the
#   cluster file's header may pad 'INTPTLONG' with trailing whitespace and a
#   lookup by the plain name then raises KeyError;
# * the distances are computed with vectorized NumPy arithmetic, in fixed-size
#   chunks so temporary arrays stay small, instead of one Python-level call
#   per row of the house x cluster cross product.
_lon_col = next(c for c in dfm.columns if str(c).strip() == 'INTPTLONG')
_coords = [
    dfm[c].to_numpy(dtype=float)
    for c in ('Latitude', 'Longitude', 'INTPTLAT', _lon_col)
]

_dist_m = np.empty(len(dfm), dtype=float)
_chunk = 1_000_000
for _start in range(0, len(dfm), _chunk):
    _rows = slice(_start, _start + _chunk)
    _dist_m[_rows] = _haversine_m(*(col[_rows] for col in _coords))
dfm['dist'] = _dist_m