## Creating the 2dsphere index

In [9]:
from pymongo import MongoClient
# MongoClient() is called without arguments, so pymongo's default connection settings apply
client = MongoClient()
# Handles to the two collections of the `companies` database used in this notebook
companies = client["companies"]["companies"]
offices = client["companies"]["offices"]

In [10]:
# Stages that reshape the company documents before the geo index can be built.

## $unwind: emit one document per entry of the `offices` array
unwind = {'$unwind': {'path': '$offices'}}

## $match: keep only offices whose latitude and longitude are both non-null
dropnull = {
    '$match': {
        'offices.latitude': {'$ne': None},
        'offices.longitude': {'$ne': None},
    }
}

## $set: add a GeoJSON Point under `geo_coord`, built as [longitude, latitude]
geojson = {
    '$set': {
        'geo_coord': {
            'type': 'Point',
            'coordinates': ['$offices.longitude', '$offices.latitude'],
        }
    }
}

## $project: drop the original _id
### After the unwind, several documents carry the _id of the same parent
### company, so inserting them unchanged would hit duplicated _id values.
remove_id = {'$project': {'_id': 0}}

## The stages run in list order, each one consuming the previous stage's output.
pipeline = [unwind, dropnull, geojson, remove_id]

In [11]:
## We use this pipeline in an aggregation
## aggregate() hands back a cursor rather than a list, as the repr below shows
result = companies.aggregate(pipeline)
result

<pymongo.command_cursor.CommandCursor at 0x100d65550>

In [12]:
## Insert every document produced by the pipeline into `offices`.
## NOTE(review): iterating `result` here consumes the cursor, and since the
## pipeline strips _id, re-running this cell would presumably insert a second
## copy of every office -- confirm before re-executing against a populated collection.
res = offices.insert_many(result)

In [13]:
res.inserted_ids[:3]

[ObjectId('60211110a1919d6139ffe75f'),
 ObjectId('60211110a1919d6139ffe760'),
 ObjectId('60211110a1919d6139ffe761')]

In [14]:
## The cursor was already iterated by insert_many above, so nothing is left to read
len(list(result))

0

### Create 2dsphere index

In [15]:
## Index the GeoJSON field added by the pipeline; the call returns the index name
offices.create_index([("geo_coord","2dsphere")])

'geo_coord_2dsphere'

### Adding more operations to pipeline

In [16]:
# Extend the preparation pipeline with one extra stage.
# The unwind, dropnull, geojson and remove_id stages are the ones defined when
# the pipeline was first built; they are reused here instead of being redefined.

## filter: keep documents whose tag_list contains "design" or "Design"
filt_design = {"$match":
                  {"tag_list":{"$regex":".*[Dd]esign.*"}}
              }
## This is a pipeline, all the stages will be executed one after the other.

pipeline = [unwind, dropnull, geojson, remove_id, filt_design]

In [17]:
result = companies.aggregate(pipeline)

In [18]:
data = list(result)
len(data)

461

In [19]:
data[0]["tag_list"]

'community, social, news, bookmark, digg, technology, design'

## Querying
Once we have a new collection with the data prepared, we can start querying it.

In [20]:
design = offices.find({"tag_list":{"$regex":".*[Dd]esign.*"}})

In [21]:
len(list(design))

461

### near or geoNear operators
These are the query operators.

Not to be confused with the `$geoNear` aggregation pipeline stage.

In [22]:
## Reference point as a GeoJSON Point, written as [longitude, latitude]
eiffel_tower = {"type":"Point", "coordinates":[2.2945,48.8584]}
## $near in the layout documented for GeoJSON points: the point goes under
## $geometry and the optional distance bounds sit beside it, inside $near.
query = {"geo_coord":
         {"$near": {
             "$geometry": eiffel_tower,
             # Optionally, we can set one or both maxDistance, minDistance (in meters)
             "$maxDistance":3000, "$minDistance":2000 }}}
paris = offices.find(query)

In [23]:
paris = list(paris)
len(paris)

20

### Calculating Distances

#### Euclidean Distance
$A = (x_A, y_A)$

$B = (x_B, y_B)$

$dist_{AB} = \sqrt{(x_A-x_B)^2 + (y_A-y_B)^2}$

In [24]:
# Euclidean distance between two points
def dist(pointA, pointB=(0,0)):
    """Return the straight-line (Euclidean) distance from pointA to pointB.

    Both arguments are indexable coordinate sequences. pointB defaults to the
    2-D origin. Only the first len(pointA) coordinates of pointB are read.
    """
    squared_sum = 0
    for axis in range(len(pointA)):
        squared_sum += (pointA[axis] - pointB[axis]) ** 2
    # Square root of the accumulated squared differences
    return squared_sum ** .5

In [25]:
dist([7,8])

10.63014581273465

In [26]:
## Planar distance from each nearby office to the Eiffel Tower, in coordinate units
distances = [
    dist(office["geo_coord"]["coordinates"], eiffel_tower["coordinates"])
    for office in paris
]

In [27]:
# Distance in Degrees
distances

[0.027390855476418334,
 0.022701592527842505,
 0.023700403118931342,
 0.029152472365138238,
 0.029152472365138238,
 0.026771368063660744,
 0.026771368063660744,
 0.030529760943869808,
 0.030529760943869808,
 0.03218738299396029,
 0.028415663091507348,
 0.027476849200733577,
 0.03185180456944362,
 0.031119813555031114,
 0.024167320517591843,
 0.032385631813039994,
 0.03698658077262518,
 0.029769268013341586,
 0.03655933706319018,
 0.03193368166998561]

In [28]:
# Approximation to meters
# 1° ~ 111 km (at the equator)
## This results in values that are outside our range, because the approximation
## is not good enough: a degree of longitude covers less ground away from the
## equator, so scaling both axes by the same 111 km overestimates the distance.
[d*111000 for d in distances]

[3040.384957882435,
 2519.876770590518,
 2630.744746201379,
 3235.9244325303443,
 3235.9244325303443,
 2971.6218550663425,
 2971.6218550663425,
 3388.8034647695486,
 3388.8034647695486,
 3572.7995123295923,
 3154.1386031573156,
 3049.930261281427,
 3535.550307208242,
 3454.299304608454,
 2682.5725774526945,
 3594.8051312474395,
 4105.510465761395,
 3304.388749480916,
 4058.08641401411,
 3544.6386653684026]

#### Haversine

In [29]:
from haversine import haversine

In [30]:
## Haversine distance from each nearby office to the Eiffel Tower.
## Both coordinate pairs are reversed with [::-1] before being handed to haversine.
distances = [
    haversine(office["geo_coord"]["coordinates"][::-1],
              eiffel_tower["coordinates"][::-1])
    for office in paris
]

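**NOTE:** `haversine` expects each point as (latitude, longitude), in that order. Our GeoJSON coordinates are stored as [longitude, latitude], which is why they are reversed with `[::-1]` above.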

In [31]:
distances # in km

[2.0293801160751523,
 2.140384455030728,
 2.141014432265475,
 2.346288905121962,
 2.346288905121962,
 2.3823983619347975,
 2.3823983619347975,
 2.508083572808068,
 2.508083572808068,
 2.525561886851476,
 2.526874801632946,
 2.545316677795391,
 2.637376149745737,
 2.6497406932746252,
 2.678886539715555,
 2.71932026117995,
 2.7467492001750426,
 2.802389089446785,
 2.9149462188668926,
 2.9781518244257503]

#### $geoNear Aggregation

In [32]:
## $geoNear aggregation stage centred on the same point as the earlier $near query.
## Reusing eiffel_tower keeps both searches on a single definition of the reference point.
geoagg = {"$geoNear":{"near": eiffel_tower,
                      # the computed distance is written to this field of each result
                      "distanceField":"dist",
                      "minDistance":2000,
                      "maxDistance":3000}}

In [33]:
result = offices.aggregate([geoagg])

In [34]:
data = list(result)

In [35]:
len(data)

20

In [36]:
data[0]["dist"]

2031.6389012581994

## Bonus - Progress Bar
- [Tutorial on Towards Data Science](https://towardsdatascience.com/ever-wanted-progress-bars-in-jupyter-bdb3988d9cfc)
- [tqdm Docs](https://github.com/tqdm/tqdm#manual)

In [37]:
from tqdm import tqdm

In [41]:
def calculate_distances(collection, point, total=10834):
    """Return the haversine distance from every document in `collection` to `point`.

    Parameters
    ----------
    collection : iterable of documents, each holding its position under
        ["geo_coord"]["coordinates"].
    point : mapping with a "coordinates" entry, in the same order as the documents'.
    total : item count handed to tqdm for the progress bar. The default keeps
        the value that used to be hard-coded; pass the real size of
        `collection` when it differs.

    Returns
    -------
    list with one haversine result per document, in iteration order.
    """
    # Both coordinate pairs are reversed before the haversine call; the target
    # is the same for every document, so reverse it once up front.
    target = point["coordinates"][::-1]
    return [
        haversine(office["geo_coord"]["coordinates"][::-1], target)
        for office in tqdm(collection, total=total)
    ]

In [42]:
off = offices.find({})

In [43]:
## NOTE(review): this assignment rebinds `dist`, which until now named the
## Euclidean-distance helper; calling dist(...) after this cell will fail
## because the name then refers to the returned list.
dist = calculate_distances(off,eiffel_tower)

100%|██████████| 10834/10834 [00:02<00:00, 4089.98it/s]
