In [11]:
from pymongo import MongoClient
from collections import OrderedDict

In [2]:
# Open a client for the MongoDB URI below; the bare name on the last line
# makes the notebook display the client's repr.
MONGO_URI = "mongodb://localhost:27017"
client = MongoClient(MONGO_URI)
client

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

In [3]:
# Handle to the "db_test" database used by every cell below.
db = client["db_test"]

# Remember: Queries have implicit stages

In [7]:
# find() receives the filter, the projection and the limit as separate
# keyword arguments.
usa_filter = {"bornCountry": "USA"}
prize_years_only = {"prizes.year": 1}
cursor = db.laureates.find(filter=usa_filter, projection=prize_years_only, limit=3)
for laureate in cursor:
    print(laureate["prizes"])

[{'year': '1972'}]
[{'year': '1989'}]
[{'year': '1990'}]


# Enter the pipeline

In [9]:
# One explicit aggregation stage each for the filter, the projection and
# the limit.
match_usa = {"$match": {"bornCountry": "USA"}}
keep_prize_years = {"$project": {"prizes.year": 1}}
first_three = {"$limit": 3}
pipeline = [match_usa, keep_prize_years, first_three]

cursor = db.laureates.aggregate(pipeline)
for laureate in cursor:
    print(laureate["prizes"])

[{'year': '1972'}]
[{'year': '1989'}]
[{'year': '1990'}]


# Adding new stages to the pipeline

In [13]:
# Stages run in list order: match, project, sort, skip one document,
# then keep three.
sort_by_year = OrderedDict([("prizes.year", 1)])  # 1 = ascending
pipeline = [
    {"$match": {"bornCountry": "USA"}},
    {"$project": {"prizes.year": 1, "_id": 0}},
    {"$sort": sort_by_year},
    {"$skip": 1},
    {"$limit": 3},
]

cursor = db.laureates.aggregate(pipeline)
for laureate in cursor:
    print(laureate["prizes"])

[{'year': '1912'}]
[{'year': '1914'}]
[{'year': '1919'}]


# Counting in an aggregation

In [16]:
# $count ends the pipeline with a single document whose field, named by
# tag_name, holds the number of documents that reached the stage.
tag_name = "n_USA-born-laureates"
count_pipeline = [
    {"$match": {"bornCountry": "USA"}},
    {"$count": tag_name},
]
list(db.laureates.aggregate(count_pipeline))

[{'n_USA-born-laureates': 269}]

# Operator expressions. Please refer to the [aggregation expressions reference](https://www.mongodb.com/docs/manual/reference/aggregation-quick-reference/#std-label-aggregation-expressions)

In [17]:
# $size evaluates to the length of the prizes array; .next() pulls only
# the first document from the cursor.
size_stage = {"$project": {"n_prizes": {"$size": "$prizes"}}}
db.laureates.aggregate([size_stage]).next()

{'_id': ObjectId('65550468d1fc86246b056dae'), 'n_prizes': 1}

In [41]:
# The output field name is taken from a variable, and the operator
# argument is passed wrapped in a list.
alias = 'n_prizes'
size_expr = {"$size": ["$prizes"]}
db.laureates.aggregate([{"$project": {alias: size_expr}}]).next()

{'_id': ObjectId('65550468d1fc86246b056dae'), 'n_prizes': 1}

In [20]:
# $in checks whether the string "1" occurs among the share values of the
# laureate's prizes.
solo_expr = {"$in": ["1", "$prizes.share"]}
db.laureates.aggregate([{"$project": {"solo_winner": solo_expr}}]).next()

{'_id': ObjectId('65550468d1fc86246b056dae'), 'solo_winner': False}

# Implementing distinct with aggregations

In [21]:
# Grouping on a field yields one document per distinct value, carried in _id.
list_1 = db.laureates.distinct("bornCountry")
grouped = db.laureates.aggregate([{"$group": {"_id": "$bornCountry"}}])
list_2 = [group["_id"] for group in grouped]
# None is removed from the $group result before comparing with distinct().
set(list_2).difference({None}) == set(list_1)

True

In [22]:
# In general, "_id": None means a global aggregation: all documents land
# in a single group.
count_per_laureate = {"$project": {"n_prizes": {"$size": "$prizes"}}}
grand_total = {"$group": {"_id": None, "n_prizes_total": {"$sum": "$n_prizes"}}}
list(db.laureates.aggregate([count_per_laureate, grand_total]))

[{'_id': None, 'n_prizes_total': 941}]

# Deconstructing arrays into separate documents with $unwind

## A pipeline without $unwind

In [23]:
# Project the wanted fields only; laureates stays an array inside each
# prize document.
wanted_fields = {
    "_id": 0,
    "year": 1,
    "category": 1,
    "laureates.surname": 1,
    "laureates.share": 1,
}
pipeline = [{"$project": wanted_fields}, {"$limit": 3}]
list(db.prizes.aggregate(pipeline))

[{'year': '2018',
  'category': 'medicine',
  'laureates': [{'surname': 'Allison', 'share': '2'},
   {'surname': 'Honjo', 'share': '2'}]},
 {'year': '2014',
  'category': 'literature',
  'laureates': [{'surname': 'Modiano', 'share': '1'}]},
 {'year': '2018',
  'category': 'physics',
  'laureates': [{'surname': 'Ashkin', 'share': '2'},
   {'surname': 'Mourou', 'share': '4'},
   {'surname': 'Strickland', 'share': '4'}]}]

## A pipeline with $unwind

In [24]:
# $unwind runs first, so each output document carries a single laureate
# sub-document instead of an array.
wanted_fields = {
    "_id": 0,
    "year": 1,
    "category": 1,
    "laureates.surname": 1,
    "laureates.share": 1,
}
pipeline = [
    {"$unwind": "$laureates"},
    {"$project": wanted_fields},
    {"$limit": 3},
]
list(db.prizes.aggregate(pipeline))

[{'year': '2018',
  'category': 'medicine',
  'laureates': {'surname': 'Allison', 'share': '2'}},
 {'year': '2018',
  'category': 'medicine',
  'laureates': {'surname': 'Honjo', 'share': '2'}},
 {'year': '2014',
  'category': 'literature',
  'laureates': {'surname': 'Modiano', 'share': '1'}}]

## Another example

In [38]:
# The two pipelines differ only in the leading $unwind stage, so the
# shared stages are written once.
shared_stages = [
    {"$project": {"year": 1, "category": 1, "laureates.id": 1}},
    {"$group": {
        "_id": {"$concat": ["$category", ":", "$year"]},
        "laureate_ids": {"$addToSet": "$laureates.id"},
    }},
    {"$sort": {"_id": -1}},
    {"$limit": 5},
]
pipeline_without = list(shared_stages)
pipeline_with = [{"$unwind": "$laureates"}] + shared_stages

In [39]:
# Run the variant that has no $unwind stage and show every result document.
groups_without_unwind = db.prizes.aggregate(pipeline_without)
list(groups_without_unwind)

[{'_id': 'physics:2018', 'laureate_ids': [['960', '961', '962']]},
 {'_id': 'physics:2017', 'laureate_ids': [['941', '942', '943']]},
 {'_id': 'physics:2016', 'laureate_ids': [['928', '929', '930']]},
 {'_id': 'physics:2015', 'laureate_ids': [['919', '920']]},
 {'_id': 'physics:2014', 'laureate_ids': [['906', '907', '908']]}]

In [40]:
# Run the variant that unwinds laureates first and show every result document.
groups_with_unwind = db.prizes.aggregate(pipeline_with)
list(groups_with_unwind)

[{'_id': 'physics:2018', 'laureate_ids': ['962', '960', '961']},
 {'_id': 'physics:2017', 'laureate_ids': ['941', '942', '943']},
 {'_id': 'physics:2016', 'laureate_ids': ['928', '929', '930']},
 {'_id': 'physics:2015', 'laureate_ids': ['920', '919']},
 {'_id': 'physics:2014', 'laureate_ids': ['906', '907', '908']}]

## Counting with $unwind

### It is possible without unwind

In [42]:
# Measure each prize's laureates array with $size, then add the sizes up
# per category and list the largest totals first.
size_per_prize = {"$project": {"n_laureates": {"$size": "$laureates"}, "category": 1}}
total_per_category = {"$group": {"_id": "$category", "n_laureates": {"$sum": "$n_laureates"}}}
largest_first = {"$sort": {"n_laureates": -1}}
list(db.prizes.aggregate([size_per_prize, total_per_category, largest_first]))

[{'_id': 'medicine', 'n_laureates': 216},
 {'_id': 'physics', 'n_laureates': 210},
 {'_id': 'chemistry', 'n_laureates': 181},
 {'_id': 'peace', 'n_laureates': 133},
 {'_id': 'literature', 'n_laureates': 114},
 {'_id': 'economics', 'n_laureates': 81}]

# ...but it turns out that unwind is a less verbose solution

In [47]:
# After $unwind there is one document per laureate, so summing 1 per
# document counts the laureates in each category.
one_doc_per_laureate = {"$unwind": "$laureates"}
count_per_category = {"$group": {"_id": "$category", "n_laureates": {"$sum": 1}}}
largest_first = {"$sort": {"n_laureates": -1}}
list(db.prizes.aggregate([one_doc_per_laureate, count_per_category, largest_first]))

[{'_id': 'medicine', 'n_laureates': 216},
 {'_id': 'physics', 'n_laureates': 210},
 {'_id': 'chemistry', 'n_laureates': 181},
 {'_id': 'peace', 'n_laureates': 133},
 {'_id': 'literature', 'n_laureates': 114},
 {'_id': 'economics', 'n_laureates': 81}]

# JOIN in NoSQL with $lookup

## basic example

In [54]:
# Seed two small collections for the $lookup demonstration.
#
# The documents use fixed _id values, so inserting them into collections
# that already hold them fails with a duplicate-key error. Dropping both
# collections first makes the cell safe to re-run.
db.orders.drop()
db.inventory.drop()

# Order 3 deliberately has no "item" field.
db.orders.insert_many([
    {"_id": 1, "item": "almonds", "price": 12, "quantity": 2},
    {"_id": 2, "item": "pecans", "price": 20, "quantity": 1},
    {"_id": 3},
])
# Inventory 5 has sku set to None and inventory 6 has no "sku" field at all.
db.inventory.insert_many([
    {"_id": 1, "sku": "almonds", "description": "product 1", "instock": 120},
    {"_id": 2, "sku": "bread", "description": "product 2", "instock": 80},
    {"_id": 3, "sku": "cashews", "description": "product 3", "instock": 60},
    {"_id": 4, "sku": "pecans", "description": "product 4", "instock": 70},
    {"_id": 5, "sku": None, "description": "Incomplete"},
    {"_id": 6},
])

InsertManyResult([1, 2, 3, 4, 5, 6], acknowledged=True)

In [56]:
# Attach to every order the inventory documents whose "sku" equals the
# order's "item"; the matches are stored as a list under "inventory_docs".
lookup_spec = {
    "from": "inventory",
    "localField": "item",
    "foreignField": "sku",
    "as": "inventory_docs",
}
cursor = db.orders.aggregate([{"$lookup": lookup_spec}])
cursor.next()

{'_id': 1,
 'item': 'almonds',
 'price': 12,
 'quantity': 2,
 'inventory_docs': [{'_id': 1,
   'sku': 'almonds',
   'description': 'product 1',
   'instock': 120}]}

In [57]:
# Remove the two collections created for the $lookup demonstration.
for demo_collection in (db.orders, db.inventory):
    demo_collection.drop()

## Going back to the aggregates example

In [59]:
# Gather the distinct birth countries of the laureates of economics prizes.
only_economics = {"$match": {"category": "economics"}}
one_doc_per_laureate = {"$unwind": "$laureates"}
join_bios = {"$lookup": {
    "from": "laureates",
    "foreignField": "id",
    "localField": "laureates.id",
    "as": "laureate_bios",
}}
flatten_bios = {"$unwind": "$laureate_bios"}
collect_countries = {"$group": {
    "_id": None,
    "bornCountries": {"$addToSet": "$laureate_bios.bornCountry"},
}}
pipeline = [
    only_economics,
    one_doc_per_laureate,
    join_bios,
    flatten_bios,
    collect_countries,
]
list(db.prizes.aggregate(pipeline))

[{'_id': None,
  'bornCountries': ['Hungary',
   'British West Indies (now Saint Lucia)',
   'Russia',
   'Canada',
   'Austria',
   'Italy',
   'Finland',
   'British Mandate of Palestine (now Israel)',
   'USA',
   'Germany (now Poland)',
   'Scotland',
   'Russian Empire (now Russia)',
   'Norway',
   'India',
   'Russian Empire (now Belarus)',
   'the Netherlands',
   'Sweden',
   'Cyprus',
   'Germany',
   'France',
   'United Kingdom']}]

## Decomposing the pipeline

### At first, laureates is a list of documents

In [64]:
# Step 1: keep only the economics prizes and inspect the first one.
pipeline = [{"$match": {"category": "economics"}}]
step_docs = list(db.prizes.aggregate(pipeline))
step_docs[0]

{'_id': ObjectId('6555048619e46de1de2d02f8'),
 'year': '1969',
 'category': 'economics',
 'laureates': [{'id': '677',
   'firstname': 'Ragnar',
   'surname': 'Frisch',
   'motivation': '"for having developed and applied dynamic models for the analysis of economic processes"',
   'share': '2'},
  {'id': '678',
   'firstname': 'Jan',
   'surname': 'Tinbergen',
   'motivation': '"for having developed and applied dynamic models for the analysis of economic processes"',
   'share': '2'}]}

### We use unwind here to split each document whose laureates field is an array into one document per laureate, each keeping the other fields unchanged

In [65]:
# Step 2: add $unwind so that laureates holds a single sub-document.
only_economics = {"$match": {"category": "economics"}}
one_doc_per_laureate = {"$unwind": "$laureates"}
pipeline = [only_economics, one_doc_per_laureate]
step_docs = list(db.prizes.aggregate(pipeline))
step_docs[0]

{'_id': ObjectId('6555048619e46de1de2d02f8'),
 'year': '1969',
 'category': 'economics',
 'laureates': {'id': '677',
  'firstname': 'Ragnar',
  'surname': 'Frisch',
  'motivation': '"for having developed and applied dynamic models for the analysis of economic processes"',
  'share': '2'}}

### With each laureate's id accessible as laureates.id, it is now possible to join the laureates and prizes collections

In [66]:
# Step 3: join each unwound laureate to the laureates collection on id;
# the matches are stored as a list under laureate_bios.
join_bios = {"$lookup": {
    "from": "laureates",
    "foreignField": "id",
    "localField": "laureates.id",
    "as": "laureate_bios",
}}
pipeline = [
    {"$match": {"category": "economics"}},
    {"$unwind": "$laureates"},
    join_bios,
]
step_docs = list(db.prizes.aggregate(pipeline))
step_docs[0]

{'_id': ObjectId('6555048619e46de1de2d02f8'),
 'year': '1969',
 'category': 'economics',
 'laureates': {'id': '677',
  'firstname': 'Ragnar',
  'surname': 'Frisch',
  'motivation': '"for having developed and applied dynamic models for the analysis of economic processes"',
  'share': '2'},
 'laureate_bios': [{'_id': ObjectId('65550468d1fc86246b056f9b'),
   'id': '677',
   'firstname': 'Ragnar',
   'surname': 'Frisch',
   'born': '1895-03-03',
   'died': '1973-01-31',
   'bornCountry': 'Norway',
   'bornCountryCode': 'NO',
   'bornCity': 'Oslo',
   'diedCountry': 'Norway',
   'diedCountryCode': 'NO',
   'diedCity': 'Oslo',
   'gender': 'male',
   'prizes': [{'year': '1969',
     'category': 'economics',
     'share': '2',
     'motivation': '"for having developed and applied dynamic models for the analysis of economic processes"',
     'affiliations': [{'name': 'University of Oslo',
       'city': 'Oslo',
       'country': 'Norway'}]}]}]}

### Now, laureate_bios is wrapped in a list. Let's flatten it using unwind again

In [67]:
# Step 4: unwind laureate_bios so it becomes a sub-document instead of a list.
join_bios = {"$lookup": {
    "from": "laureates",
    "foreignField": "id",
    "localField": "laureates.id",
    "as": "laureate_bios",
}}
pipeline = [
    {"$match": {"category": "economics"}},
    {"$unwind": "$laureates"},
    join_bios,
    {"$unwind": "$laureate_bios"},
]
step_docs = list(db.prizes.aggregate(pipeline))
step_docs[0]

{'_id': ObjectId('6555048619e46de1de2d02f8'),
 'year': '1969',
 'category': 'economics',
 'laureates': {'id': '677',
  'firstname': 'Ragnar',
  'surname': 'Frisch',
  'motivation': '"for having developed and applied dynamic models for the analysis of economic processes"',
  'share': '2'},
 'laureate_bios': {'_id': ObjectId('65550468d1fc86246b056f9b'),
  'id': '677',
  'firstname': 'Ragnar',
  'surname': 'Frisch',
  'born': '1895-03-03',
  'died': '1973-01-31',
  'bornCountry': 'Norway',
  'bornCountryCode': 'NO',
  'bornCity': 'Oslo',
  'diedCountry': 'Norway',
  'diedCountryCode': 'NO',
  'diedCity': 'Oslo',
  'gender': 'male',
  'prizes': [{'year': '1969',
    'category': 'economics',
    'share': '2',
    'motivation': '"for having developed and applied dynamic models for the analysis of economic processes"',
    'affiliations': [{'name': 'University of Oslo',
      'city': 'Oslo',
      'country': 'Norway'}]}]}}

### Finally, a `$group` stage collects the distinct birth countries into bornCountries using `$addToSet`

In [68]:
# Step 5: the complete pipeline, ending in a single global group.
pipeline = [
    # keep the economics prizes
    {"$match": {"category": "economics"}},
    # one document per laureate
    {"$unwind": "$laureates"},
    # join the laureate's biography on id
    {
        "$lookup": {
            "from": "laureates",
            "foreignField": "id",
            "localField": "laureates.id",
            "as": "laureate_bios",
        }
    },
    # turn the one-element list of matches into a sub-document
    {"$unwind": "$laureate_bios"},
    # "_id": None puts every document in one group; $addToSet keeps each
    # country once
    {
        "$group": {
            "_id": None,
            "bornCountries": {"$addToSet": "$laureate_bios.bornCountry"},
        }
    },
]
list(db.prizes.aggregate(pipeline))

[{'_id': None,
  'bornCountries': ['the Netherlands',
   'Russian Empire (now Belarus)',
   'British Mandate of Palestine (now Israel)',
   'India',
   'Norway',
   'Russian Empire (now Russia)',
   'United Kingdom',
   'Hungary',
   'France',
   'Canada',
   'Germany',
   'Cyprus',
   'Sweden',
   'Finland',
   'Italy',
   'Austria',
   'Russia',
   'British West Indies (now Saint Lucia)',
   'Scotland',
   'Germany (now Poland)',
   'USA']}]

# Adding new fields with $addFields

### Standardizing birth dates that are missing month and day

In [71]:
# Turn the born/died strings into datetimes.
# Only documents whose born and died strings both compare greater than
# "1700" are kept.
has_both_dates = {"$match": {"died": {"$gt": "1700"}, "born": {"$gt": "1700"}}}

# Split both date strings on "-". Only bornArray is read by the next stage.
split_dates = {"$addFields": {
    "bornArray": {"$split": ["$born", "-"]},
    "diedArray": {"$split": ["$died", "-"]},
}}

# When any part of born is "00", replace born with "<first part>-01-01";
# otherwise leave it as it is.
fix_born = {"$addFields": {"born": {"$cond": [
    {"$in": ["00", "$bornArray"]},
    {"$concat": [{"$arrayElemAt": ["$bornArray", 0]}, "-01-01"]},
    "$born",
]}}}

# Parse both strings into dates and drop every other field, _id included.
to_dates = {"$project": {
    "died": {"$dateFromString": {"dateString": "$died"}},
    "born": {"$dateFromString": {"dateString": "$born"}},
    "_id": 0,
}}

pipeline = [has_both_dates, split_dates, fix_born, to_dates]
docs = list(db.laureates.aggregate(pipeline))
print(docs[0])

{'died': datetime.datetime(1984, 10, 14, 0, 0), 'born': datetime.datetime(1918, 9, 27, 0, 0)}


## A $bucket list

In [76]:
# Histogram of laureate lifespans in ten-year buckets.

# Milliseconds in one year: 1000 * 60 * 60 * 24 * 365.25
MS_PER_YEAR = 31557600000

pipeline = [
    # Keep documents whose born and died strings both compare greater than "1700".
    {"$match": {"died": {"$gt": "1700"}, "born": {"$gt": "1700"}}},
    # Split both date strings on "-". Only bornArray is read by the next stage.
    {"$addFields": {
        "bornArray": {"$split": ["$born", "-"]},
        "diedArray": {"$split": ["$died", "-"]},
    }},
    # When any part of born is "00", replace born with "<first part>-01-01".
    {"$addFields": {"born": {"$cond": [
        {"$in": ["00", "$bornArray"]},
        {"$concat": [{"$arrayElemAt": ["$bornArray", 0]}, "-01-01"]},
        "$born",
    ]}}},
    # Parse both strings into dates.
    {"$project": {
        "died": {"$dateFromString": {"dateString": "$died"}},
        "born": {"$dateFromString": {"dateString": "$born"}},
    }},
    # Subtracting two dates gives milliseconds; divide and floor to whole years.
    {"$project": {"years": {"$floor": {"$divide": [
        {"$subtract": ["$died", "$born"]},
        MS_PER_YEAR,
    ]}}}},
    # Boundaries are 30, 40, ..., 110, so the buckets cover [30, 110).
    # Without "default", a single document outside that range makes the whole
    # aggregation fail; with it, such documents are counted under "other".
    {"$bucket": {
        "groupBy": "$years",
        "boundaries": list(range(30, 120, 10)),
        "default": "other",
    }},
]
docs = list(db.laureates.aggregate(pipeline))
print(docs)

[{'_id': 30, 'count': 1}, {'_id': 40, 'count': 6}, {'_id': 50, 'count': 21}, {'_id': 60, 'count': 87}, {'_id': 70, 'count': 154}, {'_id': 80, 'count': 221}, {'_id': 90, 'count': 115}, {'_id': 100, 'count': 2}]


# Exercise: as above, decompose each of the unexplained pipelines stage by stage, explaining what happens at each step and consulting the documentation as you go