In [1]:
from pymongo import MongoClient

In [2]:
# Create a client for the MongoDB server at localhost, port 27017.
client = MongoClient("mongodb://localhost:27017")
# Bare expression so the notebook displays the client's repr.
client

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

In [3]:
# Fail fast, with an actionable message, if the example database is missing:
# every query in the rest of the notebook reads from it.
assert 'db_test' in client.list_database_names(), (
    "Database 'db_test' was not found on the MongoDB server; "
    "load the laureates and prizes collections before running this notebook."
)

# Getting the database

In [4]:
# Subscript access yields the same database handle as attribute access.
db = client["db_test"]

# Remembering the fields of laureates

In [5]:
# Fetch a single document to see which fields a laureate record carries.
db.laureates.find_one()

{'_id': ObjectId('65550468d1fc86246b056dae'),
 'id': '96',
 'firstname': 'John Robert',
 'surname': 'Schrieffer',
 'born': '1931-05-31',
 'died': '0000-00-00',
 'bornCountry': 'USA',
 'bornCountryCode': 'US',
 'bornCity': 'Oak Park, IL',
 'gender': 'male',
 'prizes': [{'year': '1972',
   'category': 'physics',
   'share': '3',
   'motivation': '"for their jointly developed theory of superconductivity, usually called the BCS-theory"',
   'affiliations': [{'name': 'University of Pennsylvania',
     'city': 'Philadelphia, PA',
     'country': 'USA'}]}]}

# Distinct use cases

## A simple case

In [6]:
# List every distinct value of the top-level "gender" field.
db.laureates.distinct("gender")

['female', 'male', 'org']

## Distinct with dot notation

In [7]:
# Dot notation reaches into the embedded "prizes" array and collects the
# distinct "category" values of its elements.
db.laureates.distinct("prizes.category")

['chemistry', 'economics', 'literature', 'medicine', 'peace', 'physics']

# Distinct with filters

## Find which categories have prize shares in the laureates collection

In [8]:
# NOTE(review): no filter document is passed here, so this returns every
# category, exactly like the earlier unfiltered call. Given the
# "Distinct with filters" section, a filter on "prizes.share" was presumably
# intended -- confirm and re-run before relying on this cell.
db.laureates.distinct("prizes.category")

['chemistry', 'economics', 'literature', 'medicine', 'peace', 'physics']

## Find which categories have a prize split four ways, using the prizes collection

In [9]:
# Categories of prizes that have at least one laureate entry whose share is
# the string "4".
db.prizes.distinct(
    "category",
    {"laureates.share": "4"},
)

['chemistry', 'medicine', 'physics']

# Find prize categories with multi-winners

## Checking whether there are multi-winners

In [10]:
# "prizes.1" addresses index 1 (the second element) of the prizes array, so
# it exists only for laureates that hold more than one prize.
db.laureates.count_documents(
    {"prizes.1": {"$exists": True}}
)

6

## Finding the prize categories of multi-winners

In [11]:
# Distinct prize categories among laureates that hold a second prize.
db.laureates.distinct(
    "prizes.category",
    {"prizes.1": {"$exists": True}},
)

['chemistry', 'peace', 'physics']

# Array fields and operators

## Finding how many laureates have prizes in physics, chemistry, or medicine

In [12]:
# Count laureates with at least one prize in any of the listed categories.
db.laureates.count_documents(
    {"prizes.category": {"$in": ["physics", "chemistry", "medicine"]}}
)

604

## Finding the complement

In [13]:
# Count laureates with no prize in any of the listed categories.
db.laureates.count_documents(
    {"prizes.category": {"$nin": ["physics", "chemistry", "medicine"]}}
)

330

# More complex queries with $elemMatch

In [14]:
# Count laureates holding an unshared physics prize. $elemMatch requires one
# and the same element of "prizes" to satisfy both conditions.
filter_dict = {
    "prizes": {
        "$elemMatch": {"category": "physics", "share": "1"},
    },
}
db.laureates.count_documents(filter_dict)

47

In [15]:
# Count laureates holding a shared physics prize (share greater than "1").
# The bound is a string, so $gt is a string comparison here.
filter_dict = {
    "prizes": {
        "$elemMatch": {"category": "physics", "share": {"$gt": "1"}},
    },
}
db.laureates.count_documents(filter_dict)

162

In [16]:
# Count laureates holding an unshared physics prize awarded before 1945.
# The year bound is a string, so $lt is a string comparison here.
filter_dict = {
    "prizes": {
        "$elemMatch": {
            "category": "physics",
            "share": "1",
            "year": {"$lt": "1945"},
        },
    },
}
db.laureates.count_documents(filter_dict)

29

# Even more complex filters with regex

## Finding how many laureates named Curie are in the laureates collection

In [17]:
# Count laureates whose surname contains the substring "Curie".
db.laureates.count_documents(
    {"surname": {"$regex": "Curie"}}
)

3

# Finding the names

In [18]:
# First names of all laureates whose surname contains "Curie".
[
    laureate["firstname"]
    for laureate in db.laureates.find({"surname": {"$regex": "Curie"}})
]

['Irène', 'Pierre', 'Marie']

# Finding bornCountry subtleties in Poland

In [19]:
# Birth countries containing "Poland"; no $options are given, so the match
# is case-sensitive.
case_sensitive = db.laureates.distinct(
    "bornCountry",
    {"bornCountry": {"$regex": "Poland"}},
)
case_sensitive

['Austria-Hungary (now Poland)',
 'Free City of Danzig (now Poland)',
 'German-occupied Poland (now Poland)',
 'Germany (now Poland)',
 'Poland',
 'Poland (now Belarus)',
 'Poland (now Lithuania)',
 'Poland (now Ukraine)',
 'Prussia (now Poland)',
 'Russian Empire (now Poland)']

## Using case insensitive regex

In [20]:
# Same search with a lowercase pattern and the "i" (ignore case) option.
case_insensitive = db.laureates.distinct(
    "bornCountry",
    {"bornCountry": {"$regex": "poland", "$options": "i"}},
)
# Compare as sets: only the membership matters, not the order.
assert set(case_insensitive) == set(case_sensitive)

## Regex alternatives with BSON and re

In [21]:
from bson.regex import Regex

# A bson Regex(pattern, flags) object used directly as the filter value.
db.laureates.distinct(
    "bornCountry",
    {"bornCountry": Regex("poland", "i")},
)

['Austria-Hungary (now Poland)',
 'Free City of Danzig (now Poland)',
 'German-occupied Poland (now Poland)',
 'Germany (now Poland)',
 'Poland',
 'Poland (now Belarus)',
 'Poland (now Lithuania)',
 'Poland (now Ukraine)',
 'Prussia (now Poland)',
 'Russian Empire (now Poland)']

In [22]:
import re

# A compiled Python pattern used directly as the filter value.
db.laureates.distinct(
    "bornCountry",
    {"bornCountry": re.compile("poland", re.IGNORECASE)},
)

['Austria-Hungary (now Poland)',
 'Free City of Danzig (now Poland)',
 'German-occupied Poland (now Poland)',
 'Germany (now Poland)',
 'Poland',
 'Poland (now Belarus)',
 'Poland (now Lithuania)',
 'Poland (now Ukraine)',
 'Prussia (now Poland)',
 'Russian Empire (now Poland)']

# Basic regex

## Beginning (^)

In [23]:
# "^" anchors the pattern at the start of the string.
db.laureates.distinct(
    "bornCountry",
    {"bornCountry": {"$regex": "^Poland"}},
)

['Poland',
 'Poland (now Belarus)',
 'Poland (now Lithuania)',
 'Poland (now Ukraine)']

## Escaping special characters with a backslash (" \ ")

In [25]:
# Birth countries that start with "Poland (now".
# The pattern is a raw string so the backslash reaches the regex engine
# unchanged: "\(" matches a literal parenthesis. In a normal string literal
# "\(" is an invalid escape sequence and triggers a warning on current Python
# versions.
db.laureates.distinct(
    "bornCountry",
    {"bornCountry": {"$regex": r"^Poland \(now"}},
)


['Poland (now Belarus)', 'Poland (now Lithuania)', 'Poland (now Ukraine)']

## Matching the end of the string with $

In [27]:
# Birth countries that end with "now Poland)"; "$" anchors the pattern at the
# end of the string.
# The pattern is a raw string so that "\)" is passed through as an escaped
# literal parenthesis instead of being an invalid Python escape sequence.
db.laureates.distinct(
    "bornCountry",
    {"bornCountry": {"$regex": r"now Poland\)$"}},
)

['Austria-Hungary (now Poland)',
 'Free City of Danzig (now Poland)',
 'German-occupied Poland (now Poland)',
 'Germany (now Poland)',
 'Prussia (now Poland)',
 'Russian Empire (now Poland)']

# Exercises

1) What expression asserts that the distinct Nobel Prize categories cataloged by the "prizes" collection are the same as those cataloged by the "laureates"? 

2) There are some recorded countries of death ("diedCountry") that do not appear as a country of birth ("bornCountry") for laureates. One such country is "East Germany". Return a set of all such countries as countries.

3) Determine the number of distinct countries recorded as part of an affiliation for laureates' prizes. Save this as count.

4) In which countries have USA-born laureates had affiliations for their prizes?

5)
- Save a filter document criteria that, when passed to db.prizes.distinct, returns all prize categories shared by three or more laureates. That is, "laureates.2" must exist for such documents.
- Save these prize categories as a Python set called triple_play_categories.
- Confirm via an assertion that "literature" is the only prize category with no prizes shared by three or more laureates.

6) What is the approximate ratio of the number of laureates who won an unshared ({"share": "1"}) prize in physics after World War II ({"year": {"$gte": "1945"}}) to the number of laureates who won a shared prize in physics after World War II?

7) What is this ratio for prize categories other than physics, chemistry, and medicine?

8) What is this ratio for prize categories other than physics, chemistry, and medicine?

9) There are two laureates with Berkeley, California as a prize affiliation city that have the initials G.S. - Glenn Seaborg and George Smoot. How many laureates in total have a first name beginning with "G" and a surname beginning with "S"?

10) Just as we saw with Poland, there are laureates who were born somewhere that was in Germany at the time but is now not, and others born somewhere that was not in Germany at the time but now is. Find them all.

11) Three people shared a Nobel prize "for their researches on semiconductors and their discovery of the transistor effect". We can filter on "transistor" as a substring of a laureate's "prizes.motivation" field value to find these laureates.