# Setup

In [18]:
MONGODB_START_FROM_SCRATCH = True
DOCKER_INTERNAL_HOST = "host.docker.internal"
DOCKER_DNS = ["10.15.20.1"]

MONGODB_REPLICA_SET = "replica_set_0"
MONGODB_TOTAL_NODES = 3

MONGODB_NODE_IPS = ["10.15.20.2"] * MONGODB_TOTAL_NODES
MONGODB_NODE_NAMES = [f"mongodb-node-{i + 1}" for i in range(MONGODB_TOTAL_NODES)]
MONGODB_NODE_HOSTNAMES = [
    f"{MONGODB_NODE_NAMES[i]}.mavasbel.vpn.itam.mx" for i in range(MONGODB_TOTAL_NODES)
]
MONGODB_NODE_PORTS = [27010 + (i + 1) for i in range(0, MONGODB_TOTAL_NODES)]

MONGODB_WORKDIR = "/data/db"

MONGO_INITDB_ROOT_USERNAME = "admin"
MONGO_INITDB_ROOT_PASSWORD = "admin"
MONGO_INITDB_DATABASE = "admin"

In [19]:
import os
from pathlib import Path

LOCALHOST_WORKDIR = f"{os.path.join(os.path.abspath(Path.cwd()))}"
DOCKER_MOUNTDIR = os.path.join(LOCALHOST_WORKDIR, "mount")
MONGODB_LOCAL_CLUSTER_KEY_PATH = os.path.join(DOCKER_MOUNTDIR, "mongo-keyfile")

mount_path = Path(DOCKER_MOUNTDIR)
mount_path.mkdir(parents=True, exist_ok=True)

### Create session

In [20]:
from pymongo import MongoClient

nodes_ports = [
    f"{MONGODB_NODE_HOSTNAMES[i]}:{MONGODB_NODE_PORTS[i]}"
    for i in range(MONGODB_TOTAL_NODES)
]
connection_string = (
    f"mongodb://{MONGO_INITDB_ROOT_USERNAME}:{MONGO_INITDB_ROOT_PASSWORD}@"
    f"{','.join(nodes_ports)}/"
    f"?replicaSet={MONGODB_REPLICA_SET}&authSource=admin&w=majority"
)
print(f"Connectoin URL: {connection_string}")

client = MongoClient(connection_string)

db = client["db"]
users_collection = db["users"]

Connectoin URL: mongodb://admin:admin@mongodb-node-1.mavasbel.vpn.itam.mx:27011,mongodb-node-2.mavasbel.vpn.itam.mx:27012,mongodb-node-3.mavasbel.vpn.itam.mx:27013/?replicaSet=replica_set_0&authSource=admin&w=majority


### Insert

In [21]:
from faker import Faker

fake = Faker()

In [22]:
# %%timeit -n 2 -r 2
# -n 1: run only 2 loop
# -r 1: repeat only 2 time

import random

print("Generating batch...")

users_batch = [
    {
        "name": (
            fake.unique.name() if random.random() > 0.5 else fake.unique.name().upper()
        ),
        "email": fake.ascii_free_email(),
        "profile": {
            "job": fake.job(),
            "company": fake.company(),
            "location": {
                "lat": float(fake.latitude()),
                "lng": float(fake.longitude()),
            },
        },
        "tags": [fake.word() for _ in range(random.randint(2, 5))],
        "login_count": random.randint(1, 1000),
        "last_login": fake.date_time_this_year().isoformat(),
        "active": fake.boolean(chance_of_getting_true=75),
    }
    for _ in range(10000)
]
print("Inserting batch...")
users_collection.insert_many(users_batch)

Generating batch...
Inserting batch...


InsertManyResult([ObjectId('6961fb0179be8ff2003d2a85'), ObjectId('6961fb0179be8ff2003d2a86'), ObjectId('6961fb0179be8ff2003d2a87'), ObjectId('6961fb0179be8ff2003d2a88'), ObjectId('6961fb0179be8ff2003d2a89'), ObjectId('6961fb0179be8ff2003d2a8a'), ObjectId('6961fb0179be8ff2003d2a8b'), ObjectId('6961fb0179be8ff2003d2a8c'), ObjectId('6961fb0179be8ff2003d2a8d'), ObjectId('6961fb0179be8ff2003d2a8e'), ObjectId('6961fb0179be8ff2003d2a8f'), ObjectId('6961fb0179be8ff2003d2a90'), ObjectId('6961fb0179be8ff2003d2a91'), ObjectId('6961fb0179be8ff2003d2a92'), ObjectId('6961fb0179be8ff2003d2a93'), ObjectId('6961fb0179be8ff2003d2a94'), ObjectId('6961fb0179be8ff2003d2a95'), ObjectId('6961fb0179be8ff2003d2a96'), ObjectId('6961fb0179be8ff2003d2a97'), ObjectId('6961fb0179be8ff2003d2a98'), ObjectId('6961fb0179be8ff2003d2a99'), ObjectId('6961fb0179be8ff2003d2a9a'), ObjectId('6961fb0179be8ff2003d2a9b'), ObjectId('6961fb0179be8ff2003d2a9c'), ObjectId('6961fb0179be8ff2003d2a9d'), ObjectId('6961fb0179be8ff2003d2a

### Query

In [23]:
query = {"active": True, "login_count": {"$gt": 500}}
results = users_collection.find(query)
print(f"Found {users_collection.count_documents(query)} highly active users.")

Found 3763 highly active users.


In [24]:
projection = {"name": 1, "email": 1, "profile.job": 1, "_id": 0}
cursor = users_collection.find({"tags": "work"}, projection).limit(100)
for user in cursor:
    print(user)

{'name': 'ROBERT SWANSON', 'email': 'cameron08@yahoo.com', 'profile': {'job': 'Conservation officer, nature'}}
{'name': 'ROBERT CROSS', 'email': 'sarah14@hotmail.com', 'profile': {'job': 'Translator'}}
{'name': 'CARRIE LOPEZ', 'email': 'johnvang@gmail.com', 'profile': {'job': 'Engineer, drilling'}}
{'name': 'MIGUEL BELL', 'email': 'francisjoseph@hotmail.com', 'profile': {'job': 'Training and development officer'}}
{'name': 'Nicole Downs', 'email': 'anthonyjill@yahoo.com', 'profile': {'job': 'Patent examiner'}}
{'name': 'Robert Miller', 'email': 'hernandezmonica@hotmail.com', 'profile': {'job': 'Community development worker'}}
{'name': 'Chelsea Rivera', 'email': 'mercedes42@gmail.com', 'profile': {'job': 'Best boy'}}
{'name': 'Sheila Garcia', 'email': 'michele70@gmail.com', 'profile': {'job': 'Facilities manager'}}
{'name': 'SARAH WILLIAMS', 'email': 'turnerbobby@gmail.com', 'profile': {'job': 'Surveyor, mining'}}
{'name': 'KAYLA MORALES', 'email': 'kjefferson@yahoo.com', 'profile': {'j

In [25]:
pipeline = [
    {"$match": {"active": True}},  # Stage 1: Filter only active users
    {  # Stage 2: Group by the nested 'job' field
        "$group": {
            "_id": "$profile.job",
            "avg_logins": {"$avg": "$login_count"},
            "user_count": {"$sum": 1},
        }
    },
    {"$sort": {"avg_logins": -1}},  # Stage 3: Sort by average logins descending
    {
        "$project": {
            "_id": 0,  # Hide the original _id
            "job_title": "$_id",  # Rename _id to job_title
            "stats": {  # Create a nested object for stats
                "average": "$avg_logins",
                "total_users": "$user_count",
            },
        }
    },
    {"$limit": 100},  # Stage 4: Limit to top 100 most active professions
]
results = list(users_collection.aggregate(pipeline))
for res in results:
    print(res)

{'job_title': 'Comptroller', 'stats': {'average': 845.5714285714286, 'total_users': 7}}
{'job_title': 'IT trainer', 'stats': {'average': 781.375, 'total_users': 8}}
{'job_title': 'Musician', 'stats': {'average': 771.7647058823529, 'total_users': 17}}
{'job_title': 'Engineer, petroleum', 'stats': {'average': 765.6, 'total_users': 5}}
{'job_title': 'Minerals surveyor', 'stats': {'average': 747.8888888888889, 'total_users': 9}}
{'job_title': 'Futures trader', 'stats': {'average': 736.6363636363636, 'total_users': 11}}
{'job_title': 'Personal assistant', 'stats': {'average': 733.2941176470588, 'total_users': 17}}
{'job_title': 'Human resources officer', 'stats': {'average': 733.2857142857143, 'total_users': 14}}
{'job_title': 'Building control surveyor', 'stats': {'average': 731.1111111111111, 'total_users': 18}}
{'job_title': 'Leisure centre manager', 'stats': {'average': 728.1428571428571, 'total_users': 7}}
{'job_title': 'Health promotion specialist', 'stats': {'average': 726.625, 'tota

In [26]:
northern_users = users_collection.count_documents({"profile.location.lat": {"$gt": 0}})
print(f"Users in Northern Hemisphere: {northern_users}")

Users in Northern Hemisphere: 5121


In [27]:
# Standard Sort (Z-A-a-z) vs. Collation Sort (A-a-B-b...)
cursor = users_collection.find({}).sort("name", 1).collation({"locale": "en", "strength": 2}).limit(100)

for user in cursor:
    print(user["name"])

Aaron Allison
Aaron Baker
Aaron Brown
AARON BUSH
AARON ELLIS
AARON FRAZIER
AARON HAYES
AARON JONES
AARON KERR
AARON LUCAS
Aaron Mcdaniel
Aaron Miller
Aaron Murphy
Aaron Norris
Aaron Olson
AARON REED
Aaron Richards
AARON RODRIGUEZ
AARON SHARP
AARON SMITH
AARON WEAVER
Aaron White
Aaron Young
Abigail Adams
ABIGAIL GARCIA
Abigail Henry
ABIGAIL JOHNSON
Abigail King
ABIGAIL MARTIN
Abigail Matthews
Abigail Rodriguez
Abigail Williams
Adam Alexander
ADAM ALLEN
ADAM AYALA
Adam Ball
ADAM BARRETT
Adam Brown
ADAM CARDENAS
ADAM CARLSON
ADAM CHAMBERS
Adam Collins
ADAM FARMER
Adam Gallagher
ADAM GARRETT
ADAM GOMEZ
ADAM GREEN
ADAM GRIMES
Adam Hayes
ADAM HOWARD
ADAM JOHNSON
ADAM JOSEPH
ADAM KELLER
ADAM KELLY
Adam Lynch
Adam Lyons
ADAM MORRIS
ADAM MORTON
ADAM MUELLER
ADAM NELSON
Adam Norman
Adam Phelps
ADAM PHILLIPS
Adam Ramos
ADAM ROACH MD
ADAM SCHMIDT
ADAM SCOTT
ADAM SMITH
ADAM STEWART DDS
Adam Sullivan
Adam Torres
Adam Townsend
ADAM TUCKER
ADAM WARE
ADAM WATSON
ADAM WILSON
ADRIAN COLEMAN
Adrian Green


### Update

In [28]:
# 1. Get a single user to test with
target_user = users_collection.find_one({"active": True})
user_id = target_user["_id"]
initial_logins = target_user.get("login_count", 0)

print(f"User: {target_user['name']}")
print(f"Initial login count: {initial_logins}")

# 2. Increment the login counter for JUST this user
users_collection.update_one(
    {"_id": user_id}, 
    {"$inc": {"login_count": 1}}
)

# 3. Query again to see the change
updated_user = users_collection.find_one({"_id": user_id})
new_logins = updated_user.get("login_count", 0)

print(f"Updated login count: {new_logins}")
print(f"Change confirmed: {new_logins == initial_logins + 1}")

User: ANTHONY SWANSON
Initial login count: 551
Updated login count: 552
Change confirmed: True


In [29]:
from pymongo import ReturnDocument

# This performs the update and returns the NEW version of the document immediately
updated_doc = users_collection.find_one_and_update(
    {"_id": user_id},
    {"$inc": {"login_count": 1}},
    return_document=ReturnDocument.AFTER
)

print(f"New count from single-step operation: {updated_doc['login_count']}")

New count from single-step operation: 553


In [30]:
query = {"profile.job": {"$regex": ".*engineer.*", "$options": "i"}}
update = {"$set": {"is_technical": True}}
result = users_collection.update_many(query, update)
print(f"Updated {result.modified_count} engineers.")

Updated 929 engineers.


In [31]:
query = {"email": "example@user.com"}
new_values = {"$set": {"active": False}}
users_collection.update_one(query, new_values)

UpdateResult({'n': 0, 'electionId': ObjectId('7fffffff0000000000000001'), 'opTime': {'ts': Timestamp(1768028929, 10933), 't': 1}, 'nModified': 0, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1768028929, 10933), 'signature': {'hash': b'\x06\x0fg\\\x15\x05\xee\xb5\x96\x05\xbb\xd4\xb0\x02Q\xda\xcb\xf8\x1f\xda', 'keyId': 7593626394077167621}}, 'operationTime': Timestamp(1768028929, 10933), 'updatedExisting': False}, acknowledged=True)

### Delete

In [32]:
delete_result = users_collection.delete_many({})
print(f"Deleted {delete_result.deleted_count} documents.")

Deleted 10000 documents.


In [33]:
db.drop_collection(users_collection)
print("Deleted users collection.")

Deleted users collection.
