# Setup

In [2]:
MONGODB_START_FROM_SCRATCH = True
DOCKER_INTERNAL_HOST = "host.docker.internal"
DOCKER_DNS = ["10.15.20.1"]

MONGODB_REPLICA_SET = "replica_set_0"
MONGODB_TOTAL_NODES = 3

MONGODB_NODE_IPS = ["10.15.20.2"] * MONGODB_TOTAL_NODES
MONGODB_NODE_NAMES = [f"mongodb-node-{i + 1}" for i in range(MONGODB_TOTAL_NODES)]
MONGODB_NODE_HOSTNAMES = [
    f"{MONGODB_NODE_NAMES[i]}.mavasbel.vpn.itam.mx" for i in range(MONGODB_TOTAL_NODES)
]
MONGODB_NODE_PORTS = [27010 + (i + 1) for i in range(0, MONGODB_TOTAL_NODES)]

MONGODB_WORKDIR = "/data/db"

MONGO_INITDB_ROOT_USERNAME = "admin"
MONGO_INITDB_ROOT_PASSWORD = "admin"
MONGO_INITDB_DATABASE = "admin"

In [3]:
import os
from pathlib import Path

LOCALHOST_WORKDIR = f"{os.path.join(os.path.relpath(Path.cwd()))}"
DOCKER_MOUNTDIR = os.path.join(LOCALHOST_WORKDIR, "mount")
MONGODB_LOCAL_CLUSTER_KEY_PATH = os.path.join(DOCKER_MOUNTDIR, "mongo-keyfile")

mount_path = Path(DOCKER_MOUNTDIR)
mount_path.mkdir(parents=True, exist_ok=True)

### Create session

In [4]:
from pymongo import MongoClient

nodes_ports = [
    f"{MONGODB_NODE_HOSTNAMES[i]}:{MONGODB_NODE_PORTS[i]}"
    for i in range(MONGODB_TOTAL_NODES)
]
connection_string = (
    f"mongodb://{MONGO_INITDB_ROOT_USERNAME}:{MONGO_INITDB_ROOT_PASSWORD}@"
    f"{','.join(nodes_ports)}/"
    f"?replicaSet={MONGODB_REPLICA_SET}&authSource=admin&w=majority"
)
print(f"Connectoin URL: {connection_string}")

client = MongoClient(connection_string)

db = client["db"]
users_collection = db["users"]

Connectoin URL: mongodb://admin:admin@mongodb-node-1.mavasbel.vpn.itam.mx:27011,mongodb-node-2.mavasbel.vpn.itam.mx:27012,mongodb-node-3.mavasbel.vpn.itam.mx:27013/?replicaSet=replica_set_0&authSource=admin&w=majority


### Insert

In [5]:
from faker import Faker

fake = Faker()

In [6]:
# %%timeit -n 2 -r 2
# -n 1: run only 2 loop
# -r 1: repeat only 2 time

import random

print("Generating batch...")

users_batch = [
    {
        "name": (
            fake.unique.name() if random.random() > 0.5 else fake.unique.name().upper()
        ),
        "email": fake.ascii_free_email(),
        "profile": {
            "job": fake.job(),
            "company": fake.company(),
            "location": {
                "lat": float(fake.latitude()),
                "lng": float(fake.longitude()),
            },
        },
        "tags": [fake.word() for _ in range(random.randint(2, 5))],
        "login_count": random.randint(1, 1000),
        "last_login": fake.date_time_this_year().isoformat(),
        "active": fake.boolean(chance_of_getting_true=75),
    }
    for _ in range(10000)
]
print("Inserting batch...")
users_collection.insert_many(users_batch)

Generating batch...
Inserting batch...


InsertManyResult([ObjectId('69631d98a1ac76ec320882e5'), ObjectId('69631d98a1ac76ec320882e6'), ObjectId('69631d98a1ac76ec320882e7'), ObjectId('69631d98a1ac76ec320882e8'), ObjectId('69631d98a1ac76ec320882e9'), ObjectId('69631d98a1ac76ec320882ea'), ObjectId('69631d98a1ac76ec320882eb'), ObjectId('69631d98a1ac76ec320882ec'), ObjectId('69631d98a1ac76ec320882ed'), ObjectId('69631d98a1ac76ec320882ee'), ObjectId('69631d98a1ac76ec320882ef'), ObjectId('69631d98a1ac76ec320882f0'), ObjectId('69631d98a1ac76ec320882f1'), ObjectId('69631d98a1ac76ec320882f2'), ObjectId('69631d98a1ac76ec320882f3'), ObjectId('69631d98a1ac76ec320882f4'), ObjectId('69631d98a1ac76ec320882f5'), ObjectId('69631d98a1ac76ec320882f6'), ObjectId('69631d98a1ac76ec320882f7'), ObjectId('69631d98a1ac76ec320882f8'), ObjectId('69631d98a1ac76ec320882f9'), ObjectId('69631d98a1ac76ec320882fa'), ObjectId('69631d98a1ac76ec320882fb'), ObjectId('69631d98a1ac76ec320882fc'), ObjectId('69631d98a1ac76ec320882fd'), ObjectId('69631d98a1ac76ec320882

### Query

In [7]:
query = {"active": True, "login_count": {"$gt": 500}}
results = users_collection.find(query)
print(f"Found {users_collection.count_documents(query)} highly active users.")

Found 3780 highly active users.


In [8]:
projection = {"name": 1, "email": 1, "profile.job": 1, "_id": 0}
cursor = users_collection.find({"tags": "work"}, projection).limit(100)
for user in cursor:
    print(user)

{'name': 'JOSHUA FITZGERALD', 'email': 'tinahogan@hotmail.com', 'profile': {'job': 'Nutritional therapist'}}
{'name': 'Shawn Kelley', 'email': 'jimmy72@gmail.com', 'profile': {'job': 'Music tutor'}}
{'name': 'Jennifer Washington', 'email': 'latoyapeterson@gmail.com', 'profile': {'job': 'Psychiatrist'}}
{'name': 'TARA RUIZ', 'email': 'nweaver@yahoo.com', 'profile': {'job': 'Surveyor, building control'}}
{'name': 'Travis Lamb', 'email': 'gardnerjay@gmail.com', 'profile': {'job': 'Senior tax professional/tax inspector'}}
{'name': 'Jordan Johnson', 'email': 'gonzalezvernon@hotmail.com', 'profile': {'job': 'Industrial/product designer'}}
{'name': 'FRANCES LOWE', 'email': 'paulaarmstrong@yahoo.com', 'profile': {'job': 'Magazine features editor'}}
{'name': 'TRACY STEWART DVM', 'email': 'lucasrachel@gmail.com', 'profile': {'job': 'Transport planner'}}
{'name': 'PAUL NELSON', 'email': 'lreyes@hotmail.com', 'profile': {'job': 'Health service manager'}}
{'name': 'Roy Johnson', 'email': 'rileyjona

In [9]:
pipeline = [
    {"$match": {"active": True}},  # Stage 1: Filter only active users
    {  # Stage 2: Group by the nested 'job' field
        "$group": {
            "_id": "$profile.job",
            "avg_logins": {"$avg": "$login_count"},
            "user_count": {"$sum": 1},
        }
    },
    {"$sort": {"avg_logins": -1}},  # Stage 3: Sort by average logins descending
    {
        "$project": {
            "_id": 0,  # Hide the original _id
            "job_title": "$_id",  # Rename _id to job_title
            "stats": {  # Create a nested object for stats
                "average": "$avg_logins",
                "total_users": "$user_count",
            },
        }
    },
    {"$limit": 100},  # Stage 4: Limit to top 100 most active professions
]
results = list(users_collection.aggregate(pipeline))
for res in results:
    print(res)

{'job_title': 'Systems developer', 'stats': {'average': 834.5, 'total_users': 4}}
{'job_title': 'Dance movement psychotherapist', 'stats': {'average': 755.9090909090909, 'total_users': 11}}
{'job_title': 'Pharmacist, hospital', 'stats': {'average': 738.2727272727273, 'total_users': 11}}
{'job_title': 'Legal secretary', 'stats': {'average': 734.4545454545455, 'total_users': 11}}
{'job_title': 'Scientist, product/process development', 'stats': {'average': 703.625, 'total_users': 8}}
{'job_title': 'Administrator, education', 'stats': {'average': 703.4285714285714, 'total_users': 7}}
{'job_title': 'Administrator, Civil Service', 'stats': {'average': 702.5555555555555, 'total_users': 9}}
{'job_title': 'Conservation officer, nature', 'stats': {'average': 695.0769230769231, 'total_users': 13}}
{'job_title': 'Youth worker', 'stats': {'average': 682.6666666666666, 'total_users': 9}}
{'job_title': 'Development worker, community', 'stats': {'average': 674.4666666666667, 'total_users': 15}}
{'job_

In [10]:
northern_users = users_collection.count_documents({"profile.location.lat": {"$gt": 0}})
print(f"Users in Northern Hemisphere: {northern_users}")

Users in Northern Hemisphere: 4985


In [11]:
# Standard Sort (Z-A-a-z) vs. Collation Sort (A-a-B-b...)
cursor = users_collection.find({}).sort("name", 1).collation({"locale": "en", "strength": 2}).limit(100)

for user in cursor:
    print(user["name"])

AARON ALLEN
AARON ANDERSON
Aaron Austin
Aaron Bond
AARON BROWN
Aaron Carter
Aaron Clark
AARON COMBS
Aaron David
Aaron Drake
Aaron Dunn
Aaron Edwards
AARON ELLIOTT
Aaron Figueroa
Aaron Garcia
AARON GORDON
Aaron Green
Aaron Jackson
Aaron Kennedy
Aaron Kim
AARON KING
AARON MARSHALL
Aaron Miller
Aaron Phillips
AARON RICHARDSON
AARON ROY
AARON SANCHEZ
Aaron Smith
AARON SMITH MD
AARON TORRES
AARON WARNER
Abigail Bailey
Abigail Cohen
Abigail Freeman
ABIGAIL GARCIA
ABIGAIL HAYS
ABIGAIL RICHARDS
Abigail Shelton
Abigail Spears
ADAM ANDERSON
Adam Dean
Adam Evans
Adam Farmer
Adam Freeman
Adam Gill
ADAM HAYES
ADAM HERNANDEZ
Adam Hughes
ADAM JONES
ADAM LARSON
Adam Lutz
Adam Martinez
ADAM MORGAN
Adam Newton
Adam Parker
ADAM PORTER
Adam Roberts
ADAM ROSE
ADAM SMITH
ADAM TAYLOR
ADAM VELAZQUEZ
ADAM WASHINGTON
ADAM WEST
ADAM WILLIAMS
ADAM WILLIAMSON
ADAM WILLIS
ADAM WILSON
Adam Wong
Adam Woods
Adrian Adams
Adrian Avila
Adrian Dominguez
Adrian Jones
Adrian Jordan
Adrian Larson
Adrian Prince
ADRIAN WASHING

### Update

In [12]:
# 1. Get a single user to test with
target_user = users_collection.find_one({"active": True})
user_id = target_user["_id"]
initial_logins = target_user.get("login_count", 0)

print(f"User: {target_user['name']}")
print(f"Initial login count: {initial_logins}")

# 2. Increment the login counter for JUST this user
users_collection.update_one(
    {"_id": user_id}, 
    {"$inc": {"login_count": 1}}
)

# 3. Query again to see the change
updated_user = users_collection.find_one({"_id": user_id})
new_logins = updated_user.get("login_count", 0)

print(f"Updated login count: {new_logins}")
print(f"Change confirmed: {new_logins == initial_logins + 1}")

User: Kelsey Lindsey
Initial login count: 637
Updated login count: 638
Change confirmed: True


In [13]:
from pymongo import ReturnDocument

# This performs the update and returns the NEW version of the document immediately
updated_doc = users_collection.find_one_and_update(
    {"_id": user_id},
    {"$inc": {"login_count": 1}},
    return_document=ReturnDocument.AFTER
)

print(f"New count from single-step operation: {updated_doc['login_count']}")

New count from single-step operation: 639


In [14]:
query = {"profile.job": {"$regex": ".*engineer.*", "$options": "i"}}
update = {"$set": {"is_technical": True}}
result = users_collection.update_many(query, update)
print(f"Updated {result.modified_count} engineers.")

Updated 976 engineers.


In [15]:
query = {"email": "example@user.com"}
new_values = {"$set": {"active": False}}
users_collection.update_one(query, new_values)

UpdateResult({'n': 0, 'electionId': ObjectId('7fffffff0000000000000001'), 'opTime': {'ts': Timestamp(1768103321, 4067), 't': 1}, 'nModified': 0, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1768103321, 4067), 'signature': {'hash': b'\x9f\xa8\xbb.\x0f\x87\xeb\xe9\xe5\x17e\xe4\xe3:G\xee\xfb*|C', 'keyId': 7593945385593208838}}, 'operationTime': Timestamp(1768103321, 4067), 'updatedExisting': False}, acknowledged=True)

### Delete

In [16]:
delete_result = users_collection.delete_many({})
print(f"Deleted {delete_result.deleted_count} documents.")

Deleted 10000 documents.


In [17]:
db.drop_collection(users_collection)
print("Deleted users collection.")

Deleted users collection.
