MongoDB Lab: Airbnb Dataset

The sample_airbnb.listingsAndReviews collection contains Airbnb property listings with property details, location data, pricing, host information, reviews and ratings, and amenities.

In [3]:
from pymongo import MongoClient
import pandas as pd

In [5]:
# Open a client against a MongoDB server running locally; substitute your own
# connection details as needed. For an Atlas cluster, use this form instead:
# client = MongoClient("mongodb+srv://username:password@cluster.mongodb.net/")
client = MongoClient("mongodb://localhost:27017/")

# Handles to the database and collection that the cells below query.
db = client["sample_supplies"]
collection = db["sales"]

# Sanity check: an empty filter matches every document, so this round-trip to
# the server confirms the connection works and reports the collection size.
# NOTE(review): the label says "listings" but the collection is
# sample_supplies.sales -- confirm which of the two is intended.
document_count = collection.count_documents({})
print(f"Total listings: {document_count}")

Total listings: 5000


In [None]:
# Expression that renders each sale's timestamp as a "YYYY-MM-DD" string.
sale_day = {"$dateToString": {"format": "%Y-%m-%d", "date": "$saleDate"}}

# Stage 1: unwind the "items" array so every item becomes its own document.
unwind_items = {"$unwind": "$items"}

# Stage 2: group by (date, product name) and add up the quantities.
group_by_day_and_product = {
    "$group": {
        "_id": {"date": sale_day, "product": "$items.name"},
        "total_quantity": {"$sum": "$items.quantity"},
    }
}

# Stage 3: sort by date first, then by product name (both ascending).
sort_by_day_and_product = {"$sort": {"_id.date": 1, "_id.product": 1}}

pipeline = [unwind_items, group_by_day_and_product, sort_by_day_and_product]

results = list(collection.aggregate(pipeline))

# Each result carries its grouping key as a nested dict under "_id"; expand
# that dict into flat "date" / "product" columns and discard the original.
df = pd.DataFrame(results)
id_columns = pd.json_normalize(df["_id"])
df = df.join(id_columns).drop(columns=["_id"])

Unnamed: 0,total_quantity,date,product
0,5,2013-01-01,backpack
1,29,2013-01-01,binder
2,14,2013-01-01,envelopes
3,6,2013-01-01,laptop
4,4,2013-01-01,notepad
...,...,...,...
10131,26,2017-12-31,binder
10132,33,2017-12-31,envelopes
10133,17,2017-12-31,notepad
10134,20,2017-12-31,pens


In [None]:
# Expression that renders each sale's timestamp as a "YYYY-MM-DD" string.
sale_day = {"$dateToString": {"format": "%Y-%m-%d", "date": "$saleDate"}}

# Revenue contributed by one unwound item: its price times its quantity.
item_revenue = {"$multiply": ["$items.price", "$items.quantity"]}

pipeline = [
    # One document per entry of the "items" array.
    {"$unwind": "$items"},
    # Sum the per-item revenue for each day.
    {"$group": {"_id": sale_day, "total_revenue": {"$sum": item_revenue}}},
    # Ascending by the date string held in "_id".
    {"$sort": {"_id": 1}},
]

result = list(collection.aggregate(pipeline))

# The grouping key comes back as "_id"; expose it under the name "date".
df1 = pd.DataFrame(result).rename(columns={"_id": "date"})