In [23]:
%load_ext autoreload
%autoreload 2
from pymongo import MongoClient
import sys
from pathlib import Path
from tqdm import tqdm
import json
from bson import ObjectId

sys.path.append(str(Path("..").resolve()))
from src import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Seeding Missing Collections and Entries
ℹ️ This notebook requires that [`1.1-Migrating_MySQL_To_MongoDB`](1.1-Migrating_MySQL_To_MongoDB.ipynb) has been executed first. Alternatively, you can load the snapshot as shown in the following cell.

## Load post-preprocessing data
The following cell resets the `omero_museum` database by loading the `2_migration_preprocessed` snapshot.

In [24]:
%%capture
MongoClient("mongodb://localhost:27017/").drop_database("omero_museum")
!mongorestore --host localhost:27017 --drop --db omero_museum  "../backup/2_migration_preprocessed/omero_museum"

In [25]:
# Name of the database every cell below reads from and writes to.
DB_NAME = "omero_museum"

# Instantiating the connector also prints the collection overview shown below.
connector = MongoDBConnector(DB_NAME)
db = connector.db

The collections of the [1m[33momero_museum[0m db are:
----------------------------------------
[activities]:
[artworks]: [1m[31m_id[0m [1m[33mdate[0m [1m[32mdescription[0m [1m[36mroom[0m [1m[34mtype[0m
[authors]: [1m[31m_id[0m [1m[33mbirth_date[0m [1m[32mgender[0m [1m[36mhome_town[0m [1m[34mname[0m [1m[35msurname[0m
[departments]: [1m[31m_id[0m [1m[33mfloor[0m [1m[32mfree_spots[0m [1m[36mroom[0m
[limited_events]:
[messages]:
[roles]: [1m[31m_id[0m [1m[33mbirth_date[0m [1m[32mcurriculum[0m [1m[36mdate_start[0m [1m[34memail[0m [1m[35mgender[0m [1m[37mhometown[0m [1m[90mname[0m [1m[91mphone_number[0m [1m[93msurname[0m
[rooms]:
[suppliers]:
[surveys]: [1m[31m_id[0m [1m[33maccompanying_persons_visit[0m [1m[32mdate_of_compilation[0m [1m[36mevaluation_of_experience[0m [1m[34mevaluation_of_facility[0m [1m[35mevaluation_of_visit[0m [1m[37mnumber_of_visits[0m [1m[90mreason_for_visit[0m [1m[91m

## Seeding DB


### 1. Defining documents' IDs

In [26]:
def _existing_ids(collection):
    """Return the `_id` of every document currently stored in `collection`."""
    return [document["_id"] for document in collection.find({})]


def _new_object_ids(count):
    """Return `count` freshly generated `ObjectId` values."""
    return [ObjectId() for _ in range(count)]


# Identifier pools, keyed by collection name.
# Three kinds of pools live here:
#   * ids read back from collections already present in the database,
#   * freshly generated ObjectIds for collections seeded from scratch,
#   * human-readable names used directly as `_id`.
# The seeding loop further down uses `len(ids[<collection>])` as the number of
# documents to create for each seeded collection.
ids = {
    # --- ids of documents already present in the database -----------------
    "authors": _existing_ids(db.authors),
    "workshops": _existing_ids(db.workshops),
    "departments": _existing_ids(db.departments),
    # --- newly generated ObjectIds ----------------------------------------
    "surveys": _new_object_ids(100),
    "visitors": _new_object_ids(100),
    "tickets": _new_object_ids(500),
    "activities": _new_object_ids(100),
    "messages": _new_object_ids(1000),
    "suppliers": _new_object_ids(100),
    "roles": _new_object_ids(100),
    "chats": _new_object_ids(100),
    "trades": _new_object_ids(100),
    "comments": _new_object_ids(1000),
    # --- natural keys: the name/title itself is the `_id` -------------------
    "rooms": [
        "Greco e Romano", "Medievale e 400", "Ancona",
        "Rinascimentale", "Contemporaneo", "Impressionismo",
    ],
    "artworks": [
        "Mona Lisa", "The Starry Night",
        "The Persistence of Memory", "The Birth of Venus",
        "The Night Watch", "Girl with a Pearl Earring",
        "Guernica", "American Gothic",
        "The Scream", "Les Demoiselles d'Avignon",
        "The Kiss", "The Last Supper",
        "Liberty Leading the People", "The Great Wave off Kanagawa",
        "Nighthawks", "Campbell's Soup Cans",
        "Impression, Sunrise", "No. 5, 1948",
        "A Sunday Afternoon on the Island of La Grande Jatte",
        "The School of Athens",
    ],
    "limited_events": [
        "Digital Preservation of Cultural Heritage",
        "Museum Education in the 21st Century",
        "Community Engagement and Inclusion",
        "Sustainability in Museum Practices",
        "Virtual and Augmented Reality in Exhibitions",
        "Ethics of Artifact Repatriation",
        "Artificial Intelligence in Collections Management",
        "Storytelling Through Curatorial Design",
        "Accessibility and Universal Design in Museums",
        "Cross-Cultural Dialogue Through Exhibits",
        "The Future of Archaeological Conservation",
        "Museums as Spaces for Social Justice",
        "Data-Driven Visitor Experience",
        "Collaborative Curation with Indigenous Communities",
        "The Role of Museums in Climate Change Awareness",
        "Gamification of Museum Learning",
        "Blockchain for Provenance Tracking",
        "Digital Twins of Artifacts",
        "Museums and Mental Health",
        "Hybrid Exhibitions: Physical Meets Digital",
    ],
}

### 2. Defining constants

In [27]:
# Candidate values for the optional `theme` field of limited events.
themes = [
    "Preservation", "Education", "Engagement", "Sustainability",
    "Virtuality", "Repatriation", "AI", "Storytelling",
    "Accessibility", "Dialogue", "Conservation", "Justice",
    "Data", "Collaboration", "Climate", "Gamification",
    "Blockchain", "DigitalTwins", "Wellbeing", "Hybridization",
]

# Data read back from the restored database.
# Surveys are loaded without their `_id` so they can be embedded in visitors;
# artwork descriptions are collected for reuse by the artwork seeder.
surveys = list(db.surveys.find({}, {"_id": 0}))
artwork_descriptions = [artwork["description"] for artwork in db.artworks.find({})]

# Value pools for the categorical artwork fields.
artwork_types = ["cast", "statue", "mechanical", "painting", "picture", "relief"]
artwork_periods = [
    "Prehistoric", "Ancient", "Classical", "Medieval",
    "Renaissance", "Baroque", "Industrial", "Modern",
]
artwork_techniques = [
    "OilPainting", "Fresco", "Sculpture", "Engraving", "Mosaic",
    "Watercolor", "Etching", "Photography", "Installation", "DigitalArt",
]
artwork_materials = [
    "Marble", "Bronze", "Wood", "Canvas", "Paper",
    "Clay", "Glass", "Stone", "Textile", "Steel",
]

### 3. Seed Aggregates

In [None]:
# Aggregates of aggregates
# Message sub-documents (no `_id`) that are later embedded in the
# `new_messages` field of every chat aggregate.
messages_seeder = DocSeeder(
    {
        "customer_id": EntrySeeder(ids["visitors"]),
        "sent_date": EntrySeeder("past_datetime"),
        "type": EntrySeeder(["text", "audio", "picture"]),
        "content": EntrySeeder("text"),
        "delivery_status": EntrySeeder(["sending", "sent", "received", "seen"]),
        "length": EntrySeeder(range(10, 300, 5), p=0.25),
        "uri": EntrySeeder("url", p=0.35),
    }
)
# Materialise the seeded documents, exactly as done for the other aggregates:
# the chat seeder samples from `messages` once per chat (with `unique=True`),
# which needs a sized, re-usable sequence rather than a one-shot iterable.
messages = list(messages_seeder.seed(500))

In [29]:
# Aggregates: sub-documents that are embedded in the top-level collections.
comment_seeder = DocSeeder(
    {
        "_id": EntrySeeder(ids["comments"], unique=True),
        "rating": EntrySeeder(range(1, 6)),
        "date": EntrySeeder("past_datetime"),
        "message": EntrySeeder("text"),
    }
)

trade_seeder = DocSeeder(
    {
        "_id": EntrySeeder(ids["trades"], unique=True),
        # NOTE(review): start and end dates are drawn independently of each other.
        "date_start": EntrySeeder("past_datetime"),
        "date_end": EntrySeeder("past_datetime"),
        "win_price": EntrySeeder(range(1000, 1000000, 10)),
        "winner_id": EntrySeeder(ids["visitors"]),
        "price_max": EntrySeeder(range(1000, 1000000, 10)),
        "price_min": EntrySeeder(range(0, 5000, 5)),
        "price_start": EntrySeeder(range(0, 1000, 5)),
        "min_increase": EntrySeeder(range(0, 100, 5)),
        "is_auction": EntrySeeder("boolean"),
    }
)

ticket_seeder = DocSeeder(
    {
        "_id": EntrySeeder(ids["tickets"], unique=True),
        # 0, 4.5 and 6 are listed twice in the price pool.
        "price": EntrySeeder([0, 4.5, 6, 0, 4.5, 6, 12, 70]),
        "date": EntrySeeder("past_datetime"),
        "check_in": EntrySeeder("boolean"),
    }
)

chat_seeder = DocSeeder(
    {
        "_id": EntrySeeder(ids["chats"], unique=True),
        "date_creation": EntrySeeder("past_datetime"),
        # Ten distinct embedded message documents per chat.
        "new_messages": EntrySeeder(
            lambda _: _.random_elements(messages, length=10, unique=True)
        ),
        # Between 1 and 20 distinct message ids; the field carries p=0.85.
        "old_messages": EntrySeeder(
            lambda _: _.random_elements(
                ids["messages"],
                length=_.random_int(min=1, max=20),
                unique=True,
            ),
            p=0.85,
        ),
    }
)

shift_seeder = DocSeeder(
    {
        "time_start": EntrySeeder("time"),
        "time_end": EntrySeeder("time"),
        "day_of_week": EntrySeeder("day_of_week"),
    }
)

aggregate_seeders = {
    "comments": comment_seeder,
    "trades": trade_seeder,
    "tickets": ticket_seeder,
    "chats": chat_seeder,
    "shifts": shift_seeder,
}

# Number of documents to generate per aggregate; seeding runs in this order.
aggregate_counts = {
    "comments": 500,
    "trades": 100,
    "tickets": 500,
    "chats": 100,
    "shifts": 100,
}
seeded_aggregates = {
    name: list(aggregate_seeders[name].seed(count))
    for name, count in aggregate_counts.items()
}

comments = seeded_aggregates["comments"]
trades = seeded_aggregates["trades"]
tickets = seeded_aggregates["tickets"]
chats = seeded_aggregates["chats"]
shifts = seeded_aggregates["shifts"]

### 4. Seed Collections

In [30]:
# Group the embedded comments by their "rating" so that every
# `comments_star_<n>` field of an artwork only receives comments rated <n>
# (sampling from the full pool would put e.g. 5-star comments in star_1).
comments_by_rating = {
    stars: [comment for comment in comments if comment.get("rating") == stars]
    for stars in range(1, 6)
}


def _star_comments_seeder(stars):
    """Build the seeder for the `comments_star_<stars>` field of an artwork.

    The seeder draws between 1 and 5 comments (repetitions allowed) from the
    comments whose rating equals `stars`, and yields an empty list if no
    comment has that rating. The field keeps the `p=0.75` setting used by the
    other optional artwork fields.
    """
    pool = comments_by_rating[stars]
    return EntrySeeder(
        lambda _: (
            _.random_elements(pool, length=_.random_int(min=1, max=5))
            if pool
            else []
        ),
        p=0.75,
    )


# One DocSeeder per collection to (re)create. The keys must match both the
# MongoDB collection names and the keys of `ids`, because the seeding loop
# uses `len(ids[<collection>])` as the number of documents to generate.
seeders = {
    "rooms": DocSeeder(
        {
            "_id": EntrySeeder(ids["rooms"], unique=True),
            "floor": EntrySeeder(range(5), p=0.6),
        }
    ),
    "activities": DocSeeder(
        {
            "_id": EntrySeeder(ids["activities"], unique=True),
            "room": EntrySeeder(ids["rooms"]),
            "duration": EntrySeeder(range(15, 600, 15)),
            "enrolled": EntrySeeder(range(20)),
            "start_date": EntrySeeder("past_datetime"),
            "capacity": EntrySeeder(range(20, 100)),
            "ticket_ids": EntrySeeder(
                lambda _: _.random_elements(
                    ids["tickets"],
                    length=_.random_int(min=1, max=15),
                    unique=True,
                ),
                p=0.75,
            ),
            "workshop_title": EntrySeeder(ids["workshops"], p=0.4),
        }
    ),
    "limited_events": DocSeeder(
        {
            "_id": EntrySeeder(ids["limited_events"], unique=True),
            "capacity": EntrySeeder(range(20, 50)),
            "type": EntrySeeder(["exhibition", "conference"]),
            "start_date": EntrySeeder("past_datetime"),
            "end_date": EntrySeeder("past_datetime"),
            "artist": EntrySeeder(range(10)),
            "theme": EntrySeeder(themes, p=0.5),
            "description": EntrySeeder("text"),
            "room_name": EntrySeeder(ids["rooms"]),
            # NOTE(review): author ids are drawn from range(15) here, whereas
            # the artworks seeder draws them from ids["authors"] — confirm
            # which source is the intended one.
            "author_ids": EntrySeeder(
                lambda _: _.random_elements(
                    range(15), length=_.random_int(min=1, max=4), unique=True
                )
            ),
            "artwork_titles": EntrySeeder(
                lambda _: _.random_elements(
                    ids["artworks"], length=_.random_int(min=3, max=10), unique=True
                ),
                p=0.8,
            ),
            "ticket_ids": EntrySeeder(
                lambda _: _.random_elements(
                    ids["tickets"], length=_.random_int(min=1, max=10), unique=True
                ),
                p=0.85,
            ),
        }
    ),
    "messages": DocSeeder(
        {
            "_id": EntrySeeder(ids["messages"], unique=True),
            "customer_id": EntrySeeder(ids["visitors"]),
            "sent_date": EntrySeeder("past_datetime"),
            "type": EntrySeeder(["text", "audio", "picture"]),
            "content": EntrySeeder("text"),
            "delivery_status": EntrySeeder(["sending", "sent", "received", "seen"]),
            "length": EntrySeeder(range(10, 300, 5), p=0.25),
            "uri": EntrySeeder("url", p=0.35),
        }
    ),
    "suppliers": DocSeeder(
        {
            "_id": EntrySeeder(ids["suppliers"], unique=True),
            "name": EntrySeeder("name"),
            "iban": EntrySeeder("iban"),
            "is_state": EntrySeeder("boolean", p=0.5),
            "is_museum": EntrySeeder("boolean"),
            "email": EntrySeeder("email"),
            "phone_num": EntrySeeder("phone_number"),
            # NOTE(review): `survey_ids` is filled with *visitor* ids, while
            # ids["surveys"] is not used by any seeder in this notebook —
            # confirm the intended source before relying on this field.
            "survey_ids": EntrySeeder(
                lambda _: _.random_elements(
                    ids["visitors"], length=_.random_int(min=2, max=10), unique=True
                ),
                p=0.5,
            ),
        }
    ),
    "artworks": DocSeeder(
        {
            "_id": EntrySeeder(ids["artworks"], unique=True),
            "date": EntrySeeder("past_datetime"),
            "type": EntrySeeder(artwork_types),
            "description": EntrySeeder(artwork_descriptions),
            "is_original": EntrySeeder("boolean"),
            "size": EntrySeeder(range(10, 500, 10)),
            "period": EntrySeeder(artwork_periods),
            "seller_id": EntrySeeder(ids["suppliers"], p=0.35),
            "donator_id": EntrySeeder(ids["visitors"], p=0.25),
            "donation_state": EntrySeeder(
                ["in_progress", "accepted", "refused"], p=0.25
            ),
            "location_name": EntrySeeder("city"),
            "author_ids": EntrySeeder(
                lambda _: _.random_elements(
                    ids["authors"], length=_.random_int(min=1, max=3)
                )
            ),
            # NOTE(review): the stored field name is spelled "tecniques"; it
            # is kept unchanged because other code may query it by this name.
            "tecniques": EntrySeeder(
                lambda _: _.random_elements(
                    artwork_techniques, length=_.random_int(min=1, max=3)
                )
            ),
            "materials": EntrySeeder(
                lambda _: _.random_elements(
                    artwork_materials, length=_.random_int(min=1, max=5)
                ),
                p=0.75,
            ),
            # comments_star_1 ... comments_star_5, each restricted to the
            # comments carrying the matching rating.
            **{
                f"comments_star_{stars}": _star_comments_seeder(stars)
                for stars in range(1, 6)
            },
            "trade": EntrySeeder(trades),
        }
    ),
    "visitors": DocSeeder(
        {
            "_id": EntrySeeder(ids["visitors"], unique=True),
            "is_customer": EntrySeeder("boolean"),
            "birth_date": EntrySeeder("past_datetime"),
            "impairment": EntrySeeder(["deaf", "blind"], p=0.75),
            "gender": EntrySeeder(["male", "female"]),
            "land": EntrySeeder("country"),
            # Embedded ticket documents (not ids).
            "tickets": EntrySeeder(
                lambda _: _.random_elements(tickets, length=_.random_int(min=1, max=4)),
                p=0.75,
            ),
            # Embedded survey documents loaded from the migrated `surveys`
            # collection (without their `_id`).
            "surveys": EntrySeeder(
                lambda _: _.random_elements(surveys, length=_.random_int(min=1, max=3)),
                p=0.75,
            ),
            "surname": EntrySeeder("last_name"),
            "name": EntrySeeder("first_name"),
            "phone_num": EntrySeeder("phone_number"),
            "email": EntrySeeder("email"),
            "chat": EntrySeeder(chats, p=0.5),
            "comments": EntrySeeder(
                lambda _: _.random_elements(
                    comments, length=_.random_int(min=1, max=3)
                ),
                p=0.5,
            ),
            # The three fields below all reference artworks by title.
            "donation_ids": EntrySeeder(
                lambda _: _.random_elements(
                    ids["artworks"], length=_.random_int(min=1, max=3)
                ),
                p=0.35,
            ),
            "sale_ids": EntrySeeder(
                lambda _: _.random_elements(
                    ids["artworks"], length=_.random_int(min=1, max=3)
                ),
                p=0.35,
            ),
            "trade_ids": EntrySeeder(
                lambda _: _.random_elements(
                    ids["artworks"], length=_.random_int(min=1, max=3)
                ),
                p=0.5,
            ),
        }
    ),
    "roles": DocSeeder(
        {
            "_id": EntrySeeder(ids["roles"], unique=True),
            "phone_number": EntrySeeder("phone_number"),
            "surname": EntrySeeder("last_name"),
            "curriculum": EntrySeeder("url"),
            "birth_date": EntrySeeder("past_datetime"),
            "date_start": EntrySeeder("past_datetime"),
            "email": EntrySeeder("email"),
            "hometown": EntrySeeder("city"),
            "name": EntrySeeder("first_name"),
            "gender": EntrySeeder(["F", "M"]),
            "username": EntrySeeder("user_name"),
            "password": EntrySeeder("md5"),
            "salary": EntrySeeder(range(10_000, 80_000, 500)),
            "desk": EntrySeeder(range(10), p=0.5),
            "department": EntrySeeder(ids["departments"]),
            "type": EntrySeeder(
                ["secretary", "guide", "administrator", "designer", "teaching staff"]
            ),
            "english_cert": EntrySeeder(
                ["A1", "A2", "B1", "B2", "C1", "C2", "native"], p=0.5
            ),
            # Embedded shift documents (not ids).
            "shifts": EntrySeeder(
                lambda _: _.random_elements(
                    shifts, length=_.random_int(min=4, max=10), unique=True
                ),
            ),
            "activity_ids": EntrySeeder(
                lambda _: _.random_elements(
                    ids["activities"], length=_.random_int(min=1, max=6), unique=True
                ),
                p=0.85,
            ),
            "chat_ids": EntrySeeder(
                lambda _: _.random_elements(
                    ids["chats"], length=_.random_int(min=1, max=5), unique=True
                ),
                p=0.85,
            ),
        }
    ),
}

In [31]:
# Seed!
# Every collection is emptied first and then refilled with as many documents
# as there are identifiers reserved for it in `ids`.
for collection_name in seeders:
    cprint("Seeding", f"green:{collection_name}", "...")
    target = db[collection_name]
    target.delete_many({})
    document_count = len(ids[collection_name])
    target.insert_many(seeders[collection_name].seed(document_count))

Seeding [1m[32mrooms[0m ...
Seeding [1m[32mactivities[0m ...
Seeding [1m[32mlimited_events[0m ...
Seeding [1m[32mmessages[0m ...
Seeding [1m[32msuppliers[0m ...
Seeding [1m[32martworks[0m ...
Seeding [1m[32mvisitors[0m ...
Seeding [1m[32mroles[0m ...


### 5. Remove Unneeded Collections

In [32]:
# Collections that are no longer wanted as stand-alone collections.
obsolete_collections = ("surveys", "tickets")

for obsolete in obsolete_collections:
    cprint("Dropping", f"red:{obsolete}", "...")
    db.drop_collection(obsolete)

Dropping [1m[31msurveys[0m ...
Dropping [1m[31mtickets[0m ...


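From the migration, the following collections were retained:
- departments
- authors
- workshops

The migrated `surveys` documents are retained only as sub-documents embedded in `visitors`; the standalone `surveys` collection is dropped in the cell above.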

---
## Dump Final Database

In [33]:
%%capture
# Export the seeded `omero_museum` database to the `../backup/3_seeded` folder.
# NOTE: `%%capture` suppresses mongodump's console output for this cell.
!mongodump --host localhost:27017 --db omero_museum --out "../backup/3_seeded"