In [5]:
import os

import dotenv
import numpy as np
import pandas as pd
import psycopg
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Pull variables from a local .env file into the process environment, then
# read the Postgres password from it (None when the variable is not set).
dotenv.load_dotenv()
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")

## Load the raw Shakespeare data

In [6]:
# Download the four raw CSV tables from the course repository.
repo = 'https://github.com/jkropko/DS-6001/raw/master/localdata/'
works = pd.read_csv(repo + 'Works.csv')
characters = pd.read_csv(repo + 'Characters.csv')
chapters = pd.read_csv(repo + 'Chapters.csv')
paragraphs = pd.read_csv(repo + 'Paragraphs.csv')

# Normalise every column name to lower case so the tables share one naming
# convention.
characters.columns = characters.columns.str.lower()
chapters.columns = chapters.columns.str.lower()
paragraphs.columns = paragraphs.columns.str.lower()
works.columns = works.columns.str.lower()

# Build the character <-> work link table: `works` holds a comma-separated
# list of work ids per character, so split it and explode to one row per
# (charid, workid) pair. `.assign` produces a new frame instead of writing
# into a slice of `characters`, which avoids SettingWithCopyWarning.
charworks = (
    characters[['charid', 'works']]
    .assign(works=lambda frame: frame['works'].str.split(','))
    .explode('works')
    .rename(columns={'works': 'workid'})
)
# The link now lives in `charworks`, so drop the original column.
characters = characters.drop(columns='works')

# Remove rows whose key column is missing.
chapters = chapters.dropna(subset=['chapterid'])
paragraphs = paragraphs.dropna(subset=['paragraphid'])
charworks = charworks.dropna(subset=['workid'])

# Attach the chapter columns to each paragraph by matching on
# (workid, section, chapter); the inner join discards paragraphs without a
# matching chapter row.
paragraphs = pd.merge(
    paragraphs,
    chapters.drop(columns='description'),
    how='inner',
    on=['workid', 'section', 'chapter'],
)

# The join keys (and paragraphtype) are not kept in the paragraphs table.
paragraphs = paragraphs.drop(columns=['paragraphtype', 'section', 'chapter'])

## Connect to Postgres server, create an *empty* Shakespeare DB (do this ONE time only)

In [7]:
# Open a server-level connection as the `postgres` superuser; no database name
# is given here. Autocommit is enabled at connect time so that each statement
# is committed immediately.
pg_server_params = {
    'user': 'postgres',
    'password': POSTGRES_PASSWORD,
    'host': 'localhost',
    'port': '5432',
}
dbserver = psycopg.connect(autocommit=True, **pg_server_params)

In [8]:
# (Re)create an empty `shakespeare` database.
# DROP ... IF EXISTS makes the cell idempotent without a bare `except:`, so
# genuine failures (permissions, lost connection, database in use) surface
# instead of being swallowed. Relies on `dbserver` being in autocommit mode.
cursor = dbserver.cursor()
cursor.execute('DROP DATABASE IF EXISTS shakespeare')
cursor.execute('CREATE DATABASE shakespeare')


## Create the SQLAlchemy engine

In [9]:
# Connection settings for the `shakespeare` database.
dbms = 'postgresql'
package = 'psycopg'
user = 'postgres'
password = POSTGRES_PASSWORD
host = 'localhost'
port = '5432'
db = 'shakespeare'

# Assemble the URL from its components rather than by string interpolation,
# so a password containing characters such as '@', ':', '/' or '%' is escaped
# correctly instead of corrupting the connection string.
engine = create_engine(
    URL.create(
        drivername=f"{dbms}+{package}",
        username=user,
        password=password,
        host=host,
        port=int(port),
        database=db,
    )
)
engine

Engine(postgresql+psycopg://postgres:***@localhost:5432/shakespeare)

## Put the data into the empty Shakespeare DB (do this ONE time only)

In [10]:
# Write every DataFrame to a table of the same name. The write options are
# declared once in the loop instead of being repeated per table; an existing
# table is replaced, and rows are sent in batches of 1000.
tables = {
    "works": works,
    "characters": characters,
    "chapters": chapters,
    "paragraphs": paragraphs,
    "charworks": charworks,
}
for table_name, frame in tables.items():
    frame.to_sql(table_name, con=engine, if_exists="replace", index=False, chunksize=1000)

-2

## Ready to work with SQL

In [11]:
# List every table the server knows about (including system catalogs) as a
# quick check that the five tables were created.
myquery = '''
SELECT * FROM pg_catalog.pg_tables;
'''

pd.read_sql_query(sql=myquery, con=engine)

Unnamed: 0,schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity
0,public,works,postgres,,False,False,False,False
1,public,characters,postgres,,False,False,False,False
2,public,chapters,postgres,,False,False,False,False
3,public,paragraphs,postgres,,False,False,False,False
4,public,charworks,postgres,,False,False,False,False
...,...,...,...,...,...,...,...,...
68,pg_catalog,pg_largeobject,postgres,,True,False,False,False
69,information_schema,sql_parts,postgres,,False,False,False,False
70,information_schema,sql_features,postgres,,False,False,False,False
71,information_schema,sql_implementation_info,postgres,,False,False,False,False


In [14]:
# Part a — load into MongoDB
import os
from pymongo import MongoClient
from bson.json_util import loads, dumps
from dotenv import load_dotenv

# 1) Load your .env so creds are available in this notebook
load_dotenv(".env")  # if your .env lives elsewhere, give the absolute path

# NOTE: `password` rebinds the name used earlier for the Postgres password.
username = os.getenv("MONGO_INITDB_ROOT_USERNAME")
password = os.getenv("MONGO_INITDB_ROOT_PASSWORD")

# Fail fast with a clear message instead of silently connecting as the
# literal user "None" when the variables are missing.
if not username or not password:
    raise RuntimeError(
        "MONGO_INITDB_ROOT_USERNAME / MONGO_INITDB_ROOT_PASSWORD are not set; "
        "check your .env file"
    )

auth_db   = "admin"    # root user authenticates against the admin DB
target_db = "history"  # assignment asks you to create/use this DB

# 2) Connect (Docker Mongo must be running and exposing 27017)
# Credentials are passed as keyword arguments rather than embedded in the
# URI, so special characters ('@', ':', '/', '%') need no percent-escaping.
client = MongoClient(
    f"mongodb://localhost:27017/{target_db}",
    username=username,
    password=password,
    authSource=auth_db,
)

db         = client[target_db]
collection = db["const"]

# 3) Remove any existing data so the load below starts from an empty collection
collection.delete_many({})




DeleteResult({'n': 0, 'ok': 1.0}, acknowledged=True)

In [None]:
# NOTE(review): `const_json` is not defined in the cells shown here; it must
# be created by an earlier cell before this one runs — confirm.
# const_json may be a JSON string, a single document (dict), or a list of
# documents — normalise all three to a list of documents.
parsed = loads(const_json) if isinstance(const_json, str) else const_json
if isinstance(parsed, dict):
    parsed = [parsed]  # insert_many requires a list, not a bare document

# Round-trip each document through BSON-aware JSON so the inserted objects
# are fresh copies; insert_many adds an `_id` key to the dicts it is given.
docs = [loads(dumps(d)) for d in parsed]

# insert_many raises on an empty list, so skip the call when there is
# nothing to insert.
if docs:
    collection.insert_many(docs)
else:
    print("const_json is empty; nothing inserted")

# 4) Quick sanity checks
print("Inserted/Count:", collection.count_documents({}))
print("Sample (no _id):", collection.find_one({}, {"_id": 0}))