# Preamble

In [1]:
import getpass
import sys
import time,os
from urllib.parse import quote_plus

# pandas to manipulate SQL answer set
import pandas as pd
# for Posgresql and other RDBMS
from sqlalchemy import create_engine,event,schema,Table,Column, Integer, Float, String, MetaData, TIMESTAMP, Date, text, inspect
from sqlalchemy_utils import database_exists,create_database
from sqlalchemy.orm import sessionmaker
from sqlalchemy.schema import CreateTable

In [2]:
# Intended to change the working directory to the folder of this notebook.
# NOTE(review): "__file__" is a quoted string literal here, not the __file__
# variable, so abspath() resolves it against the current working directory.
# The chdir below therefore appears to keep the directory the kernel was
# started in -- confirm that this is the intent.
abspath = os.path.abspath("__file__")
dname = os.path.dirname(abspath)
os.chdir(dname)

# Read and clean the Data

## Open the data file

In [3]:
# Open the Excel workbook; the relative path is resolved against the
# current working directory.
data_file_path = "./data.xlsx"
excel_file = pd.ExcelFile(data_file_path)

# Display the sheet names found in the workbook (each sheet is later loaded
# as one relation).
excel_file.sheet_names

['city',
 'volunteer',
 'volunteer_range',
 'skill',
 'skill_assignment',
 'interest',
 'interest_assignment',
 'beneficiary',
 'request',
 'request_skill',
 'request_location',
 'volunteer_application']

## Import Excel sheets into DataFrames

In [4]:
# Load every sheet into a {sheet name: DataFrame} mapping.
# Passing the already-open workbook with sheet_name=None reads all sheets in
# a single call instead of re-opening the file from disk once per sheet.
DB_dict = dict(pd.read_excel(excel_file, sheet_name=None))

display(DB_dict)

{'city':          name   id             geolocation
 0       Rusko  704    13.4134995/45.792650
 1    Hailuoto   72   -79.132557/-40.995129
 2      Liperi  426     47.244262/80.880444
 3      Ulvila  886   18.6986795/-17.160223
 4     Tammela  834     37.935520/12.226293
 5  Rautavaara  687   66.6004235/-62.724453
 6    Myrskylä  504   45.475847/-105.227997
 7      Säkylä  783  -14.344729/-104.974555,
 'volunteer':               id  birthdate  city_id                     name  \
 0    011095-974M 1995-10-01      426       Ulla Saari-Tiainen   
 1    200958-9326 1958-09-20      426           Ilona Nieminen   
 2    120791-9129 1991-07-12      687                Miia Repo   
 3    110166-9408 1966-01-11      886              Raili Lammi   
 4    070171-961M 1971-01-07      834  Henri Ruotsalainen-Mäki   
 ..           ...        ...      ...                      ...   
 186  020500A905H 2000-05-02       72           Mikko Heinonen   
 187  280183-963B 1983-01-28      783            Juhan

### Define primary keys for the imported relations

In [5]:
# Primary-key attribute list for every relation, keyed by relation name.
PK_dict = {
    'city': ["id"],
    'volunteer': ["id"],
    'volunteer_range': ["volunteer_id", "city_id"],
    'skill': ["name"],
    'skill_assignment': ["volunteer_id", "skill_name"],
    'interest': ["name"],
    'interest_assignment': ["interest_name", "volunteer_id"],
    'beneficiary': ["id"],
    'request': ["id"],
    'request_skill': ["request_id", "skill_name"],
    'request_location': ["request_id", "city_id"],
    'volunteer_application': ["id"],
}

# Both dictionaries must describe exactly the same set of relations.
assert set(PK_dict) == set(DB_dict), \
    "Every relation in the DB must have its primary keys defined!"

### Check PK datatypes in each relation:

In [6]:
# Print the dtype of each primary-key column, one relation at a time.
for relation, frame in DB_dict.items():
    pk_column = frame[PK_dict[relation]]
    print(relation, ":", pk_column.dtypes)

city : id    int64
dtype: object
volunteer : id    object
dtype: object
volunteer_range : volunteer_id    object
city_id          int64
dtype: object
skill : name    object
dtype: object
skill_assignment : volunteer_id    object
skill_name      object
dtype: object
interest : name    object
dtype: object
interest_assignment : interest_name    object
volunteer_id     object
dtype: object
beneficiary : id    int64
dtype: object
request : id    int64
dtype: object
request_skill : request_id     int64
skill_name    object
dtype: object
request_location : request_id    int64
city_id       int64
dtype: object
volunteer_application : id    int64
dtype: object


In [7]:
# helper to convert PK list elements to a proper string format
def pk_wrap(str_list):
    """Format a list of attribute names as a parenthesised SQL column list.

    Args:
        str_list: Non-empty sequence of attribute names,
            e.g. ["volunteer_id", "city_id"].

    Returns:
        The names joined with ", " and wrapped in parentheses,
        e.g. "(volunteer_id, city_id)" or "(id)".

    Raises:
        ValueError: If str_list is empty, since a primary key needs at least
            one attribute.
    """
    if not str_list:
        raise ValueError("A primary key needs at least one attribute.")
    # Join the names directly: going through repr(tuple(...)) would strip
    # apostrophes from names and leak repr quoting/escaping into the result.
    return "(" + ", ".join(str(name) for name in str_list) + ")"
    

def add_PK_constraint(engine, relation):
    """Add a PRIMARY KEY constraint named "<relation>_pkey" to a table.

    The key attributes are taken from PK_dict[relation]. Failures are
    reported with a printed message rather than raised, so a loop over many
    relations keeps going after one of them fails.

    Args:
        engine: SQLAlchemy engine connected to the target database.
        relation: Name of the table; must be a key of PK_dict.
    """
    pk_columns = PK_dict[relation]
    # Unquoted form, used only for the progress message.
    pk_attributes = pk_wrap(pk_columns)

    # Quote identifiers with the dialect's own rules (quotes are added only
    # where required, e.g. mixed-case or reserved names) so the statement
    # also works for names that cannot be written as bare identifiers.
    quote = engine.dialect.identifier_preparer.quote
    table_name = quote(relation)
    constraint_name = quote(relation + "_pkey")
    quoted_pk_attributes = pk_wrap([quote(column) for column in pk_columns])

    alter_table_sql = text(f"""
    ALTER TABLE {table_name}
    ADD CONSTRAINT {constraint_name} PRIMARY KEY {quoted_pk_attributes};
    """)

    # add a primary key constraint
    with engine.connect() as conn:
        try:
            print("Trying to make", pk_attributes, "attributes the PK of the", relation, "relation.")
            conn.execute(alter_table_sql)
            conn.commit()
            print("No errors detected...")
        except Exception as e:
            # Deliberately best-effort: report the problem and carry on.
            print(f"Error: {e}")


In [8]:
def verify_constraints(engine, relation):
    """Print the primary-key columns registered for a table.

    Queries information_schema for PRIMARY KEY constraints on the table
    named `relation` and prints one row per key column, ordered by schema,
    table and ordinal position within the key.

    Args:
        engine: SQLAlchemy engine connected to the database to inspect.
        relation: Name of the table whose primary key is listed.
    """
    # The table name is passed as a bound parameter (:relation) instead of
    # being spliced into the SQL text, so it is always treated as a value.
    verify_constraints_sql = text("""
        SELECT
            kcu.table_schema,
            kcu.table_name,
            tco.constraint_name,
            kcu.column_name,
            kcu.ordinal_position
        FROM
            information_schema.table_constraints tco
        JOIN
            information_schema.key_column_usage kcu
            ON kcu.constraint_name = tco.constraint_name
            AND kcu.constraint_schema = tco.constraint_schema
        WHERE
            tco.constraint_type = 'PRIMARY KEY'
            AND kcu.table_name = :relation
        ORDER BY
            kcu.table_schema,
            kcu.table_name,
            kcu.ordinal_position;
        """)

    with engine.connect() as conn:
        # Execute the query and fetch the results
        result = conn.execute(verify_constraints_sql, {"relation": relation})

        # Print the results
        for row in result:
            print(row)

# Create the Database

## Connect to the DB server

In [9]:
# PostgreSQL server connection details. Every value can be overridden with
# an environment variable; the defaults are the settings used so far.
database_name = os.environ.get('DB_NAME', 'group_7_2024')
username = os.environ.get('DB_USER', 'group_7_2024')
host = os.environ.get('DB_HOST', 'dbcourse.cs.aalto.fi')  # or your server address
port = os.environ.get('DB_PORT', '5432')  # default PostgreSQL port

# The password must not be stored in the notebook: read it from the
# DB_PASSWORD environment variable, or prompt for it when that is not set.
password = os.environ.get('DB_PASSWORD') or getpass.getpass(
    f"Password for {username}@{host}: "
)

# Create a connection URL. User name and password are percent-encoded so
# that special characters (e.g. '@', ':' or '/') cannot break the URL.
connection_url = (
    f'postgresql+psycopg2://{quote_plus(username)}:{quote_plus(password)}'
    f'@{host}:{port}/{database_name}'
)

try:
    # Create an engine and open one connection to test it
    engine = create_engine(connection_url)
    with engine.connect() as conn:
        print("Connection to PostgreSQL server successful.")
except Exception as e:
    print(f"Failed to connect to the PostgreSQL server: {e}")
    # Re-raise rather than calling exit(1): this stops "Run All" at this
    # cell and keeps the traceback available for debugging.
    raise

# Check if the database already exists
if not database_exists(connection_url):
    # Create the new database
    create_database(connection_url)
    print(f"Database '{database_name}' created successfully.")
else:
    print(f"Database '{database_name}' already exists.")



Connection to PostgreSQL server successful.
Database 'group_7_2024' already exists.


## Populate the DB with data

In [10]:
# Write each data frame to a table named after its relation (an existing
# table of that name is replaced), then declare the table's primary key.
for relation, frame in DB_dict.items():
    frame.to_sql(name=relation, con=engine, if_exists='replace', index=False)
    add_PK_constraint(engine, relation)


Trying to make (id) attributes the PK of the city relation.
No errors detected...


KeyboardInterrupt: 

## Check the DB content

In [None]:
# Create an inspector bound to the engine
inspector = inspect(engine)

# Ask the inspector for the names of the tables it can see
tables = inspector.get_table_names()

# Print a header followed by one table name per line
print("Tables in the database:")
for table in tables:
    print(table)

Tables in the database:
city
volunteer
volunteer_range
skill
skill_assignment
interest
interest_assignment
beneficiary
request
request_skill
request_location
volunteer_application


## Close connection to the DB

In [None]:
# Dispose of the engine to release its database connections, then report it.
# The engine is not used again after this cell.
engine.dispose()
print("Connection to PostgreSQL server closed.")

Connection to PostgreSQL server closed.


: 