#### Olist Schema Implementation

The datasets provided by Olist on Kaggle can be used in Star Schema with minimum effort. Once the csv files are uploaded in to the database, the steps to take are listed as below.

1) Find the features on each table

2) Create the Schema around the features

3) Decide the features that are going to be on the facttable and dimtables

4) Create the facttable referencing the other dim tables

In case of Olist, the dim_tables are already created. Lets get started with drawing the schema

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import configparser
import psycopg2

config = configparser.ConfigParser()
config.read('clusteroli.config')

db = config['POSTGRES']['PG_DB']
user = config['POSTGRES']['PG_UNAME']
passwd = config['POSTGRES']['PG_PASS']
port = config['POSTGRES']['PG_PORT']
host = config['POSTGRES']['PG_HOST']

In [4]:
credentials = "postgresql://{}:{}@{}:{}/{}".format(user,passwd,host,port,db)

#using psycopg2 to test connection since there are no tables
import psycopg2
try:
    conn = psycopg2.connect(host=host,dbname=db,user=user,password=passwd,port=port)
except Exception as e:
    print(e)
    
conn.set_session(autocommit=True)

try:
    cur = conn.cursor()
    
except:
    print(e)

#Helper functions to work with the database
def schemaGen(dataframe, schemaName):
    localSchema = pd.io.sql.get_schema(dataframe,schemaName)
    localSchema = localSchema.replace('TEXT','VARCHAR(255)').replace('INTEGER','NUMERIC').replace('\n','').replace('"',"")
    return "".join(localSchema)

#Using pandas read_sql for getting schema
def getSchema(tableName):
    schema = pd.read_sql("""SELECT * FROM information_schema.columns where table_name='{}'""".format(tableName),credentials)
    return schema

#Issue is in using pd.read_sql to write data to the database. so using psycopg2
def queryTable(query):
    try:
        schema = cur.execute(query)
        return 
    except Exception as e:
        print(e)
        
#This doesn't return anything

#Using the pd.read_sql for getting data from db
def queryBase(query):
    requiredTable = pd.read_sql(query,con=credentials)
    return requiredTable

#This returns the dataframe

In [31]:
#Get the tables from the database. There are 9 tables
dataTables = queryBase("""SELECT table_name FROM information_schema.tables""")[:9]

In [35]:
dataTables=dataTables.table_name.values

In [38]:
for name in dataTables:
    print(name)
    print(getSchema(name).column_name.values)

dim_location
['customer_zip_code_prefix' 'customer_id' 'customer_unique_id'
 'customer_city' 'customer_state']
dim_orders
['order_item_id' 'price' 'freight_value' 'shipping_limit_date' 'seller_id'
 'product_id' 'order_id']
dim_payment
['payment_sequential' 'payment_installments' 'payment_value' 'order_id'
 'payment_type']
dim_reviews
['review_score' 'order_id' 'review_id' 'review_comment_message'
 'review_creation_date' 'review_answer_timestamp' 'review_comment_title']
dim_process
['order_id' 'customer_id' 'order_status' 'order_purchase_timestamp'
 'order_approved_at' 'order_delivered_carrier_date'
 'order_delivered_customer_date' 'order_estimated_delivery_date']
dim_product
['product_width_cm' 'product_length_cm' 'product_height_cm'
 'product_name_lenght' 'product_description_lenght' 'product_photos_qty'
 'product_weight_g' 'product_category_name' 'product_id']
dim_seller
['seller_zip_code_prefix' 'seller_id' 'seller_city' 'seller_state']
dim_category
['product_category_name' 'product

![image.png](attachment:image.png)

In [None]:
# Before building the facttable, the keys that are referenced has to be primary key or key with 
#unique constraints using ALTER TABLE history ADD PRIMARY KEY (id)



In [42]:
createfact = """CREATE TABLE facttable(factkey BIGSERIAL PRIMARY KEY,
                        orderkey VARCHAR REFERENCES dim_process(order_id),
                        customerkey VARCHAR REFERENCES dim_location(customer_id),
                        productkey VARCHAR REFERENCES dim_product(product_id),
                        sellerkey VARCHAR REFERENCES dim_seller(seller_id))"""
queryTable(createfact)

In [55]:
checkquery = """SELECT order_id,product_id,seller_id
                FROM dim_orders AS do
                JOIN dim_process AS dp ON do.order_id = dp.order_id
                LIMIT 5"""
queryBase(checkquery)

ProgrammingError: (psycopg2.errors.SyntaxError) syntax error at or near "do"
LINE 2:                 FROM dim_orders AS do
                                           ^

[SQL: SELECT order_id,product_id,seller_id
                FROM dim_orders AS do
                JOIN dim_process AS dp ON do.order_id = dp.order_id
                LIMIT 5]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [None]:
insertFact = """INSERT INTO facttable(orderkey, customerkey, productkey, sellerkey)
                SELECT dm.order_id AS orderkey, dl.customer_id AS customerkey,
                        dp.product_id AS productkey, ds.seller_id AS sellerkey
                FROM dim_process as dm
                JOIN dim_location as dl ON dl.customer_id = dm.customer_id
                JOIN dim_orders as do ON dm.order_id = do.order_id
                JOIN dim_product as dp ON do.product_id = dp.product_id
                JOIN dim_seller as ds ON do.seller_id = ds.seller_id"""