# Toy example

In [1]:
# Reload edited project modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline

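Create some mock data with attribute dependencies both within a relation (e.g. hair colour depends on nationality) and across relations (e.g. the routes a passenger flies depend on their nationality).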

In [2]:
from phd.relation import Relation

# Mock passengers: one (nationality, gender, hair) tuple per row.
# Hair colour is deliberately correlated with nationality (mostly blond Swedes,
# mostly brown-haired Americans). Categorical values carry no stray whitespace,
# so equality filters such as `hair == 'Dark'` match as expected.
passengers = Relation(
    name='passengers',
    data=[
        ('Swedish', 'Male', 'Blond'),
        ('Swedish', 'Female', 'Blond'),
        ('Swedish', 'Male', 'Blond'),
        ('Swedish', 'Female', 'Brown'),
        ('Swedish', 'Female', 'Blond'),
        ('American', 'Male', 'Brown'),
        ('American', 'Male', 'Dark'),
        ('American', 'Female', 'Brown'),
        ('American', 'Male', 'Brown'),
        ('American', 'Female', 'Blond'),
    ],
    columns=['nationality', 'gender', 'hair']
)

passengers

Unnamed: 0,nationality,gender,hair
0,Swedish,Male,Blond
1,Swedish,Female,Blond
2,Swedish,Male,Blond
3,Swedish,Female,Brown
4,Swedish,Female,Blond
5,American,Male,Brown
6,American,Male,Dark
7,American,Female,Brown
8,American,Male,Brown
9,American,Female,Blond


In [4]:
# Mock routes, one row per (origin, destination, minutes) triple.
route_columns = ['origin', 'destination', 'minutes']
route_rows = [
    ('Stockholm', 'Boston', 515),
    ('Stockholm', 'San Francisco', 830),
    ('Stockholm', 'New-York', 515),
    ('Fresno', 'Seattle', 130),
    ('Fresno', 'San Francisco', 60),
    ('Fresno', 'Portland', 110),
]

routes = Relation(name='routes', data=route_rows, columns=route_columns)

routes

Unnamed: 0,origin,destination,minutes
0,Stockholm,Boston,515
1,Stockholm,San Francisco,830
2,Stockholm,New-York,515
3,Fresno,Seattle,130
4,Fresno,San Francisco,60
5,Fresno,Portland,110


In [3]:
# Association table between passengers and routes, written as
# passenger_id -> route_ids. Dict insertion order is preserved, so the
# generated (passenger_id, route_id) pairs come out in the listed order.
routes_by_passenger = {
    0: (0, 1, 2),
    1: (0, 1, 2),
    2: (2,),
    3: (0,),
    4: (1,),
    5: (3, 5),
    6: (3,),
    7: (3, 5),
    8: (4,),
    9: (4,),
}

flights = Relation(
    name='flights',
    data=[
        (passenger_id, route_id)
        for passenger_id, route_ids in routes_by_passenger.items()
        for route_id in route_ids
    ],
    columns=['passenger_id', 'route_id']
)

flights

Unnamed: 0,passenger_id,route_id
0,0,0
1,0,1
2,0,2
3,1,0
4,1,1
5,1,2
6,2,2
7,3,0
8,4,1
9,5,3


Insert the data into a PostgreSQL database.

In [5]:
import os

import sqlalchemy

# Connection URI of the PostgreSQL database that receives the mock relations.
# It can be overridden through the FLIGHTS_DB_URI environment variable so that
# real credentials never have to be written into the notebook; the fallback is
# the local development default.
uri = os.environ.get(
    'FLIGHTS_DB_URI',
    'postgresql://postgres:admin@localhost:5432/flights',
)
engine = sqlalchemy.create_engine(uri)
# Connection reused by the following cells for inserts and ground-truth queries.
con = engine.connect()

In [6]:
# Options shared by every insert. The keyword names mirror pandas'
# `DataFrame.to_sql` -- presumably `Relation` inherits it (TODO confirm).
sql_options = dict(index=True, index_label='id', con=con, if_exists='replace')

# Write each relation to a table named after the relation itself.
for rel in (routes, passengers, flights):
    rel.to_sql(name=rel.name, **sql_options)

Run the `ANALYZE` statement so that PostgreSQL collects up-to-date statistics on the freshly inserted tables.

In [5]:
# Wrap the raw SQL in `text()`: plain strings are no longer executable by
# `Connection.execute` on SQLAlchemy 2.x, while `text()` works on 1.x as well.
# The trailing `;` suppresses the result repr in the cell output.
con.execute(sqlalchemy.text('ANALYZE;'));

Now let's train our estimators.

In [6]:
from phd.bn.estimator import BayesianNetworkEstimator
from phd.sampling.estimator import SamplingEstimator
from phd.textbook.estimator import TextbookEstimator


# Instantiate the three estimators compared in the queries below.
est_bn = BayesianNetworkEstimator(n_mcv=2, n_bins=2)
est_samp = SamplingEstimator()
est_tb = TextbookEstimator()

# Build each estimator from the database, in the order BN -> sampling ->
# textbook. The return values are kept under `*_duration` names; presumably
# they are build times (TODO confirm against `build_from_engine`).
est_bn_duration, est_samp_duration, est_tb_duration = [
    estimator.build_from_engine(engine)
    for estimator in (est_bn, est_samp, est_tb)
]

## Query 1

In [7]:
# SQL executed against PostgreSQL to obtain the true result size.
query = """
    SELECT *
    FROM passengers p
    WHERE p.hair = 'Blond'
"""

# The same predicate in the filter syntax handed to the estimators.
filter_query = """
    passengers.hair == 'Blond'
"""

In [8]:
# Count the rows actually fetched: `rowcount` is only specified for
# UPDATE/DELETE statements and is driver-dependent for SELECT. `text()` keeps
# the raw SQL string executable on SQLAlchemy 2.x as well as 1.x.
truth = len(con.execute(sqlalchemy.text(query)).fetchall())
print('Truth:', truth)

Truth: 5


In [34]:
# Single-relation query, hence the empty join expression.
bn_estimate = est_bn.estimate_selectivity('', filter_query)
print('Bayesian network:', bn_estimate)

Bayesian network: 5.0


In [35]:
# Single-relation query, hence the empty join expression.
samp_estimate = est_samp.estimate_selectivity('', filter_query)
print('Sampling:', samp_estimate)

Sampling: 5.0


In [44]:
# Single-relation query, hence the empty join expression.
tb_estimate = est_tb.estimate_selectivity('', filter_query)
print('Textbook:', tb_estimate)

Textbook: 5.0


## Query 2

In [45]:
# SQL executed against PostgreSQL to obtain the true result size:
# two predicates on the same relation.
query = """
    SELECT *
    FROM passengers p
    WHERE p.hair = 'Blond'
    AND p.nationality = 'Swedish'
"""

# The same conjunction in the filter syntax handed to the estimators.
filter_query = """
    passengers.hair == 'Blond' and
    passengers.nationality == 'Swedish'
"""

In [46]:
# Count the rows actually fetched: `rowcount` is only specified for
# UPDATE/DELETE statements and is driver-dependent for SELECT. `text()` keeps
# the raw SQL string executable on SQLAlchemy 2.x as well as 1.x.
truth = len(con.execute(sqlalchemy.text(query)).fetchall())
print('Truth:', truth)

Truth: 4


In [47]:
# Single-relation query, hence the empty join expression.
bn_estimate = est_bn.estimate_selectivity('', filter_query)
print('Bayesian network:', bn_estimate)

Bayesian network: 4.0


In [48]:
# Single-relation query, hence the empty join expression.
samp_estimate = est_samp.estimate_selectivity('', filter_query)
print('Sampling:', samp_estimate)

Sampling: 4.0


In [49]:
# Single-relation query, hence the empty join expression.
tb_estimate = est_tb.estimate_selectivity('', filter_query)
print('Textbook:', tb_estimate)

Textbook: 2.5


## Query 3

In [56]:
# SQL executed against PostgreSQL to obtain the true result size:
# filtered passengers joined with their flights.
query = """
    SELECT *
    FROM passengers p, flights f
    WHERE p.hair = 'Blond'
    AND p.nationality = 'Swedish'
    AND p.id = f.passenger_id
"""

# Join condition, given to the estimators separately from the filters.
join_query = """
    passengers.id == flights.passenger_id
"""

# Attribute predicates in the filter syntax handed to the estimators.
filter_query = """
    passengers.hair == 'Blond' and
    passengers.nationality == 'Swedish'
"""

In [57]:
# Count the rows actually fetched: `rowcount` is only specified for
# UPDATE/DELETE statements and is driver-dependent for SELECT. `text()` keeps
# the raw SQL string executable on SQLAlchemy 2.x as well as 1.x.
truth = len(con.execute(sqlalchemy.text(query)).fetchall())
print('Truth:', truth)

Truth: 8


In [68]:
# Estimate for the join plus filters defined in the query cell above.
bn_estimate = est_bn.estimate_selectivity(join_query, filter_query)
print('Bayesian network:', bn_estimate)

Bayesian network: 6.4


In [69]:
# Estimate for the join plus filters defined in the query cell above.
samp_estimate = est_samp.estimate_selectivity(join_query, filter_query)
print('Sampling:', samp_estimate)

Sampling: 6.4


In [70]:
# Estimate for the join plus filters defined in the query cell above.
tb_estimate = est_tb.estimate_selectivity(join_query, filter_query)
print('Textbook:', tb_estimate)

Textbook: 4.0


## Query 4

In [71]:
# SQL executed against PostgreSQL to obtain the true result size:
# three-way join with filters on both passengers and routes.
query = """
    SELECT *
    FROM passengers p, flights f, routes r
    WHERE p.hair = 'Blond'
    AND p.nationality = 'Swedish'
    AND p.id = f.passenger_id
    AND f.route_id = r.id
    AND r.origin = 'Stockholm'
"""

# Join conditions, given to the estimators separately from the filters.
join_query = """
    passengers.id == flights.passenger_id and
    flights.route_id == routes.id
"""

# Attribute predicates in the filter syntax handed to the estimators.
filter_query = """
    passengers.hair == 'Blond' and
    passengers.nationality == 'Swedish' and
    routes.origin == 'Stockholm'
"""

In [72]:
# Count the rows actually fetched: `rowcount` is only specified for
# UPDATE/DELETE statements and is driver-dependent for SELECT. `text()` keeps
# the raw SQL string executable on SQLAlchemy 2.x as well as 1.x.
truth = len(con.execute(sqlalchemy.text(query)).fetchall())
print('Truth:', truth)

Truth: 8


In [74]:
# Estimate for the joins plus filters defined in the query cell above.
bn_estimate = est_bn.estimate_selectivity(join_query, filter_query)
print('Bayesian network:', bn_estimate)

Bayesian network: 3.2


In [75]:
# Estimate for the joins plus filters defined in the query cell above.
samp_estimate = est_samp.estimate_selectivity(join_query, filter_query)
print('Sampling:', samp_estimate)

Sampling: 3.2


In [76]:
# Estimate for the joins plus filters defined in the query cell above.
tb_estimate = est_tb.estimate_selectivity(join_query, filter_query)
print('Textbook:', tb_estimate)

Textbook: 2.0
