# Install JoinBoost and the database package.
In this demo, we use DuckDB as the database.


In [1]:
# Install the demo's dependencies into the kernel's environment.
# joinboost is pinned to 0.0.14; duckdb is left unpinned, so the duckdb
# version that gets installed depends on when this cell is run.
%pip install joinboost==0.0.14
%pip install duckdb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting joinboost==0.0.14
  Downloading joinboost-0.0.14-py3-none-any.whl (18 kB)
Installing collected packages: joinboost
  Attempting uninstall: joinboost
    Found existing installation: joinboost 0.0.13
    Uninstalling joinboost-0.0.13:
      Successfully uninstalled joinboost-0.0.13
Successfully installed joinboost-0.0.14
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Download data and load data into database.

This step is only necessary for this demo because we don't have an existing database. In practice, JoinBoost can build models directly over your existing databases, without downloading or loading any data.

In [2]:
import duckdb
import urllib.request
from joinboost.executor import DuckdbExecutor
from joinboost.joingraph import JoinGraph
from joinboost.app import DecisionTree,GradientBoosting

# Demo datasets, keyed by the DuckDB table each CSV is loaded into.
# Each value is (download URL, local CSV file name). Keeping the pairing in
# one place avoids the URL list and the table list drifting apart.
DATASETS = {
    "holidays": ("https://www.dropbox.com/s/kaovdndtevcvt83/holidays.csv?dl=1", "holidays.csv"),
    "items": ("https://www.dropbox.com/s/wh6amz4um7ieyqz/items.csv?dl=1", "items.csv"),
    "oil": ("https://www.dropbox.com/s/ze6of1xqwslt8jb/oil.csv?dl=1", "oil.csv"),
    "sales": ("https://www.dropbox.com/s/uiojfrc5c20gyrl/sales_small.csv?dl=1", "sales_small.csv"),
    "stores": ("https://www.dropbox.com/s/cwy6z0b7rhsnrxb/stores.csv?dl=1", "stores.csv"),
    "train": ("https://www.dropbox.com/s/wwaoga17z70jb6l/train_small.csv?dl=1", "train_small.csv"),
    "transactions": ("https://www.dropbox.com/s/2bxto9wnetwnvqd/transactions.csv?dl=1", "transactions.csv"),
}

# Download every CSV into the current working directory.
for url, csv_file in DATASETS.values():
    urllib.request.urlretrieve(url, csv_file)

# In-memory DuckDB database; nothing is persisted once the kernel stops.
con = duckdb.connect(database=':memory:')

# Load each CSV into its own table. Table and file names come from the
# constant mapping above (not from user input), so formatting them into the
# SQL text is safe here. The loop also keeps the cell from echoing the
# connection object as its output.
for table_name, (_, csv_file) in DATASETS.items():
    con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM '{csv_file}';")

<duckdb.DuckDBPyConnection at 0x7f2d94b7e330>

# Data exploration and Data Transformation
You can use SQL queries to list all available tables and their schemas. You can also perform data transformations over these tables.

In [4]:
# Fetch the result of DuckDB's "describe" statement as a DataFrame and show it.
schema_overview = con.execute("describe;").df()
schema_overview

Unnamed: 0,table_name,column_names,column_types,temporary
0,holidays,"[date, f2, htype, locale, locale_name, transfe...","[INTEGER, INTEGER, INTEGER, INTEGER, INTEGER, ...",False
1,items,"[class, f1, family, item_nbr, perishable]","[INTEGER, INTEGER, INTEGER, INTEGER, INTEGER]",False
2,oil,"[date, dcoilwtico, f3]","[INTEGER, INTEGER, INTEGER]",False
3,sales,"[Y, item_nbr, onpromotion, tid, unit_sales]","[DOUBLE, INTEGER, INTEGER, INTEGER, DOUBLE]",False
4,stores,"[city, cluster, f4, state, store_nbr, stype]","[INTEGER, INTEGER, INTEGER, INTEGER, INTEGER, ...",False
5,train,"[Y, city, class, cluster, date, dcoilwtico, f1...","[DOUBLE, INTEGER, INTEGER, INTEGER, INTEGER, I...",False
6,transactions,"[date, f5, store_nbr, tid, transactions]","[INTEGER, INTEGER, INTEGER, INTEGER, INTEGER]",False


# Train Decision Tree using JoinBoost
First, we need to specify the join graph: which relations and features you want to use, and how these relations should be joined.

For instance, 
```
dataset.add_join("sales", "items", ["item_nbr"], ["item_nbr"])
```
says the join condition is: sales.item_nbr = items.item_nbr



In [6]:
# Executor bound to the DuckDB connection, with debug output disabled.
exe = DuckdbExecutor(con, debug=False)
dataset = JoinGraph(exe=exe)

# The sales relation supplies the target column Y and no feature columns.
dataset.add_relation("sales", [], y='Y')

# Remaining relations and the feature columns used from each.
relation_features = {
    "holidays": ["htype", "locale", "locale_name", "transferred", "f2"],
    "oil": ["dcoilwtico", "f3"],
    "transactions": ["transactions", "f5"],
    "stores": ["city", "state", "stype", "cluster", "f4"],
    "items": ["family", "class", "perishable", "f1"],
}
for relation, features in relation_features.items():
    dataset.add_relation(relation, features)

# Join edges as (left relation, right relation, shared key column);
# every join here uses the same column name on both sides.
join_edges = [
    ("sales", "items", "item_nbr"),
    ("sales", "transactions", "tid"),
    ("transactions", "stores", "store_nbr"),
    ("transactions", "holidays", "date"),
    ("holidays", "oil", "date"),
]
for left, right, key in join_edges:
    dataset.add_join(left, right, [key], [key])

# A tree of depth d is allowed up to 2**d leaves.
depth = 3
tree_params = dict(learning_rate=1, max_leaves=2 ** depth, max_depth=depth)
reg = DecisionTree(**tree_params)

reg.fit(dataset)
reg.compute_rmse('train')[0]

2535.0608391585865

# Train Gradient Boosting using JoinBoost
Next, we train Gradient Boosting with 10 iterations.

As shown, Gradient Boosting further reduces the rmse.

In [7]:
# Gradient boosting over the same join graph, reusing `depth` and `dataset`
# from the decision-tree cell; the model runs for 10 iterations.
boosting_rounds = 10
reg = GradientBoosting(
    learning_rate=1,
    max_leaves=2 ** depth,
    max_depth=depth,
    iteration=boosting_rounds,
)
reg.fit(dataset)
reg.compute_rmse('train')[0]

786.8773440907675

# JoinBoost Internals
Internally, JoinBoost translates ML logic into SQL and executes the SQL queries directly in your database, without data movement.

To see these SQL queries, we can enable the debug mode


```
exe = DuckdbExecutor(con, debug=True)
```

so that all executed SQL queries are printed out.

In [9]:
# Rebuild the join graph on an executor created with debug=True so the
# SQL it runs is printed along with the training run.
exe = DuckdbExecutor(con, debug=True)
dataset = JoinGraph(exe=exe)

# Target relation: Y is the label, no feature columns are taken from sales.
dataset.add_relation("sales", [], y='Y')

# (relation name, feature columns) for every other relation in the graph.
feature_relations = [
    ("holidays", ["htype", "locale", "locale_name", "transferred", "f2"]),
    ("oil", ["dcoilwtico", "f3"]),
    ("transactions", ["transactions", "f5"]),
    ("stores", ["city", "state", "stype", "cluster", "f4"]),
    ("items", ["family", "class", "perishable", "f1"]),
]
for name, columns in feature_relations:
    dataset.add_relation(name, columns)

# Positional arguments for each add_join call:
# (left relation, right relation, left key columns, right key columns).
joins = [
    ("sales", "items", ["item_nbr"], ["item_nbr"]),
    ("sales", "transactions", ["tid"], ["tid"]),
    ("transactions", "stores", ["store_nbr"], ["store_nbr"]),
    ("transactions", "holidays", ["date"], ["date"]),
    ("holidays", "oil", ["date"], ["date"]),
]
for join_spec in joins:
    dataset.add_join(*join_spec)

depth = 3
reg = DecisionTree(learning_rate=1, max_depth=depth, max_leaves=2 ** depth)
reg.fit(dataset)

# compute_rmse returns an indexable result; the first element is displayed.
train_rmse = reg.compute_rmse('train')
train_rmse[0]

PRAGMA table_info(sales)
0.0023736953735351562
PRAGMA table_info(holidays)
0.0009341239929199219
PRAGMA table_info(oil)
0.0002970695495605469
PRAGMA table_info(transactions)
0.00022339820861816406
PRAGMA table_info(stores)
0.0001742839813232422
PRAGMA table_info(items)
0.00014972686767578125
SELECT SUM(Y) AS s, SUM(1) AS c
FROM sales

0.00036025047302246094
CREATE OR REPLACE TABLE joinboost_tmp_0 AS SELECT Y- (-4842.635841853514) AS s, 1 AS c, item_nbr AS item_nbr, tid AS tid
FROM sales

0.0015192031860351562
CREATE OR REPLACE TABLE joinboost_tmp_1 AS SELECT SUM(s) AS s, SUM(c) AS c, joinboost_tmp_0.item_nbr AS item_nbr
FROM joinboost_tmp_0
GROUP BY joinboost_tmp_0.item_nbr

0.0026786327362060547
CREATE OR REPLACE TABLE joinboost_tmp_2 AS SELECT SUM(s) AS s, SUM(c) AS c, joinboost_tmp_0.tid AS tid
FROM joinboost_tmp_0
GROUP BY joinboost_tmp_0.tid

0.00576019287109375
CREATE OR REPLACE TABLE joinboost_tmp_3 AS SELECT SUM(s) AS s, SUM(c) AS c, transactions.store_nbr AS store_nbr
FROM joi

2535.0608391585865