%md-sandbox

<div style="text-align: center; line-height: 0; padding-top: 9px;">
  <img src="https://databricks.com/wp-content/uploads/2018/03/db-academy-rgb-1200px.png" alt="Databricks Learning" style="width: 600px">
</div>

In [0]:
%run ../Includes/Classroom-Setup

In [0]:
%sql
-- (Re)create People10M on top of the people-10m CSV file.
-- The first line of the file is treated as the header row; no schema is
-- declared here, so column types are whatever the CSV reader assigns.
DROP TABLE IF EXISTS People10M;
CREATE TABLE People10M USING csv OPTIONS (
  path "/mnt/training/dataframes/people-10m.csv",
  header "true"
);

-- (Re)create ssaNames on top of the SSA names Parquet data.
-- Parquet files carry their own schema, so no `header` option is needed:
-- `header` is a CSV-reader option and is not used by the Parquet source.
DROP TABLE IF EXISTS ssaNames;
CREATE TABLE ssaNames USING parquet OPTIONS (
  path "/mnt/training/ssn/names.parquet"
);

### Catalog Error

In [0]:
%sql
-- Name and birth date of every person with gender 'F' whose birth year is
-- later than 1990 (i.e. 1991 onwards).
SELECT firstName, lastName, birthDate
FROM People10M
WHERE gender = 'F'
  AND year(birthDate) > 1990

firstName,lastName,birthDate
An,Cowper,1992-02-08T05:00:00.000Z
Caroyln,Cardon,1994-05-15T04:00:00.000Z
Yesenia,Goldring,1997-07-09T04:00:00.000Z
Hedwig,Pendleberry,1998-12-02T05:00:00.000Z
Kala,Lyfe,1994-06-23T04:00:00.000Z
Gussie,McKeeman,1991-11-15T05:00:00.000Z
Pansy,Shrieves,1991-05-24T04:00:00.000Z
Chung,Dautry,1998-01-12T05:00:00.000Z
Erica,O'Drought,1991-03-08T05:00:00.000Z
Katelyn,Pocklington,1994-01-16T05:00:00.000Z


### Plan Optimization Example

In [0]:
%sql
-- View `joined`: one row per (person, ssaNames row) pair that shares a first
-- name, exposing the person's first name and birth date truncated to a DATE.
CREATE OR REPLACE TEMPORARY VIEW joined AS
SELECT
  p.firstName,
  to_date(p.birthDate) AS date
FROM People10m AS p
INNER JOIN ssaNames AS s
  ON p.firstName = s.firstName;

-- View `filtered`: name counts over `joined`, restricted to birth dates on or
-- after 1980-01-01.
-- NOTE: the grouping key is (firstName, date) but only firstName is projected,
-- so the same first name appears once per distinct date.
CREATE OR REPLACE TEMPORARY VIEW filtered AS
SELECT firstName, count(firstName)
FROM joined
WHERE date >= "1980-01-01"
GROUP BY firstName, date;


In [0]:
%sql
-- Read the whole `filtered` view (this cell runs before the view is cached).
SELECT *
FROM filtered;

firstName,count(firstName)
Ellan,49
Charline,117
Latisha,72
Tonita,73
Gwenn,76
Nidia,67
Torri,91
Hannah,170
Justine,567
Allene,132


In [0]:
%sql
-- Cache the contents of the `filtered` view so that the queries in the
-- following cells can be served from the cache.
CACHE TABLE filtered;

In [0]:
%sql
-- Same full read of `filtered` as before, now issued after CACHE TABLE.
SELECT *
FROM filtered;

firstName,count(firstName)
Ellan,49
Charline,117
Latisha,72
Tonita,73
Gwenn,76
Nidia,67
Torri,91
Hannah,170
Justine,567
Allene,132


In [0]:
%sql
-- Look up a single first name in `filtered` while the view is cached.
SELECT *
FROM filtered
WHERE firstName = "Latisha";

firstName,count(firstName)
Latisha,72
Latisha,72
Latisha,72
Latisha,72
Latisha,72
Latisha,72
Latisha,72
Latisha,72
Latisha,72
Latisha,72


In [0]:
%sql
-- Drop `filtered` from the cache. IF EXISTS keeps the cell from failing when
-- the view is not defined (e.g. when cells are run out of order).
UNCACHE TABLE IF EXISTS filtered;

In [0]:
%sql
-- Repeat the single-name lookup after the view has been removed from the cache.
SELECT *
FROM filtered
WHERE firstName = "Latisha";

firstName,count(firstName)
Latisha,72
Latisha,72
Latisha,72
Latisha,72
Latisha,72
Latisha,72
Latisha,72
Latisha,72
Latisha,72
Latisha,72


## Set Partitions

In [0]:
%sql
-- (Re)create bikeShare on top of the hourly bike-sharing CSV file, using the
-- file's first line as the header row.
DROP TABLE IF EXISTS bikeShare;
CREATE TABLE bikeShare USING csv OPTIONS (
  path "/mnt/training/bikeSharing/data-001/hour.csv",
  header "true"
)

In [0]:
%sql
-- All columns of the un-partitioned bikeShare table for the rows where hr is 10.
SELECT *
FROM bikeShare
WHERE hr = 10

instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
11,2011-01-01,1,0,1,10,0,6,0,1,0.38,0.3939,0.76,0.2537,12,24,36
34,2011-01-02,1,0,1,10,0,0,0,2,0.36,0.3485,0.81,0.2239,7,46,53
56,2011-01-03,1,0,1,10,0,1,1,1,0.18,0.1667,0.43,0.2537,11,33,44
79,2011-01-04,1,0,1,10,0,2,1,2,0.16,0.1364,0.69,0.3284,5,37,42
102,2011-01-05,1,0,1,10,0,3,1,1,0.22,0.197,0.37,0.3284,4,53,57
125,2011-01-06,1,0,1,10,0,4,1,1,0.2,0.2576,0.47,0.0,3,42,45
148,2011-01-07,1,0,1,10,0,5,1,1,0.22,0.197,0.37,0.3284,16,47,63
172,2011-01-08,1,0,1,10,0,6,0,2,0.18,0.197,0.8,0.1642,5,56,61
196,2011-01-09,1,0,1,10,0,0,0,1,0.14,0.1061,0.43,0.3881,0,49,49
220,2011-01-10,1,0,1,10,0,1,1,2,0.14,0.1212,0.5,0.2985,0,31,31


An unrealistic job for Spark is:

In [0]:
%sql
-- (Re)create bikeShare_partitioned as a copy of a subset of bikeShare's
-- columns, partitioned by hour. The source column `hr` is renamed to `p_hr`
-- and used as the partition column.
DROP TABLE IF EXISTS bikeShare_partitioned;
CREATE TABLE bikeShare_partitioned
PARTITIONED BY (p_hr) AS
SELECT
  instant,
  dteday,
  season,
  yr,
  mnth,
  hr AS p_hr,
  holiday,
  weekday,
  workingday,
  weathersit,
  temp
FROM bikeShare

In [0]:
%sql
SELECT * FROM bikeShare_partitioned WHERE p_hr = 10

instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,p_hr
11,2011-01-01,1,0,1,0,6,0,1,0.38,10
34,2011-01-02,1,0,1,0,0,0,2,0.36,10
56,2011-01-03,1,0,1,0,1,1,1,0.18,10
79,2011-01-04,1,0,1,0,2,1,2,0.16,10
102,2011-01-05,1,0,1,0,3,1,1,0.22,10
125,2011-01-06,1,0,1,0,4,1,1,0.2,10
148,2011-01-07,1,0,1,0,5,1,1,0.22,10
172,2011-01-08,1,0,1,0,6,0,2,0.18,10
196,2011-01-09,1,0,1,0,0,0,1,0.14,10
220,2011-01-10,1,0,1,0,1,1,2,0.14,10


## Beware of small files!

In [0]:
%sql
-- Small-files demonstration: (re)create bikeShare_parquet as a copy of a
-- subset of bikeShare's columns, partitioned by `instant` (renamed p_instant).
-- NOTE(review): `instant` looks like a per-row identifier, so this presumably
-- produces a very large number of tiny partitions and is expected to be slow;
-- confirm that is the intended point of this cell before running it.
--
-- The table that is dropped and the table that is created are the same name,
-- so the cell can be re-run and does not collide with the existing bikeShare
-- table. The source is the bikeShare table defined earlier in this notebook.
DROP TABLE IF EXISTS bikeShare_parquet;
CREATE TABLE bikeShare_parquet
PARTITIONED BY (p_instant)
  AS
SELECT
  instant AS p_instant,
  dteday,
  season,
  yr,
  mnth,
  hr,        -- comma required: `hr holiday` would alias hr and drop holiday
  holiday,
  weekday,
  workingday,
  weathersit,
  temp
FROM
  bikeShare

In [0]:
%run ../Includes/Classroom-Cleanup

## Citations
Bike Sharing Data<br>

[1] Fanaee-T, Hadi, and Gama, Joao, Event labeling combining ensemble detectors and background knowledge, Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg, doi:10.1007/s13748-013-0040-3.

@article{ year={2013}, issn={2192-6352}, journal={Progress in Artificial Intelligence}, doi={10.1007/s13748-013-0040-3}, title={Event labeling combining ensemble detectors and background knowledge}, url={http://dx.doi.org/10.1007/s13748-013-0040-3}, publisher={Springer Berlin Heidelberg}, keywords={Event labeling; Event detection; Ensemble learning; Background knowledge}, author={Fanaee-T, Hadi and Gama, Joao}, pages={1-15} }

%md-sandbox
&copy; 2020 Databricks, Inc. All rights reserved.<br/>
Apache, Apache Spark, Spark and the Spark logo are trademarks of the <a href="http://www.apache.org/">Apache Software Foundation</a>.<br/>
<br/>
<a href="https://databricks.com/privacy-policy">Privacy Policy</a> | <a href="https://databricks.com/terms-of-use">Terms of Use</a> | <a href="http://help.databricks.com/">Support</a>