## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# File location and type
file_location = "/FileStore/tables/data_with_features.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ";"

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

import pyspark.sql.functions as F

df = df.withColumn("decision_date", F.to_date("decision_date", "yyyy-MM-dd HH:mm:ss"))
df = df.withColumn("registration_date", F.to_date("registration_date", "yyyy-MM-dd HH:mm:ss"))

# Create a view or table
temp_table_name = "SQL_data"

df.createOrReplaceTempView(temp_table_name)

df.dtypes

In [0]:
df.withColumn(
    "brand2",
    col("brand").rlike("MERCED")).show()

In [0]:
from pyspark.sql.functions import *
df = df.withColumn('brand2', regexp_extract(col('brand'), 'MCLAREN', 1))
temp_table_name = "SQL_data"

df.createOrReplaceTempView(temp_table_name)

In [0]:
%sql
select mean(car_tax), mean(power_KW), brand2 from SQL_data group by brand2
                                          order by mean(car_tax) desc

In [0]:
%sql

/* Query the created temp table in a SQL cell */

select COUNT(decision_date), decision_date from SQL_data group by decision_date order by decision_date

count(decision_date),decision_date
72,2012-01-02
112,2012-01-03
114,2012-01-04
72,2012-01-05
76,2012-01-09
77,2012-01-10
72,2012-01-11
68,2012-01-12
89,2012-01-13
77,2012-01-16


In [0]:


%sql
select COUNT(brand), brand from SQL_data group by brand
                                

count(brand),brand
11,STUDEBAKER
1,DMC
1,EXCALIBUR
4,MG ROVER
111,ROVER
38,LAMBORGHINI
1,ARMSTRONG
2468,PORSCHE
3,ALPINA
2,DE LOREAN


In [0]:
%sql
select COUNT(decision_date), year(decision_date), month(decision_date) from SQL_data 
                                  group by year(decision_date), month(decision_date)
                                  order by year(decision_date), month(decision_date)

count(decision_date),year(decision_date),month(decision_date)
1779,2012,1
1685,2012,2
2536,2012,3
1999,2012,4
1909,2012,5
1485,2012,6
1713,2012,7
2413,2012,8
1683,2012,9
1988,2012,10


In [0]:
%sql
select count(brand), brand from SQL_data group by brand
                                          order by count(brand) desc
                                  


count(brand),brand
57056,MERCEDES-BENZ
39400,BMW
31749,VOLVO
22398,AUDI
20766,VOLKSWAGEN
10891,TOYOTA
8760,FORD
4997,MITSUBISHI
4106,SKODA
4082,SAAB


In [0]:
%sql
select sum(car_tax), brand from SQL_data group by brand
                                          order by sum(car_tax) desc

sum(car_tax),brand
347156110.4600002,MERCEDES-BENZ
239683517.67000017,BMW
143020435.45999995,AUDI
130071362.1799998,VOLVO
73004072.09000006,VOLKSWAGEN
37694741.81999997,PORSCHE
34997085.95000003,TOYOTA
28884757.06,FORD
17870474.18,LAND ROVER
16235486.660000002,LEXUS


In [0]:
%sql

update SQL_data SET brand=REGEXP_REPLACE(brand, 'MC%', 'MCLAREN') WHERE brand REGEXP 'MC%'

In [0]:
%sql
select mean(car_tax), mean(power_KW), brand from SQL_data group by brand
                                          order by mean(car_tax) desc

mean(car_tax),mean(power_KW),brand
124859.82,419.0,MC LAREN
96159.69,446.0,MCLAREN
60134.77947368421,386.6315789473685,LAMBORGHINI
57408.72823170732,344.2125,FERRARI
48762.13333333333,360.55555555555554,MERCEDES-AMG
46953.1525,438.75,MAYBACH
45375.63,250.0,LANDROVER
35886.18525,318.02564102564105,ASTON MARTIN
35363.67818181818,364.4270833333333,BENTLEY
33499.2752631579,215.7142857142857,ROLLS ROYCE


In [0]:
%sql
SELECT brand FROM SQL_data WHERE brand like '%SAAB%' group by brand

brand
SAAB
SAAB-LANCIA
SAAB 9-3


In [0]:
%sql
SELECT brand FROM SQL_data WHERE brand like '%HUYNDAI%' or brand like 'HYUNDAI' group by brand

brand
HYUNDAI
HUYNDAI


In [0]:
%sql
SELECT brand FROM SQL_data WHERE brand like 'TESLA%' group by brand

brand
TESLA
TESLA MOTORS


In [0]:
%sql
SELECT brand FROM SQL_data WHERE brand like '%MERCED%' or brand like 'MB' or brand like '%M-B' group by brand

brand
MERCEDES BENZ REMETZ
MERCEDES BENZ
MERCEDES-BENZ
MERCEDES
MERCEDES AMG
M-B
MB
MERCEDEZ BENZ
MERCEDEZS BENZ
MERCED-BENZ


In [0]:
%sql
SELECT brand FROM SQL_data WHERE brand like '%FERR%' group by brand

brand
FERRARI
FERRARI 360 F


In [0]:
%sql
SELECT brand FROM SQL_data WHERE brand like 'POR%' group by brand

brand
PORSCHE
PORCHE
PORCSHE
PORSHE


In [0]:
%sql
SELECT brand FROM SQL_data WHERE brand like 'ROL%' group by brand

brand
ROLLS ROOYCE
ROLLS ROYCE
ROLLS-ROYCE
ROLLS ROYES


In [0]:
%sql
SELECT brand FROM SQL_data WHERE brand like '%ROV%' group by brand

brand
MG ROVER
ROVER
RANGE ROVER
ROVER MG
LANDROVER
LAND ROVER
MG ROVER ZT-T V8
MG ROVER MGF
RANGE-ROVER
JAGUAR LAND ROVER


In [0]:
%sql
SELECT brand FROM SQL_data WHERE brand like '%JAG%' group by brand

brand
JAGUAR
JAGUAR LAND ROVER
JAGUAR LAND ROVER L
JAGUAR LAND ROVER LI


In [0]:
%sql
SELECT brand FROM SQL_data WHERE brand like 'BMW%' or brand like 'ALPINA' group by brand

brand
ALPINA
BMW
BMW ALPINA
