# >>> Magic Commands in a Nutshell <<<

<h2>See what is available per line, and per cell</h2>

In [9]:
# List the line magics and cell magics available in this IPython session.
%lsmagic

Available line magics:
%alias  %alias_magic  %autocall  %automagic  %autosave  %bookmark  %cat  %cd  %clear  %colors  %config  %connect_info  %cp  %debug  %dhist  %dirs  %doctest_mode  %ed  %edit  %env  %gui  %hist  %history  %install_default_config  %install_ext  %install_profiles  %killbgscripts  %ldir  %less  %lf  %lk  %ll  %load  %load_ext  %loadpy  %logoff  %logon  %logstart  %logstate  %logstop  %ls  %lsmagic  %lx  %macro  %magic  %man  %matplotlib  %mkdir  %more  %mv  %notebook  %page  %pastebin  %pdb  %pdef  %pdoc  %pfile  %pinfo  %pinfo2  %popd  %pprint  %precision  %profile  %prun  %psearch  %psource  %pushd  %pwd  %pycat  %pylab  %qtconsole  %quickref  %recall  %rehashx  %reload_ext  %rep  %rerun  %reset  %reset_selective  %rm  %rmdir  %run  %save  %sc  %set_env  %store  %sx  %system  %tb  %time  %timeit  %unalias  %unload_ext  %who  %who_ls  %whos  %xdel  %xmode

Available cell magics:
%%!  %%HTML  %%SVG  %%bash  %%capture  %%debug  %%file  %%html  %%javascript  %%latex  %%

<h2>In case you need to display a "complex" formula</h2>

In [1]:
%%latex
$$c^{i+1}_t= \log \frac{p^{i+1}_t}{(1-p^{i+1}_t)} + \log \frac{(1-u^{i+1}_t)}{u^{i+1}_t}$$

<IPython.core.display.Latex object>

<h2>In case you need to execute a shell command</h2>

In [29]:
%%sh
# Run the system `date` command in a shell subprocess and show its output.
date

Tue Apr  4 02:51:41 CDT 2017


# >>> Working with DataFrames <<<

In [1]:
import pixiedust

# Remote CSV files used as sample data sets in the rest of the notebook.
HOMES_CSV_URL = "https://openobjectstore.mybluemix.net/misc/milliondollarhomes.csv"
CARS_CSV_URL = "https://github.com/ibm-cds-labs/open-data/raw/master/cars/cars.csv"

# sampleData() downloads each CSV and wraps it in a pySpark DataFrame
# (see the progress messages printed below the cell).
homes = pixiedust.sampleData(HOMES_CSV_URL)
cars = pixiedust.sampleData(CARS_CSV_URL)

Pixiedust database opened successfully


Downloading 'https://openobjectstore.mybluemix.net/misc/milliondollarhomes.csv' from https://openobjectstore.mybluemix.net/misc/milliondollarhomes.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Creating pySpark DataFrame for 'https://openobjectstore.mybluemix.net/misc/milliondollarhomes.csv'. Please wait...
Successfully created pySpark DataFrame for 'https://openobjectstore.mybluemix.net/misc/milliondollarhomes.csv'
Downloading 'https://github.com/ibm-cds-labs/open-data/raw/master/cars/cars.csv' from https://github.com/ibm-cds-labs/open-data/raw/master/cars/cars.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Creating pySpark DataFrame for 'https://github.com/ibm-cds-labs/open-data/raw/master/cars/cars.csv'. Please wait...
Successfully created pySpark DataFrame for 'https://github.com/ibm-cds-labs/open-data/raw/master/cars/cars.csv'


In [5]:
# Peek at the first five rows of the cars DataFrame (returned as a list of Row objects).
cars.head(5)

[Row(mpg=18.0, cylinders=8, engine=307.0, horsepower=130, weight=3504, acceleration=12.0, year=70, origin=u'American', name=u'chevrolet chevelle malibu'),
 Row(mpg=15.0, cylinders=8, engine=350.0, horsepower=165, weight=3693, acceleration=11.5, year=70, origin=u'American', name=u'buick skylark 320'),
 Row(mpg=18.0, cylinders=8, engine=318.0, horsepower=150, weight=3436, acceleration=11.0, year=70, origin=u'American', name=u'plymouth satellite'),
 Row(mpg=16.0, cylinders=8, engine=304.0, horsepower=150, weight=3433, acceleration=12.0, year=70, origin=u'American', name=u'amc rebel sst'),
 Row(mpg=17.0, cylinders=8, engine=302.0, horsepower=140, weight=3449, acceleration=10.5, year=70, origin=u'American', name=u'ford torino')]

In [7]:
# Project the three columns used below and keep only the rows whose
# origin is 'American'.
american_cars = (
    cars
    .select(cars.cylinders, cars.mpg, cars.origin)
    .filter(cars.origin == u'American')
)

In [8]:
# Total number of rows that survived the origin filter.
american_cars.count()

254

<h2>Per group row count</h2>

In [9]:
# Number of American cars for each cylinder count.
# groupBy() makes no promise about the order of the resulting rows, so sort
# explicitly to keep the printed table stable from run to run.
(
    american_cars
    .groupBy("cylinders")
    .count()
    .orderBy("cylinders")
    .show()
)

+---------+-----+
|cylinders|count|
+---------+-----+
|        4|   72|
|        6|   74|
|        8|  108|
+---------+-----+



<h2>Per group aggregation</h2>

In [10]:
# Best (maximum) mpg among American cars for each cylinder count.
# groupBy() makes no promise about the order of the resulting rows, so sort
# explicitly to keep the printed table stable from run to run.
(
    american_cars
    .groupBy("cylinders")
    .max("mpg")
    .orderBy("cylinders")
    .show()
)

+---------+--------+
|cylinders|max(mpg)|
+---------+--------+
|        4|    39.0|
|        6|    38.0|
|        8|    26.6|
+---------+--------+



<h2>A composite where clause, i.e., a composite filter</h2>

In [14]:
# Both conditions live in a single SQL-style predicate string:
# at least 6 cylinders AND at most 16 mpg.
big_engine_low_mpg = american_cars.filter("cylinders >= 6 and mpg <= 16")
display(big_engine_low_mpg)

cylinders,mpg,origin
8,15.0,American
8,16.0,American
8,15.0,American
8,14.0,American
8,14.0,American
8,14.0,American
8,15.0,American
8,16.0,American
8,16.0,American
8,15.0,American


# Joining DataFrames

<h2>We have two DataFrames df1 and df2.</h2> 

<h2>Contents:</h2>
<ul>
<li><h3><i>df1 contains sales data, i.e., the id of a specific product sold, the quantity sold, 
and the date the sale was made.</i></h3></li>
<li><h3><i>df2 contains product data, i.e., the id of each product and its price.</i></h3></li>
</ul>

<h2>Their schemas:</h2>
<ul>
<li><h3><i>df1 has an INT-valued id column, an INT-valued quantity column, and a STRING-valued 
date column.</i></h3></li>
<li><h3><i>df2 has an INT-valued id column and a FLOAT-valued price column.</i></h3></li>
</ul>

<h3>
Question: Write down the code to compute total revenue (which is total_quantity x price) 
per product using DataFrame operations.
</h3>

In [18]:
from pyspark.sql import Row

# One tuple per sale: (date of sale, product id, quantity sold).
sales_records = [
    ('March 1, 2017', 1, 2),
    ('March 2, 2017', 1, 5),
    ('March 2, 2017', 2, 1),
    ('March 3, 2017', 3, 10),
    ('March 7, 2017', 1, 4),
    ('March 7, 2017', 2, 4),
    ('March 7, 2017', 3, 2),
    ('March 10, 2017', 4, 1),
    ('March 15, 2017', 3, 5),
    ('March 15, 2017', 2, 1),
    ('March 20, 2017', 4, 2),
    ('March 30, 2017', 2, 5),
    ('March 30, 2017', 3, 3),
    ('March 30, 2017', 1, 5),
]
d1 = [Row(date=sale_date, id=product_id, quantity=units)
      for sale_date, product_id, units in sales_records]

# One tuple per product: (product id, unit price).
# NOTE(review): the exercise text describes price as FLOAT, but these literals
# are Python ints -- confirm which type is intended.
price_records = [(1, 10), (2, 8), (3, 5), (4, 2)]
d2 = [Row(id=product_id, price=unit_price)
      for product_id, unit_price in price_records]

# Turn the Row lists into DataFrames: df1 holds the sales, df2 the products.
df1 = sqlContext.createDataFrame(d1)
df2 = sqlContext.createDataFrame(d2)

In [22]:
# Step 1: attach each product's price to every sale and compute the revenue
# of that individual sale (quantity * price).
sale_revenue = (
    df1
    .join(df2, df1.id == df2.id)
    .select(df1.id, (df1.quantity * df2.price).alias("rev"))
)

# Step 2: add up the per-sale revenue for each product. Summing
# quantity * price per product equals total_quantity * price because the
# price is constant within a product.
# groupBy() makes no promise about row order, so sort by id to keep the
# printed table stable from run to run.
(
    sale_revenue
    .groupBy("id")
    .sum("rev")
    .withColumnRenamed("sum(rev)", "TotalRevenue")
    .orderBy("id")
    .show()
)

+---+------------+
| id|TotalRevenue|
+---+------------+
|  1|         160|
|  2|          88|
|  3|         100|
|  4|           6|
+---+------------+

