# Data Processing

## Import modules

#### 執行 row 2、4即可以跑

In [1]:
# import SparkSession
from pyspark.sql import SparkSession

In [1]:
import pyspark.sql.functions as fn 
from pyspark.sql.types import StringType,DoubleType,IntegerType

## Set spark session

In [3]:
# only for standalone(lightweight) pyspark
# create spark session object
spark=SparkSession.builder.appName('data_processing').getOrCreate()

In [2]:
spark.sparkContext.appName

'PySparkShell'

## Load data

In [3]:
# Load csv Dataset 
df=spark.read.csv('data/sample_data.csv',inferSchema=True,header=True)
df.createOrReplaceTempView("dfTable")

## Inspect data

In [6]:
# columns of dataframe
df.columns

['ratings', 'age', 'experience', 'family', 'mobile']

In [7]:
# shape of dataset
df.count(),len(df.columns)

(33, 5)

In [8]:
# print dataframe schema
df.printSchema()

root
 |-- ratings: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: double (nullable = true)
 |-- family: integer (nullable = true)
 |-- mobile: string (nullable = true)



In [9]:
# display first few rows of dataframe
df.show()
#df.show(10)

+-------+---+----------+------+-------+
|ratings|age|experience|family| mobile|
+-------+---+----------+------+-------+
|      3| 32|       9.0|     3|   Vivo|
|      3| 27|      13.0|     3|  Apple|
|      4| 22|       2.5|     0|Samsung|
|      4| 37|      16.5|     4|  Apple|
|      5| 27|       9.0|     1|     MI|
|      4| 27|       9.0|     0|   Oppo|
|      5| 37|      23.0|     5|   Vivo|
|      5| 37|      23.0|     5|Samsung|
|      3| 22|       2.5|     0|  Apple|
|      3| 27|       6.0|     0|     MI|
|      2| 27|       6.0|     2|   Oppo|
|      5| 27|       6.0|     2|Samsung|
|      3| 37|      16.5|     5|  Apple|
|      5| 27|       6.0|     0|     MI|
|      4| 22|       6.0|     1|   Oppo|
|      4| 37|       9.0|     2|Samsung|
|      4| 27|       6.0|     1|  Apple|
|      1| 37|      23.0|     5|     MI|
|      2| 42|      23.0|     2|   Oppo|
|      4| 37|       6.0|     0|   Vivo|
+-------+---+----------+------+-------+
only showing top 20 rows



## Grouping data

In [7]:
df.groupBy('mobile').count().explain(extended=True)

== Parsed Logical Plan ==
'Aggregate ['mobile], [unresolvedalias('mobile, None), count(1) AS count#65L]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Analyzed Logical Plan ==
mobile: string, count: bigint
Aggregate [mobile#20], [mobile#20, count(1) AS count#65L]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Optimized Logical Plan ==
Aggregate [mobile#20], [mobile#20, count(1) AS count#65L]
+- Project [mobile#20]
   +- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Physical Plan ==
*(2) HashAggregate(keys=[mobile#20], functions=[count(1)], output=[mobile#20, count#65L])
+- Exchange hashpartitioning(mobile#20, 200), ENSURE_REQUIREMENTS, [id=#94]
   +- *(1) HashAggregate(keys=[mobile#20], functions=[partial_count(1)], output=[mobile#20, count#69L])
      +- FileScan csv [mobile#20] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/home/hadoop/sparkcodes/data/sample_data.csv], Partiti

In [10]:
# group by one column
df.groupBy('mobile').count().show(5)

+-------+-----+
| mobile|count|
+-------+-----+
|     MI|    8|
|   Oppo|    7|
|Samsung|    6|
|   Vivo|    5|
|  Apple|    7|
+-------+-----+



In [9]:
# sort value counts
df.groupBy('mobile').count().orderBy('count',ascending=False).explain(extended=True)

== Parsed Logical Plan ==
'Sort ['count DESC NULLS LAST], true
+- Aggregate [mobile#20], [mobile#20, count(1) AS count#102L]
   +- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Analyzed Logical Plan ==
mobile: string, count: bigint
Sort [count#102L DESC NULLS LAST], true
+- Aggregate [mobile#20], [mobile#20, count(1) AS count#102L]
   +- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Optimized Logical Plan ==
Sort [count#102L DESC NULLS LAST], true
+- Aggregate [mobile#20], [mobile#20, count(1) AS count#102L]
   +- Project [mobile#20]
      +- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Physical Plan ==
*(3) Sort [count#102L DESC NULLS LAST], true, 0
+- Exchange rangepartitioning(count#102L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [id=#143]
   +- *(2) HashAggregate(keys=[mobile#20], functions=[count(1)], output=[mobile#20, count#102L])
      +- Exchange hashpartitioning(mobile#20, 200), ENSURE_REQUIREMENTS, [id

In [11]:
# sort value counts
df.groupBy('mobile').count().orderBy('count',ascending=False).show(5)

+-------+-----+
| mobile|count|
+-------+-----+
|     MI|    8|
|   Oppo|    7|
|  Apple|    7|
|Samsung|    6|
|   Vivo|    5|
+-------+-----+



In [12]:
# calculate statistical measures
df.groupBy('mobile').mean().show(5)

+-------+------------------+------------------+------------------+------------------+
| mobile|      avg(ratings)|          avg(age)|   avg(experience)|       avg(family)|
+-------+------------------+------------------+------------------+------------------+
|     MI|               3.5|            30.125|           10.1875|             1.375|
|   Oppo| 2.857142857142857|28.428571428571427|10.357142857142858|1.4285714285714286|
|Samsung| 4.166666666666667|28.666666666666668| 8.666666666666666|1.8333333333333333|
|   Vivo|               4.2|              36.0|              11.4|               1.8|
|  Apple|3.4285714285714284|30.571428571428573|              11.0|2.7142857142857144|
+-------+------------------+------------------+------------------+------------------+



In [13]:
# calculate statistical measures
df.groupBy('mobile').sum().show()

+-------+------------+--------+---------------+-----------+
| mobile|sum(ratings)|sum(age)|sum(experience)|sum(family)|
+-------+------------+--------+---------------+-----------+
|     MI|          28|     241|           81.5|         11|
|   Oppo|          20|     199|           72.5|         10|
|Samsung|          25|     172|           52.0|         11|
|   Vivo|          21|     180|           57.0|          9|
|  Apple|          24|     214|           77.0|         19|
+-------+------------+--------+---------------+-----------+



In [8]:
# calculate statistical measures
df.groupBy('mobile').max().explain(extended=True)

== Parsed Logical Plan ==
'Aggregate ['mobile], [unresolvedalias('mobile, None), max(ratings#16) AS max(ratings)#79, max(age#17) AS max(age)#80, max(experience#18) AS max(experience)#81, max(family#19) AS max(family)#82]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Analyzed Logical Plan ==
mobile: string, max(ratings): int, max(age): int, max(experience): double, max(family): int
Aggregate [mobile#20], [mobile#20, max(ratings#16) AS max(ratings)#79, max(age#17) AS max(age)#80, max(experience#18) AS max(experience)#81, max(family#19) AS max(family)#82]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Optimized Logical Plan ==
Aggregate [mobile#20], [mobile#20, max(ratings#16) AS max(ratings)#79, max(age#17) AS max(age)#80, max(experience#18) AS max(experience)#81, max(family#19) AS max(family)#82]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Physical Plan ==
*(2) HashAggregate(keys=[mobile#20], functions=[ma

In [14]:
# calculate statistical measures
df.groupBy('mobile').max().show()

+-------+------------+--------+---------------+-----------+
| mobile|max(ratings)|max(age)|max(experience)|max(family)|
+-------+------------+--------+---------------+-----------+
|     MI|           5|      42|           23.0|          5|
|   Oppo|           4|      42|           23.0|          2|
|Samsung|           5|      37|           23.0|          5|
|   Vivo|           5|      37|           23.0|          5|
|  Apple|           4|      37|           16.5|          5|
+-------+------------+--------+---------------+-----------+



In [15]:
# calculate statistical measures
df.groupBy('mobile').min().show()

+-------+------------+--------+---------------+-----------+
| mobile|min(ratings)|min(age)|min(experience)|min(family)|
+-------+------------+--------+---------------+-----------+
|     MI|           1|      27|            2.5|          0|
|   Oppo|           2|      22|            6.0|          0|
|Samsung|           2|      22|            2.5|          0|
|   Vivo|           3|      32|            6.0|          0|
|  Apple|           3|      22|            2.5|          0|
+-------+------------+--------+---------------+-----------+



In [16]:
# use spark sql
spark.sql('''select mobile, count(*) as count from dfTable
        group by mobile''').show()

+-------+-----+
| mobile|count|
+-------+-----+
|     MI|    8|
|   Oppo|    7|
|Samsung|    6|
|   Vivo|    5|
|  Apple|    7|
+-------+-----+



In [17]:
# use spark sql
spark.sql('''select mobile, min(experience), min(age) from dfTable
        group by mobile''').show()

+-------+---------------+--------+
| mobile|min(experience)|min(age)|
+-------+---------------+--------+
|     MI|            2.5|      27|
|   Oppo|            6.0|      22|
|Samsung|            2.5|      22|
|   Vivo|            6.0|      32|
|  Apple|            2.5|      22|
+-------+---------------+--------+



In [12]:
df.groupBy('mobile').min('experience','age').show()

+-------+---------------+--------+
| mobile|min(experience)|min(age)|
+-------+---------------+--------+
|     MI|            2.5|      27|
|   Oppo|            6.0|      22|
|Samsung|            2.5|      22|
|   Vivo|            6.0|      32|
|  Apple|            2.5|      22|
+-------+---------------+--------+



In [18]:
# Aggregation
df.groupBy('mobile').agg({'experience':'sum'}).show()

+-------+---------------+
| mobile|sum(experience)|
+-------+---------------+
|     MI|           81.5|
|   Oppo|           72.5|
|Samsung|           52.0|
|   Vivo|           57.0|
|  Apple|           77.0|
+-------+---------------+



In [18]:
# Aggregation
df.groupBy('mobile').agg({'experience':'sum'}).show()

+-------+---------------+
| mobile|sum(experience)|
+-------+---------------+
|     MI|           81.5|
|   Oppo|           72.5|
|Samsung|           52.0|
|   Vivo|           57.0|
|  Apple|           77.0|
+-------+---------------+



## Compare 

In [13]:
df.groupBy('mobile').min('experience','age').explain(extended=True)

== Parsed Logical Plan ==
'Aggregate ['mobile], [unresolvedalias('mobile, None), min(experience#18) AS min(experience)#173, min(age#17) AS min(age)#174]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Analyzed Logical Plan ==
mobile: string, min(experience): double, min(age): int
Aggregate [mobile#20], [mobile#20, min(experience#18) AS min(experience)#173, min(age#17) AS min(age)#174]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Optimized Logical Plan ==
Aggregate [mobile#20], [mobile#20, min(experience#18) AS min(experience)#173, min(age#17) AS min(age)#174]
+- Project [age#17, experience#18, mobile#20]
   +- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Physical Plan ==
*(2) HashAggregate(keys=[mobile#20], functions=[min(experience#18), min(age#17)], output=[mobile#20, min(experience)#173, min(age)#174])
+- Exchange hashpartitioning(mobile#20, 200), ENSURE_REQUIREMENTS, [id=#237]
   +- *(1) HashAggregate(key

In [14]:
# use spark sql
spark.sql('''select mobile, min(experience), min(age) from dfTable
        group by mobile''').explain(extended=True)

== Parsed Logical Plan ==
'Aggregate ['mobile], ['mobile, unresolvedalias('min('experience), None), unresolvedalias('min('age), None)]
+- 'UnresolvedRelation [dfTable], [], false

== Analyzed Logical Plan ==
mobile: string, min(experience): double, min(age): int
Aggregate [mobile#20], [mobile#20, min(experience#18) AS min(experience)#184, min(age#17) AS min(age)#185]
+- SubqueryAlias dftable
   +- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Optimized Logical Plan ==
Aggregate [mobile#20], [mobile#20, min(experience#18) AS min(experience)#184, min(age#17) AS min(age)#185]
+- Project [age#17, experience#18, mobile#20]
   +- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Physical Plan ==
*(2) HashAggregate(keys=[mobile#20], functions=[min(experience#18), min(age#17)], output=[mobile#20, min(experience)#184, min(age)#185])
+- Exchange hashpartitioning(mobile#20, 200), ENSURE_REQUIREMENTS, [id=#256]
   +- *(1) HashAggregate(keys=[mobile#20], 

In [15]:
# Aggregation
df.groupBy('mobile').agg({'experience':'min','age':'min'}).explain(extended=True)

== Parsed Logical Plan ==
'Aggregate ['mobile], [unresolvedalias('mobile, None), 'min(experience#18) AS min(experience)#198, 'min(age#17) AS min(age)#199]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Analyzed Logical Plan ==
mobile: string, min(experience): double, min(age): int
Aggregate [mobile#20], [mobile#20, min(experience#18) AS min(experience)#198, min(age#17) AS min(age)#199]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Optimized Logical Plan ==
Aggregate [mobile#20], [mobile#20, min(experience#18) AS min(experience)#198, min(age#17) AS min(age)#199]
+- Project [age#17, experience#18, mobile#20]
   +- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Physical Plan ==
*(2) HashAggregate(keys=[mobile#20], functions=[min(experience#18), min(age#17)], output=[mobile#20, min(experience)#198, min(age)#199])
+- Exchange hashpartitioning(mobile#20, 200), ENSURE_REQUIREMENTS, [id=#275]
   +- *(1) HashAggregate(k

## Load and Inspect data

In [17]:
# Load csv Dataset
rtdf=spark.read.csv('data/online_retail_dataset.csv',inferSchema=True,header=True)
rtdf.createOrReplaceTempView("rtTable")

In [20]:
# columns of dataframe
rtdf.columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country']

In [21]:
# shape of dataset
rtdf.count(),len(rtdf.columns)

(541909, 8)

In [22]:
# print dataframe schema
rtdf.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [23]:
# display first few rows of dataframe
rtdf.show()
#rtdf.show(10)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|12/1/2010 8:26|     4.

In [18]:
rtdf.show(truncate=False)

+---------+---------+-----------------------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate   |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+--------------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |12/1/2010 8:26|2.55     |17850     |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN                |6       |12/1/2010 8:26|3.39     |17850     |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |12/1/2010 8:26|2.75     |17850     |United Kingdom|
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |12/1/2010 8:26|3.39     |17850     |United Kingdom|
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |12/1/2010 8:26|3.39     |17850     |United Kingdom|
|536365   |22752    |SET 7 BABUSHKA NESTING BOXES       

In [24]:
# counting
rtdf.select(fn.count('StockCode')).show() 

+----------------+
|count(StockCode)|
+----------------+
|          541909|
+----------------+



In [19]:
# counting
fn.count('StockCode')

Column<'count(StockCode)'>

In [25]:
# distinct count
rtdf.select(fn.countDistinct('StockCode')).show()

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+



In [20]:
# get minimum and maximum
rtdf.select(fn.min("Quantity"), fn.max("Quantity")).explain(extended=True)

== Parsed Logical Plan ==
'Project [min('Quantity) AS min(Quantity)#285, max('Quantity) AS max(Quantity)#286]
+- Relation[InvoiceNo#225,StockCode#226,Description#227,Quantity#228,InvoiceDate#229,UnitPrice#230,CustomerID#231,Country#232] csv

== Analyzed Logical Plan ==
min(Quantity): int, max(Quantity): int
Aggregate [min(Quantity#228) AS min(Quantity)#285, max(Quantity#228) AS max(Quantity)#286]
+- Relation[InvoiceNo#225,StockCode#226,Description#227,Quantity#228,InvoiceDate#229,UnitPrice#230,CustomerID#231,Country#232] csv

== Optimized Logical Plan ==
Aggregate [min(Quantity#228) AS min(Quantity)#285, max(Quantity#228) AS max(Quantity)#286]
+- Project [Quantity#228]
   +- Relation[InvoiceNo#225,StockCode#226,Description#227,Quantity#228,InvoiceDate#229,UnitPrice#230,CustomerID#231,Country#232] csv

== Physical Plan ==
*(2) HashAggregate(keys=[], functions=[min(Quantity#228), max(Quantity#228)], output=[min(Quantity)#285, max(Quantity)#286])
+- Exchange SinglePartition, ENSURE_REQUIR

In [26]:
# get minimum and maximum
rtdf.select(fn.min("Quantity"), fn.max("Quantity")).show()

+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|       -80995|        80995|
+-------------+-------------+



In [27]:
# Variance and Standard Deviation
rtdf.select(fn.var_pop('Quantity'), fn.var_samp('Quantity'),
        fn.stddev_pop('Quantity'), fn.stddev_samp('Quantity')).show()

+-----------------+------------------+--------------------+---------------------+
|var_pop(Quantity)|var_samp(Quantity)|stddev_pop(Quantity)|stddev_samp(Quantity)|
+-----------------+------------------+--------------------+---------------------+
|47559.30364660923| 47559.39140929892|  218.08095663447835|   218.08115785023455|
+-----------------+------------------+--------------------+---------------------+



In [28]:
# use sql
spark.sql('''select count(StockCode) from rtTable''').show()

+----------------+
|count(StockCode)|
+----------------+
|          541909|
+----------------+



In [21]:
# Covariance and Correlation
rtdf.select(fn.corr('InvoiceNo', 'Quantity'), fn.covar_samp('InvoiceNo', 'Quantity'),
        fn.covar_pop('InvoiceNo', 'Quantity')).explain(extended=True)

== Parsed Logical Plan ==
'Project [corr('InvoiceNo, 'Quantity) AS corr(InvoiceNo, Quantity)#336, covar_samp('InvoiceNo, 'Quantity) AS covar_samp(InvoiceNo, Quantity)#345, covar_pop('InvoiceNo, 'Quantity) AS covar_pop(InvoiceNo, Quantity)#354]
+- Relation[InvoiceNo#225,StockCode#226,Description#227,Quantity#228,InvoiceDate#229,UnitPrice#230,CustomerID#231,Country#232] csv

== Analyzed Logical Plan ==
corr(InvoiceNo, Quantity): double, covar_samp(InvoiceNo, Quantity): double, covar_pop(InvoiceNo, Quantity): double
Aggregate [corr(cast(InvoiceNo#225 as double), cast(Quantity#228 as double)) AS corr(InvoiceNo, Quantity)#336, covar_samp(cast(InvoiceNo#225 as double), cast(Quantity#228 as double)) AS covar_samp(InvoiceNo, Quantity)#345, covar_pop(cast(InvoiceNo#225 as double), cast(Quantity#228 as double)) AS covar_pop(InvoiceNo, Quantity)#354]
+- Relation[InvoiceNo#225,StockCode#226,Description#227,Quantity#228,InvoiceDate#229,UnitPrice#230,CustomerID#231,Country#232] csv

== Optimized Log

In [29]:
# Covariance and Correlation
# NOTE(review): InvoiceNo is a string column in the printed schema, and the
# analyzed plan for this query casts it to double before aggregating, so the
# figures below demonstrate the API rather than a meaningful relationship --
# confirm whether a numeric column (e.g. UnitPrice) was intended.
rtdf.select(fn.corr('InvoiceNo', 'Quantity'), fn.covar_samp('InvoiceNo', 'Quantity'),
        fn.covar_pop('InvoiceNo', 'Quantity')).show()

+-------------------------+-------------------------------+------------------------------+
|corr(InvoiceNo, Quantity)|covar_samp(InvoiceNo, Quantity)|covar_pop(InvoiceNo, Quantity)|
+-------------------------+-------------------------------+------------------------------+
|     4.912186085640497E-4|             1052.7280543915997|            1052.7260778754955|
+-------------------------+-------------------------------+------------------------------+



In [30]:
# count with groupby
rtdf.groupBy("InvoiceNo", "CustomerId").count().show(5)

+---------+----------+-----+
|InvoiceNo|CustomerId|count|
+---------+----------+-----+
|   536846|     14573|   76|
|   537026|     12395|   12|
|   537883|     14437|    5|
|   538068|     17978|   12|
|   538279|     14952|    7|
+---------+----------+-----+
only showing top 5 rows



In [31]:
# agg function
rtdf.groupBy('InvoiceNo').agg({'Quantity':'count'}).show(5)

+---------+---------------+
|InvoiceNo|count(Quantity)|
+---------+---------------+
|   536596|              6|
|   536938|             14|
|   537252|              1|
|   537691|             20|
|   538041|              1|
+---------+---------------+
only showing top 5 rows



In [32]:
# agg function
rtdf.groupBy('InvoiceNo').agg(fn.count('Quantity').alias('quan'),
        fn.expr('count(Quantity)')).show(5)

+---------+----+---------------+
|InvoiceNo|quan|count(Quantity)|
+---------+----+---------------+
|   536596|   6|              6|
|   536938|  14|             14|
|   537252|   1|              1|
|   537691|  20|             20|
|   538041|   1|              1|
+---------+----+---------------+
only showing top 5 rows



In [33]:
# agg function
rtdf.groupBy('InvoiceNo').agg({'Quantity':'min', 'UnitPrice':'max'}).show(5)

+---------+--------------+-------------+
|InvoiceNo|max(UnitPrice)|min(Quantity)|
+---------+--------------+-------------+
|   536596|         19.95|            1|
|   536938|         10.95|           20|
|   537252|          0.85|           31|
|   537691|          9.95|            2|
|   538041|           0.0|           30|
+---------+--------------+-------------+
only showing top 5 rows



In [34]:
# agg function
rtdf.groupBy('InvoiceNo').agg(fn.max('Quantity'),
        fn.min('Quantity')).show(5)

+---------+-------------+-------------+
|InvoiceNo|max(Quantity)|min(Quantity)|
+---------+-------------+-------------+
|   536596|            4|            1|
|   536938|           72|           20|
|   537252|           31|           31|
|   537691|           24|            2|
|   538041|           30|           30|
+---------+-------------+-------------+
only showing top 5 rows



## UDF

In [35]:
# UDF
from pyspark.sql.functions import udf

### Traditional Python Function

In [36]:
# normal function 
def price_range(brand):
    """Map a mobile brand name to a price-tier label.

    Returns 'High Price' for 'Samsung' or 'Apple', 'Mid Price' for 'MI',
    and 'Low Price' for every other value (matching is case-sensitive).
    """
    premium_brands = ['Samsung', 'Apple']
    if brand in premium_brands:
        return 'High Price'
    # Everything that is not premium is either the single mid-tier brand or low tier.
    return 'Mid Price' if brand == 'MI' else 'Low Price'

In [37]:
# create udf using python function
brand_udf=udf(price_range,StringType())

In [22]:
df.printSchema()

root
 |-- ratings: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: double (nullable = true)
 |-- family: integer (nullable = true)
 |-- mobile: string (nullable = true)



In [38]:
# apply udf on dataframe
df.withColumn('price_range',brand_udf(df['mobile'])).show(10,False)

+-------+---+----------+------+-------+-----------+
|ratings|age|experience|family|mobile |price_range|
+-------+---+----------+------+-------+-----------+
|3      |32 |9.0       |3     |Vivo   |Low Price  |
|3      |27 |13.0      |3     |Apple  |High Price |
|4      |22 |2.5       |0     |Samsung|High Price |
|4      |37 |16.5      |4     |Apple  |High Price |
|5      |27 |9.0       |1     |MI     |Mid Price  |
|4      |27 |9.0       |0     |Oppo   |Low Price  |
|5      |37 |23.0      |5     |Vivo   |Low Price  |
|5      |37 |23.0      |5     |Samsung|High Price |
|3      |22 |2.5       |0     |Apple  |High Price |
|3      |27 |6.0       |0     |MI     |Mid Price  |
+-------+---+----------+------+-------+-----------+
only showing top 10 rows



In [39]:
# using lambda function
# The age column is nullable, and a Python UDF receives None for a null cell;
# `None <= 30` raises TypeError in Python 3, so nulls are passed through as
# null instead of being compared. Non-null ages are labelled exactly as before:
# "young" when age <= 30, otherwise "senior".
age_udf = udf(
    lambda age: None if age is None else ("young" if age <= 30 else "senior"),
    StringType(),
)

In [40]:
# apply udf on dataframe
df.withColumn("age_group", age_udf(df.age)).show(10)

+-------+---+----------+------+-------+---------+
|ratings|age|experience|family| mobile|age_group|
+-------+---+----------+------+-------+---------+
|      3| 32|       9.0|     3|   Vivo|   senior|
|      3| 27|      13.0|     3|  Apple|    young|
|      4| 22|       2.5|     0|Samsung|    young|
|      4| 37|      16.5|     4|  Apple|   senior|
|      5| 27|       9.0|     1|     MI|    young|
|      4| 27|       9.0|     0|   Oppo|    young|
|      5| 37|      23.0|     5|   Vivo|   senior|
|      5| 37|      23.0|     5|Samsung|   senior|
|      3| 22|       2.5|     0|  Apple|    young|
|      3| 27|       6.0|     0|     MI|    young|
+-------+---+----------+------+-------+---------+
only showing top 10 rows



### Using Pandas UDF (Spark 2.x)

In [23]:
import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType # PandasUDFType => enum naming the kinds of pandas UDF (e.g. scalar) -- NOTE(review): not a data-type mapping; confirm against the PySpark docs

In [24]:
# create python function
# ** To use pandas features, define the function as a pandas function
def remaining_yrs(age):
    """Return the number of years left until age 100 (i.e. ``100 - age``).

    Works for anything that supports subtraction from an int, so the same
    function can be wrapped with ``pandas_udf`` and applied to a column.
    """
    return 100 - age

In [25]:
# create udf using python function
length_udf = pandas_udf(remaining_yrs, IntegerType())

In [26]:
# apply pandas udf on dataframe
df.withColumn('yrs_left', length_udf(df['age'])).show(5)

+-------+---+----------+------+-------+--------+
|ratings|age|experience|family| mobile|yrs_left|
+-------+---+----------+------+-------+--------+
|      3| 32|       9.0|     3|   Vivo|      68|
|      3| 27|      13.0|     3|  Apple|      73|
|      4| 22|       2.5|     0|Samsung|      78|
|      4| 37|      16.5|     4|  Apple|      63|
|      5| 27|       9.0|     1|     MI|      73|
+-------+---+----------+------+-------+--------+
only showing top 5 rows



In [27]:
# udf using two columns 
def prod(rating,exp):
    """Return the product of ``rating`` and ``exp`` (rating * exp)."""
    return rating * exp

In [28]:
# create udf using python function
# DoubleType() => the data type of the returned values
prod_udf = pandas_udf(prod, DoubleType())

In [29]:
# apply pandas udf on multiple columns of dataframe
df.withColumn("product", prod_udf(df['ratings'],df['experience'])).show(5)

+-------+---+----------+------+-------+-------+
|ratings|age|experience|family| mobile|product|
+-------+---+----------+------+-------+-------+
|      3| 32|       9.0|     3|   Vivo|   27.0|
|      3| 27|      13.0|     3|  Apple|   39.0|
|      4| 22|       2.5|     0|Samsung|   10.0|
|      4| 37|      16.5|     4|  Apple|   66.0|
|      5| 27|       9.0|     1|     MI|   45.0|
+-------+---+----------+------+-------+-------+
only showing top 5 rows



# Using NumPy inside a pandas UDF

In [31]:
import numpy as np

In [32]:
# NumPy can be used directly inside the function body (np.pi here)
def prod_pandas(rating,exp):
    """Return ``rating * exp`` scaled by pi, using NumPy's ``np.pi`` constant."""
    # Keep the left-to-right multiplication order so float results are unchanged.
    return rating * exp * np.pi

In [33]:
prod_udf = pandas_udf(prod_pandas, DoubleType())

In [34]:
df.withColumn("product", prod_udf(df['ratings'],df['experience'])).show(5)

+-------+---+----------+------+-------+------------------+
|ratings|age|experience|family| mobile|           product|
+-------+---+----------+------+-------+------------------+
|      3| 32|       9.0|     3|   Vivo| 84.82300164692441|
|      3| 27|      13.0|     3|  Apple|122.52211349000193|
|      4| 22|       2.5|     0|Samsung| 31.41592653589793|
|      4| 37|      16.5|     4|  Apple|207.34511513692635|
|      5| 27|       9.0|     1|     MI| 141.3716694115407|
+-------+---+----------+------+-------+------------------+
only showing top 5 rows



In [35]:
# use decorator
@pandas_udf(IntegerType())
def remaining_yrs2(age):
    """Pandas UDF (integer result): years left until age 100, i.e. ``100 - age``."""
    return 100 - age

In [36]:
# apply pandas udf on dataframe
df.withColumn('yrs_left', remaining_yrs2(df['age'])).show(5)

+-------+---+----------+------+-------+--------+
|ratings|age|experience|family| mobile|yrs_left|
+-------+---+----------+------+-------+--------+
|      3| 32|       9.0|     3|   Vivo|      68|
|      3| 27|      13.0|     3|  Apple|      73|
|      4| 22|       2.5|     0|Samsung|      78|
|      4| 37|      16.5|     4|  Apple|      63|
|      5| 27|       9.0|     1|     MI|      73|
+-------+---+----------+------+-------+--------+
only showing top 5 rows



In [50]:
# use decorator
@pandas_udf(DoubleType())
def prod2(rating,exp):
    """Pandas UDF (double result): the product ``rating * exp``."""
    return rating * exp

In [51]:
# apply pandas udf on multiple columns of dataframe
df.withColumn("product", prod2(df['ratings'],df['experience'])).show(5)

+-------+---+----------+------+-------+-------+
|ratings|age|experience|family| mobile|product|
+-------+---+----------+------+-------+-------+
|      3| 32|       9.0|     3|   Vivo|   27.0|
|      3| 27|      13.0|     3|  Apple|   39.0|
|      4| 22|       2.5|     0|Samsung|   10.0|
|      4| 37|      16.5|     4|  Apple|   66.0|
|      5| 27|       9.0|     1|     MI|   45.0|
+-------+---+----------+------+-------+-------+
only showing top 5 rows



### Using Pandas UDF (Spark 3.x)

In [44]:
# create pandas udf function
@pandas_udf('int')
def remaining_yrs3(age: pd.Series) -> pd.Series:
    """Pandas UDF declared with type hints: years left until age 100 (``100 - age``)."""
    return 100 - age

In [45]:
# apply pandas udf on dataframe
df.withColumn('yrs_left', remaining_yrs3(df['age'])).show(5)

+-------+---+----------+------+-------+--------+
|ratings|age|experience|family| mobile|yrs_left|
+-------+---+----------+------+-------+--------+
|      3| 32|       9.0|     3|   Vivo|      68|
|      3| 27|      13.0|     3|  Apple|      73|
|      4| 22|       2.5|     0|Samsung|      78|
|      4| 37|      16.5|     4|  Apple|      63|
|      5| 27|       9.0|     1|     MI|      73|
+-------+---+----------+------+-------+--------+
only showing top 5 rows



In [41]:
# create pandas udf function
@pandas_udf('double')
def prod3(rating: pd.Series, exp: pd.Series) -> pd.Series:
    """Pandas UDF declared with type hints: element-wise product ``rating * exp``."""
    return rating * exp

In [42]:
# derive a `product` column from two input columns with the pandas UDF, then preview 5 rows
df.withColumn(
    "product",
    prod3(df['ratings'], df['experience']),
).show(5)

+-------+---+----------+------+-------+-------+
|ratings|age|experience|family| mobile|product|
+-------+---+----------+------+-------+-------+
|      3| 32|       9.0|     3|   Vivo|   27.0|
|      3| 27|      13.0|     3|  Apple|   39.0|
|      4| 22|       2.5|     0|Samsung|   10.0|
|      4| 37|      16.5|     4|  Apple|   66.0|
|      5| 27|       9.0|     1|     MI|   45.0|
+-------+---+----------+------+-------+-------+
only showing top 5 rows



In [43]:
# same product column computed with a Spark SQL expression instead of a Python UDF
df.select('*', fn.expr('experience*ratings as product')).show(5)

+-------+---+----------+------+-------+-------+
|ratings|age|experience|family| mobile|product|
+-------+---+----------+------+-------+-------+
|      3| 32|       9.0|     3|   Vivo|   27.0|
|      3| 27|      13.0|     3|  Apple|   39.0|
|      4| 22|       2.5|     0|Samsung|   10.0|
|      4| 37|      16.5|     4|  Apple|   66.0|
|      5| 27|       9.0|     1|     MI|   45.0|
+-------+---+----------+------+-------+-------+
only showing top 5 rows



# Compare: execution plans of the single-column UDFs

In [38]:
# print the parsed, analyzed and optimized logical plans plus the physical plan
df.withColumn(
    'yrs_left',
    length_udf(df['age']),
).explain(True)

== Parsed Logical Plan ==
Project [ratings#16, age#17, experience#18, family#19, mobile#20, remaining_yrs(age#17) AS yrs_left#636]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Analyzed Logical Plan ==
ratings: int, age: int, experience: double, family: int, mobile: string, yrs_left: int
Project [ratings#16, age#17, experience#18, family#19, mobile#20, remaining_yrs(age#17) AS yrs_left#636]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Optimized Logical Plan ==
Project [ratings#16, age#17, experience#18, family#19, mobile#20, pythonUDF0#643 AS yrs_left#636]
+- ArrowEvalPython [remaining_yrs(age#17)], [pythonUDF0#643], 200
   +- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Physical Plan ==
*(1) Project [ratings#16, age#17, experience#18, family#19, mobile#20, pythonUDF0#643 AS yrs_left#636]
+- ArrowEvalPython [remaining_yrs(age#17)], [pythonUDF0#643], 200
   +- FileScan csv [ratings#16,age#17,experience#18,fa

In [37]:
# print the parsed, analyzed and optimized logical plans plus the physical plan
df.withColumn(
    'yrs_left',
    remaining_yrs2(df['age']),
).explain(True)

== Parsed Logical Plan ==
Project [ratings#16, age#17, experience#18, family#19, mobile#20, remaining_yrs2(age#17) AS yrs_left#627]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Analyzed Logical Plan ==
ratings: int, age: int, experience: double, family: int, mobile: string, yrs_left: int
Project [ratings#16, age#17, experience#18, family#19, mobile#20, remaining_yrs2(age#17) AS yrs_left#627]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Optimized Logical Plan ==
Project [ratings#16, age#17, experience#18, family#19, mobile#20, pythonUDF0#634 AS yrs_left#627]
+- ArrowEvalPython [remaining_yrs2(age#17)], [pythonUDF0#634], 200
   +- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Physical Plan ==
*(1) Project [ratings#16, age#17, experience#18, family#19, mobile#20, pythonUDF0#634 AS yrs_left#627]
+- ArrowEvalPython [remaining_yrs2(age#17)], [pythonUDF0#634], 200
   +- FileScan csv [ratings#16,age#17,experience#1

In [46]:
# show the extended execution plan for the type-hinted pandas UDF
df.withColumn(
    'yrs_left',
    remaining_yrs3(df['age']),
).explain(True)

== Parsed Logical Plan ==
Project [ratings#16, age#17, experience#18, family#19, mobile#20, remaining_yrs3(age#17) AS yrs_left#799]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Analyzed Logical Plan ==
ratings: int, age: int, experience: double, family: int, mobile: string, yrs_left: int
Project [ratings#16, age#17, experience#18, family#19, mobile#20, remaining_yrs3(age#17) AS yrs_left#799]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Optimized Logical Plan ==
Project [ratings#16, age#17, experience#18, family#19, mobile#20, pythonUDF0#806 AS yrs_left#799]
+- ArrowEvalPython [remaining_yrs3(age#17)], [pythonUDF0#806], 200
   +- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Physical Plan ==
*(1) Project [ratings#16, age#17, experience#18, family#19, mobile#20, pythonUDF0#806 AS yrs_left#799]
+- ArrowEvalPython [remaining_yrs3(age#17)], [pythonUDF0#806], 200
   +- FileScan csv [ratings#16,age#17,experience#1

# Compare: multi-column pandas UDF vs. Spark SQL expression

In [47]:
# show the extended execution plan for the two-column pandas UDF
df.withColumn(
    "product",
    prod3(df['ratings'], df['experience']),
).explain(True)

== Parsed Logical Plan ==
Project [ratings#16, age#17, experience#18, family#19, mobile#20, prod3(ratings#16, experience#18) AS product#808]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Analyzed Logical Plan ==
ratings: int, age: int, experience: double, family: int, mobile: string, product: double
Project [ratings#16, age#17, experience#18, family#19, mobile#20, prod3(ratings#16, experience#18) AS product#808]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Optimized Logical Plan ==
Project [ratings#16, age#17, experience#18, family#19, mobile#20, pythonUDF0#815 AS product#808]
+- ArrowEvalPython [prod3(ratings#16, experience#18)], [pythonUDF0#815], 200
   +- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Physical Plan ==
*(1) Project [ratings#16, age#17, experience#18, family#19, mobile#20, pythonUDF0#815 AS product#808]
+- ArrowEvalPython [prod3(ratings#16, experience#18)], [pythonUDF0#815], 200
   +- FileSc

In [49]:
# compute the product with a Spark SQL expression and preview 5 rows
df.selectExpr(
    '*',
    'experience*ratings as product',
).show(5)

+-------+---+----------+------+-------+-------+
|ratings|age|experience|family| mobile|product|
+-------+---+----------+------+-------+-------+
|      3| 32|       9.0|     3|   Vivo|   27.0|
|      3| 27|      13.0|     3|  Apple|   39.0|
|      4| 22|       2.5|     0|Samsung|   10.0|
|      4| 37|      16.5|     4|  Apple|   66.0|
|      5| 27|       9.0|     1|     MI|   45.0|
+-------+---+----------+------+-------+-------+
only showing top 5 rows



In [48]:
# show the extended execution plan for the Spark SQL expression
df.selectExpr(
    '*',
    'experience*ratings as product',
).explain(True)

== Parsed Logical Plan ==
'Project [*, ('experience * 'ratings) AS product#816]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Analyzed Logical Plan ==
ratings: int, age: int, experience: double, family: int, mobile: string, product: double
Project [ratings#16, age#17, experience#18, family#19, mobile#20, (experience#18 * cast(ratings#16 as double)) AS product#816]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Optimized Logical Plan ==
Project [ratings#16, age#17, experience#18, family#19, mobile#20, (experience#18 * cast(ratings#16 as double)) AS product#816]
+- Relation[ratings#16,age#17,experience#18,family#19,mobile#20] csv

== Physical Plan ==
*(1) Project [ratings#16, age#17, experience#18, family#19, mobile#20, (experience#18 * cast(ratings#16 as double)) AS product#816]
+- FileScan csv [ratings#16,age#17,experience#18,family#19,mobile#20] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/home/had

## Saving file

In [57]:
# save the dataframe as a single csv (coalesce(1) reduces it to one partition first)
# mode='overwrite' replaces the output if it already exists
# header is passed as a real boolean rather than the string 'True'
df.coalesce(1).write.csv('data/df_data.csv', header=True, mode='overwrite')

In [58]:
# persist the retail DataFrame in parquet format, replacing any previous output
rtdf.write.mode('overwrite').parquet('data/retail_dataset_parquet')

In [59]:
# load the parquet output back into a new DataFrame
rtdf2 = spark.read.format('parquet').load('data/retail_dataset_parquet')

In [60]:
# preview the first 10 rows read back from parquet
rtdf2.show(n=10)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   549261|    21181|PLEASE ONE PERSON...|       2|4/7/2011 12:35|      2.1|     14903|United Kingdom|
|   549261|    21903|  MAN FLU METAL SIGN|       2|4/7/2011 12:35|      2.1|     14903|United Kingdom|
|   549261|    85150|LADIES & GENTLEME...|       4|4/7/2011 12:35|     2.55|     14903|United Kingdom|
|   549261|    21908|CHOCOLATE THIS WA...|       2|4/7/2011 12:35|      2.1|     14903|United Kingdom|
|   549261|    22670|FRENCH WC SIGN BL...|       3|4/7/2011 12:35|     1.25|     14903|United Kingdom|
|   549261|    22197|SMALL POPCORN HOLDER|       7|4/7/2011 12:35|     0.85|     14903|United Kingdom|
|   549261|    82581|   TOILET METAL SIGN|       4|4/7/2011 12:35|     0.

In [57]:
# IPython help lookup (`?` suffix): displays the signature and docstring of spark.read.jdbc.
# NOTE(review): interactive development aid — consider removing from the finished notebook.
spark.read.jdbc?

[0;31mSignature:[0m
[0mspark[0m[0;34m.[0m[0mread[0m[0;34m.[0m[0mjdbc[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0murl[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtable[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumn[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlowerBound[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mupperBound[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnumPartitions[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpredicates[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mproperties[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Construct a :class:`DataFrame` representing the database table named ``table``
accessible via JDBC URL ``url`` and connection ``properties``.

Partitions of the table will be retrieved in parallel if either ``column`

## Outlier

In [50]:
import numpy as np

In [51]:
# read the semicolon-separated white-wine quality CSV; the first row is the header
# and column types are inferred from the data
wdf = spark.read.csv(
    'data/winequality_white.csv',
    sep=';',
    inferSchema=True,
    header=True,
)

In [63]:
# list the column names of the wine DataFrame
wdf.columns

['fixed_acidity',
 'volatile_acidity',
 'citric_acid',
 'residual_sugar',
 'chlorides',
 'free_sulfur_ dioxide',
 'total_sulfur_dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [64]:
# shape of the dataset as (row count, column count)
(wdf.count(), len(wdf.columns))

(4898, 12)

In [65]:
# print the column names, inferred types and nullability of the wine DataFrame
wdf.printSchema()

root
 |-- fixed_acidity: double (nullable = true)
 |-- volatile_acidity: double (nullable = true)
 |-- citric_acid: double (nullable = true)
 |-- residual_sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free_sulfur_ dioxide: double (nullable = true)
 |-- total_sulfur_dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [66]:
# display the first 10 rows of the dataframe
wdf.show(10)

+-------------+----------------+-----------+--------------+---------+--------------------+--------------------+-------+----+---------+-------+-------+
|fixed_acidity|volatile_acidity|citric_acid|residual_sugar|chlorides|free_sulfur_ dioxide|total_sulfur_dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+--------------------+--------------------+-------+----+---------+-------+-------+
|          7.0|            0.27|       0.36|          20.7|    0.045|                45.0|               170.0|  1.001| 3.0|     0.45|    8.8|      6|
|          6.3|             0.3|       0.34|           1.6|    0.049|                14.0|               132.0|  0.994| 3.3|     0.49|    9.5|      6|
|          8.1|            0.28|        0.4|           6.9|     0.05|                30.0|                97.0| 0.9951|3.26|     0.44|   10.1|      6|
|          7.2|            0.23|       0.32|           8.5|    0.058|                47.0|    

In [67]:
# summary statistics (count, mean, stddev, min, quartiles, max) for three columns
(
    wdf
    .select('pH', 'sulphates', 'chlorides')
    .summary()
    .show()
)

+-------+-------------------+-------------------+--------------------+
|summary|                 pH|          sulphates|           chlorides|
+-------+-------------------+-------------------+--------------------+
|  count|               4898|               4898|                4898|
|   mean| 3.1882666394446693| 0.4898468762760325|  0.0457723560636995|
| stddev|0.15100059961506673|0.11412583394883222|0.021847968093728805|
|    min|               2.72|               0.22|               0.009|
|    25%|               3.09|               0.41|               0.036|
|    50%|               3.18|               0.47|               0.043|
|    75%|               3.28|               0.55|                0.05|
|    max|               3.82|               1.08|               0.346|
+-------+-------------------+-------------------+--------------------+



In [52]:
# create pandas udf function
# to detect outliers with the 1.5 * IQR rule
@pandas_udf('int')
def outliers_iqr(val: pd.Series) -> pd.Series:
    """Flag values outside the 1.5 * IQR fences of the received series.

    Quartiles are taken over the non-missing values of ``val``; a value is
    flagged when it is strictly above ``Q3 + 1.5 * IQR`` or strictly below
    ``Q1 - 1.5 * IQR``. Missing values are never flagged.

    NOTE(review): a Series-to-Series pandas UDF is presumably invoked once per
    Arrow batch, so the fences describe each batch rather than the whole
    column — confirm this is acceptable for larger or multi-partition inputs.

    Args:
        val: numeric values to test.

    Returns:
        pd.Series of 1 (outlier) / 0 (not an outlier), one entry per input value.
    """
    # A single NaN would turn both quartiles into NaN and silently disable
    # the detection, so the quartiles are computed on valid values only.
    valid = val.dropna()
    if valid.empty:
        # no valid value to derive quartiles from: nothing can be flagged
        return pd.Series(np.zeros(len(val), dtype=int))

    quartile_1, quartile_3 = np.percentile(valid, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)

    # comparisons against NaN are False, so missing values map to 0
    is_outlier = (val > upper_bound) | (val < lower_bound)
    return pd.Series(np.where(is_outlier, 1, 0))

In [53]:
# add a `pH_out` column holding the result of the outliers_iqr pandas UDF applied to pH
wdf2 = wdf.withColumn(
    'pH_out',
    outliers_iqr(wdf['pH']),
)

In [54]:
# cache => use this when the data fails to display (author's note)
wdf2.cache()
# register the DataFrame as a temp view so it can be queried with spark.sql
wdf2.createOrReplaceTempView("wdf2Table")

In [56]:
# IPython help lookup (`?` suffix): displays the signature and docstring of DataFrame.cache.
# NOTE(review): interactive development aid — consider removing from the finished notebook.
wdf2.cache?

[0;31mSignature:[0m [0mwdf2[0m[0;34m.[0m[0mcache[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Persists the :class:`DataFrame` with the default storage level (`MEMORY_AND_DISK`).

.. versionadded:: 1.3.0

Notes
-----
The default storage level has changed to `MEMORY_AND_DISK` to match Scala in 2.0.
[0;31mFile:[0m      /usr/local/spark/python/pyspark/sql/dataframe.py
[0;31mType:[0m      method


In [71]:
# print the schema to confirm the added pH_out column and its type
wdf2.printSchema()

root
 |-- fixed_acidity: double (nullable = true)
 |-- volatile_acidity: double (nullable = true)
 |-- citric_acid: double (nullable = true)
 |-- residual_sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free_sulfur_ dioxide: double (nullable = true)
 |-- total_sulfur_dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)
 |-- pH_out: integer (nullable = true)



In [72]:
# list the pH values flagged as outliers (first 10 rows)
(
    wdf2
    .select('pH', 'pH_out')
    .filter('pH_out==1')
    .show(10)
)

+----+------+
|  pH|pH_out|
+----+------+
|3.69|     1|
|3.63|     1|
|3.72|     1|
|3.61|     1|
|3.64|     1|
|3.64|     1|
|3.72|     1|
|3.72|     1|
|3.58|     1|
|3.58|     1|
+----+------+
only showing top 10 rows



In [73]:
# the same outlier listing expressed as a Spark SQL query against the temp view
spark.sql(
    '''select pH,pH_out from wdf2Table where pH_out=1 limit 10'''
).show()

+----+------+
|  pH|pH_out|
+----+------+
|3.69|     1|
|3.63|     1|
|3.72|     1|
|3.61|     1|
|3.64|     1|
|3.64|     1|
|3.72|     1|
|3.72|     1|
|3.58|     1|
|3.58|     1|
+----+------+



In [74]:
# author's note: cache wdf2 before saving => otherwise the save does not succeed
# save the data in CSV format with a header row, overwriting any existing output
# (header is passed as a real boolean rather than the string 'True')
wdf2.write.csv('data/wdf2', header=True, mode='overwrite')

In [75]:
# Load the CSV dataset stored under data/wdf2 (header row, inferred column types)
wdf3=spark.read.csv('data/wdf2',inferSchema=True,header=True)
# NOTE(review): the view is named "wdf4Table" although the DataFrame is wdf3 — confirm the name is intended
wdf3.createOrReplaceTempView("wdf4Table")

In [76]:
# show up to 15 reloaded rows whose pH was flagged as an outlier
wdf3.where('pH_out==1').show(15)

+-------------+----------------+-----------+--------------+---------+--------------------+--------------------+-------+----+---------+-------+-------+------+
|fixed_acidity|volatile_acidity|citric_acid|residual_sugar|chlorides|free_sulfur_ dioxide|total_sulfur_dioxide|density|  pH|sulphates|alcohol|quality|pH_out|
+-------------+----------------+-----------+--------------+---------+--------------------+--------------------+-------+----+---------+-------+-------+------+
|          6.0|            0.27|       0.28|           4.8|    0.063|                31.0|               201.0| 0.9964|3.69|     0.71|   10.0|      5|     1|
|          5.5|           0.485|        0.0|           1.5|    0.065|                 8.0|               103.0|  0.994|3.63|      0.4|    9.7|      4|     1|
|          5.9|            0.21|       0.28|           4.6|    0.053|                40.0|               199.0| 0.9964|3.72|      0.7|   10.0|      4|     1|
|          6.0|             0.1|       0.24|        