In [1]:
# !pip install pyspark

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [3]:
print(spark.version)

3.5.1


### Creating DataFrame

In [4]:
# # Creating df from a list of rows

# from datetime import datetime, date
# from pyspark.sql import Row

# df = spark.createDataFrame([
#     Row(a = 1, b = 2., c='string1', d = date(2000, 1, 1)),
#     Row(a = 2, b = 3., c='string2', d = date(2000, 2, 1)),
# ])
# df

In [5]:
# # Creating df with an explicit schema

# df = spark.createDataFrame([
#     (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
#     (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
#     (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
# ], schema='a long, b double, c string, d date, e timestamp')
# df

In [6]:
# Creating spark dataframe from a pandas dataframe

# import pandas as pd
# from datetime import datetime, date

# pandas_df = pd.DataFrame({
#     'a': [1, 2, 3],
#     'b': [2., 3., 4.],
#     'c': ['string1', 'string2', 'string3'],
#     'd': [date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)],
#     'e': [datetime(2000, 1, 1, 12, 0), datetime(2000, 1, 2, 12, 0), datetime(2000, 1, 3, 12, 0)]
# })
# df = spark.createDataFrame(pandas_df)
# type(df)

In [7]:
# Creating pandas-on-Spark Dataframe

# import pandas as pd
# import numpy as np
# import pyspark.pandas as ps


# pdf = pd.DataFrame(np.random.randn(6, 4), columns=list('ABCD'))
# psdf = ps.from_pandas(pdf)
# type(psdf)

### Geting Data In/Out

In [8]:
df = spark.read.csv('OnlineRetail.csv', header = True)

# df.write.csv('OnlineRetail.csv', header=True)

In [9]:
type(df)

pyspark.sql.dataframe.DataFrame

### Viewing Data

In [10]:
df.show(5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 5 rows



In [11]:
# spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
# df

In [12]:
df.columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country']

In [13]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)



In [14]:
# Summary of DataFrame

df.select("*").describe().show()

+-------+------------------+------------------+--------------------+------------------+---------------+------------------+------------------+-----------+
|summary|         InvoiceNo|         StockCode|         Description|          Quantity|    InvoiceDate|         UnitPrice|        CustomerID|    Country|
+-------+------------------+------------------+--------------------+------------------+---------------+------------------+------------------+-----------+
|  count|            541909|            541909|              540455|            541909|         541909|            541909|            406829|     541909|
|   mean|  559965.752026781|27623.240210938104|             20713.0|  9.55224954743324|           NULL|4.6111136260897085|15287.690570239585|       NULL|
| stddev|13428.417280796779|16799.737628427658|                NULL|218.08115785023438|           NULL| 96.75985306117963|1713.6003033215982|       NULL|
|    min|            536365|             10002| 4 PURPLE FLOCK D...|        

In [15]:
# Conversion to Pandas DataFrame

# df.toPandas().head()

### Selecting and Accessing Data

In [16]:
# PySpark DataFrame is lazily evaluated and simply selecting a column does not 
# trigger the computation but it returns a Column instance.
df.InvoiceNo

Column<'InvoiceNo'>

In [17]:
#  DataFrame.select() takes the Column instances that returns another DataFrame.

df.select(df.InvoiceNo).show(5)

+---------+
|InvoiceNo|
+---------+
|   536365|
|   536365|
|   536365|
|   536365|
|   536365|
+---------+
only showing top 5 rows



In [19]:
# To assign new column instance

from pyspark.sql.functions import upper
df.withColumn('Upper', upper(df.Country)).show(5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|         Upper|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|UNITED KINGDOM|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|UNITED KINGDOM|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|UNITED KINGDOM|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|UNITED KINGDOM|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|UNITED KINGDOM|
+---------+---------+--------------------+--------+-----

In [20]:
# To select a subset of rows

df.filter(df.Country == 'Germany').show(5)

+---------+---------+--------------------+--------+---------------+---------+----------+-------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------------------+--------+---------------+---------+----------+-------+
|   536527|    22809|SET OF 6 T-LIGHTS...|       6|12/1/2010 13:04|     2.95|     12662|Germany|
|   536527|    84347|ROTATING SILVER A...|       6|12/1/2010 13:04|     2.55|     12662|Germany|
|   536527|    84945|MULTI COLOUR SILV...|      12|12/1/2010 13:04|     0.85|     12662|Germany|
|   536527|    22242|5 HOOK HANGER MAG...|      12|12/1/2010 13:04|     1.65|     12662|Germany|
|   536527|    22244|3 HOOK HANGER MAG...|      12|12/1/2010 13:04|     1.95|     12662|Germany|
+---------+---------+--------------------+--------+---------------+---------+----------+-------+
only showing top 5 rows



### Changing Datatype


In [21]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)



In [22]:

from pyspark.sql.functions import col
df = df.withColumn("Quantity", col("Quantity").cast("integer"))
df = df.withColumn("UnitPrice", col("UnitPrice").cast("integer"))

### Grouping Data

In [23]:
# Groupby aggregation works on numeric columns only
df.groupby('CustomerID').avg().show()

+----------+------------------+------------------+
|CustomerID|     avg(Quantity)|    avg(UnitPrice)|
+----------+------------------+------------------+
|     16250| 8.666666666666666|             2.375|
|     15574|2.0773809523809526|               2.0|
|     15555| 4.767567567567568| 1.172972972972973|
|     15271| 4.570909090909091|              2.12|
|     17714|               9.2|               1.6|
|     17757|  4.46900269541779|2.1361185983827493|
|     17551| 3.488372093023256|1.9767441860465116|
|     13187|2.5675675675675675| 3.189189189189189|
|     16549|2.4108053007135575|1.5168195718654434|
|     12637| 7.588832487309645|2.0913705583756346|
|     15052|2.7333333333333334|2.8666666666666667|
|     14525|  8.93288590604027| 2.661073825503356|
|     18283| 1.847883597883598|1.0833333333333333|
|     13107|21.466666666666665|1.2833333333333334|
|     16303|15.694610778443113|2.2335329341317367|
|     13174| 4.442675159235669| 2.767515923566879|
|     13027| 664.6153846153846|

### Working with SQL

DataFrame and Spark SQL share the same execution engine so they can be interchangeably used seamlessly. 

In [24]:
df.createOrReplaceTempView("tableA")
spark.sql("SELECT count(*) from tableA").show()

+--------+
|count(1)|
+--------+
|  541909|
+--------+

