### Import SparkSession

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

### Creating DataFrame

 *Note:- Downgrade to Python 3.11.8 or lower to resolve Py4J error* \
 *Encounterd lots of Py4J error in Python 3.12.3 such as crashed, cannot find Python3, etc.*

In [2]:
# Creating df from a list of rows

from datetime import datetime, date
from pyspark.sql import Row

df = spark.createDataFrame([
    Row(a = 1, b = 2., c='string1', d = date(2000, 1, 1)),
    Row(a = 2, b = 3., c='string2', d = date(2000, 2, 1)),
])
df.show()

+---+---+-------+----------+
|  a|  b|      c|         d|
+---+---+-------+----------+
|  1|2.0|string1|2000-01-01|
|  2|3.0|string2|2000-02-01|
+---+---+-------+----------+



In [3]:
# Creating df with an explicit schema

df = spark.createDataFrame([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
], schema='a long, b double, c string, d date, e timestamp')
df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



In [3]:
# Creating spark dataframe from a pandas dataframe

import pandas as pd
from datetime import datetime, date

pandas_df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [2., 3., 4.],
    'c': ['string1', 'string2', 'string3'],
    'd': [date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)],
    'e': [datetime(2000, 1, 1, 12, 0), datetime(2000, 1, 2, 12, 0), datetime(2000, 1, 3, 12, 0)]
})
df = spark.createDataFrame(pandas_df)
print(type(df))
df.show()

<class 'pyspark.sql.dataframe.DataFrame'>
+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



In [None]:
# Spark dataframe to pandas df
pandas_df = df.toPandas()
pandas_df

In [6]:
# Creating pandas-on-Spark Dataframe

import pandas as pd
import numpy as np
import pyspark.pandas as ps


pdf = pd.DataFrame(np.random.randn(6, 4), columns=list('ABCD'))
psdf = ps.from_pandas(pdf)
print(type(psdf))
psdf.head()

<class 'pyspark.pandas.frame.DataFrame'>


Unnamed: 0,A,B,C,D
0,0.237391,-0.200755,1.792563,-1.477298
1,-1.78731,0.596215,-0.023786,0.497332
2,1.310702,-0.436939,-0.053464,-0.738833
3,-0.539894,1.161481,-1.928678,-0.162349
4,1.708747,0.388588,-0.659004,-0.869576


### Geting Data In/Out

In [2]:
df = spark.read.csv('./dataset/OnlineRetail.csv', header = True)

# df.write.csv('OnlineRetail.csv', header=True)

In [3]:
type(df)

pyspark.sql.dataframe.DataFrame

### Viewing Data

In [4]:
df.show(5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 5 rows



In [6]:
df.tail(5)

[Row(InvoiceNo='581587', StockCode='22613', Description='PACK OF 20 SPACEBOY NAPKINS', Quantity='12', InvoiceDate='12/9/2011 12:50', UnitPrice='0.85', CustomerID='12680', Country='France'),
 Row(InvoiceNo='581587', StockCode='22899', Description="CHILDREN'S APRON DOLLY GIRL ", Quantity='6', InvoiceDate='12/9/2011 12:50', UnitPrice='2.1', CustomerID='12680', Country='France'),
 Row(InvoiceNo='581587', StockCode='23254', Description='CHILDRENS CUTLERY DOLLY GIRL ', Quantity='4', InvoiceDate='12/9/2011 12:50', UnitPrice='4.15', CustomerID='12680', Country='France'),
 Row(InvoiceNo='581587', StockCode='23255', Description='CHILDRENS CUTLERY CIRCUS PARADE', Quantity='4', InvoiceDate='12/9/2011 12:50', UnitPrice='4.15', CustomerID='12680', Country='France'),
 Row(InvoiceNo='581587', StockCode='22138', Description='BAKING SET 9 PIECE RETROSPOT ', Quantity='3', InvoiceDate='12/9/2011 12:50', UnitPrice='4.95', CustomerID='12680', Country='France')]

In [10]:
df.count()

541909

In [8]:
# spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
# df

In [11]:
df.columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country']

In [9]:
df.dtypes

[('InvoiceNo', 'string'),
 ('StockCode', 'string'),
 ('Description', 'string'),
 ('Quantity', 'string'),
 ('InvoiceDate', 'string'),
 ('UnitPrice', 'string'),
 ('CustomerID', 'string'),
 ('Country', 'string')]

In [12]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)



In [7]:
df.schema

StructType([StructField('InvoiceNo', StringType(), True), StructField('StockCode', StringType(), True), StructField('Description', StringType(), True), StructField('Quantity', StringType(), True), StructField('InvoiceDate', StringType(), True), StructField('UnitPrice', StringType(), True), StructField('CustomerID', StringType(), True), StructField('Country', StringType(), True)])

In [13]:
# Summary of DataFrame

df.select("*").describe().show()

+-------+------------------+------------------+--------------------+------------------+---------------+------------------+------------------+-----------+
|summary|         InvoiceNo|         StockCode|         Description|          Quantity|    InvoiceDate|         UnitPrice|        CustomerID|    Country|
+-------+------------------+------------------+--------------------+------------------+---------------+------------------+------------------+-----------+
|  count|            541909|            541909|              540455|            541909|         541909|            541909|            406829|     541909|
|   mean|  559965.752026781|27623.240210938104|             20713.0|  9.55224954743324|           NULL|4.6111136260897085|15287.690570239585|       NULL|
| stddev|13428.417280796779|16799.737628427658|                NULL|218.08115785023438|           NULL| 96.75985306117963|1713.6003033215982|       NULL|
|    min|            536365|             10002| 4 PURPLE FLOCK D...|        

In [14]:
# Conversion to Pandas DataFrame

df.toPandas().head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850,United Kingdom


### Selecting and Accessing Data

In [15]:
# PySpark DataFrame is lazily evaluated and simply selecting a column does not 
# trigger the computation but it returns a Column instance.
df.InvoiceNo

Column<'InvoiceNo'>

In [16]:
#  DataFrame.select() takes the Column instances that returns another DataFrame.

df.select(df.InvoiceNo).show(5)
# df.select('InvoiceNo').show(5)

+---------+
|InvoiceNo|
+---------+
|   536365|
|   536365|
|   536365|
|   536365|
|   536365|
+---------+
only showing top 5 rows



In [17]:
# To assign new column instance

from pyspark.sql.functions import upper
df.withColumn('Upper', upper(df.Country)).show(5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|         Upper|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|UNITED KINGDOM|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|UNITED KINGDOM|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|UNITED KINGDOM|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|UNITED KINGDOM|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|UNITED KINGDOM|
+---------+---------+--------------------+--------+-----

In [18]:
# To select a subset of rows # Filter on ==, >, <, >=, <= condition

df.filter(df.Country == 'Germany').show(5)

+---------+---------+--------------------+--------+---------------+---------+----------+-------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------------------+--------+---------------+---------+----------+-------+
|   536527|    22809|SET OF 6 T-LIGHTS...|       6|12/1/2010 13:04|     2.95|     12662|Germany|
|   536527|    84347|ROTATING SILVER A...|       6|12/1/2010 13:04|     2.55|     12662|Germany|
|   536527|    84945|MULTI COLOUR SILV...|      12|12/1/2010 13:04|     0.85|     12662|Germany|
|   536527|    22242|5 HOOK HANGER MAG...|      12|12/1/2010 13:04|     1.65|     12662|Germany|
|   536527|    22244|3 HOOK HANGER MAG...|      12|12/1/2010 13:04|     1.95|     12662|Germany|
+---------+---------+--------------------+--------+---------------+---------+----------+-------+
only showing top 5 rows



In [12]:
# Multiple conditions require parentheses around each condition
df.filter((df.Country == 'Germany') & (df.CustomerID == 12662)).show(5)

+---------+---------+--------------------+--------+---------------+---------+----------+-------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------------------+--------+---------------+---------+----------+-------+
|   536527|    22809|SET OF 6 T-LIGHTS...|       6|12/1/2010 13:04|     2.95|     12662|Germany|
|   536527|    84347|ROTATING SILVER A...|       6|12/1/2010 13:04|     2.55|     12662|Germany|
|   536527|    84945|MULTI COLOUR SILV...|      12|12/1/2010 13:04|     0.85|     12662|Germany|
|   536527|    22242|5 HOOK HANGER MAG...|      12|12/1/2010 13:04|     1.65|     12662|Germany|
|   536527|    22244|3 HOOK HANGER MAG...|      12|12/1/2010 13:04|     1.95|     12662|Germany|
+---------+---------+--------------------+--------+---------------+---------+----------+-------+
only showing top 5 rows



In [18]:
# Sort results
# df = df.orderBy(df.CustomerID.asc())
df = df.orderBy(df.CustomerID.desc())
df.show(5)

+---------+---------+--------------------+--------+----------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|     InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+----------------+---------+----------+--------------+
|   570715|    22865|HAND WARMER OWL D...|      12|10/12/2011 10:23|      2.1|     18287|United Kingdom|
|   554065|    22755|SMALL PURPLE BABU...|      12| 5/22/2011 10:39|     0.85|     18287|United Kingdom|
|   570715|    22600|CHRISTMAS RETROSP...|      24|10/12/2011 10:23|     0.85|     18287|United Kingdom|
|   554065|    22757|LARGE RED BABUSHK...|      12| 5/22/2011 10:39|     1.25|     18287|United Kingdom|
|   570715|    23264|SET OF 3 WOODEN S...|      12|10/12/2011 10:23|     1.25|     18287|United Kingdom|
+---------+---------+--------------------+--------+----------------+---------+----------+--------------+
only showing top 5 rows



### Changing Datatype


In [19]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)



In [20]:

from pyspark.sql.functions import col
df = df.withColumn("Quantity", col("Quantity").cast("integer"))
df = df.withColumn("UnitPrice", col("UnitPrice").cast("integer"))

### Grouping Data

In [21]:
# Groupby aggregation works on numeric columns only
df.groupby('CustomerID').avg().show()

+----------+------------------+------------------+
|CustomerID|     avg(Quantity)|    avg(UnitPrice)|
+----------+------------------+------------------+
|     16250| 8.666666666666666|             2.375|
|     15574|2.0773809523809526|               2.0|
|     15555| 4.767567567567568| 1.172972972972973|
|     15271| 4.570909090909091|              2.12|
|     17714|               9.2|               1.6|
|     17757|  4.46900269541779|2.1361185983827493|
|     17551| 3.488372093023256|1.9767441860465116|
|     13187|2.5675675675675675| 3.189189189189189|
|     16549|2.4108053007135575|1.5168195718654434|
|     12637| 7.588832487309645|2.0913705583756346|
|     15052|2.7333333333333334|2.8666666666666667|
|     14525|  8.93288590604027| 2.661073825503356|
|     18283| 1.847883597883598|1.0833333333333333|
|     13107|21.466666666666665|1.2833333333333334|
|     16303|15.694610778443113|2.2335329341317367|
|     13174| 4.442675159235669| 2.767515923566879|
|     13027| 664.6153846153846|

### Working with SQL

DataFrame and Spark SQL share the same execution engine so they can be interchangeably used seamlessly. 

In [22]:
df.createOrReplaceTempView("tableA")
spark.sql("SELECT count(*) from tableA").show()

+--------+
|count(1)|
+--------+
|  541909|
+--------+

