In [1]:
# Initialise findspark before any pyspark import in the cells below,
# so that the pyspark package can be located and imported.
import findspark
findspark.init()

In [2]:
# Imports first, so the cell's dependencies are visible up front.
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Build the configuration fluently: local master, named application.
# Additional properties can be added with config.set("property", "value").
config = SparkConf().setMaster("local").setAppName("DataFrameBasic")

# SparkSession is the entry point for Spark SQL and the DataFrame API.
spark = (
    SparkSession.builder
    .config(conf=config)
    .getOrCreate()
)

# SparkContext that backs the session.
sc = spark.sparkContext

22/03/03 23:25:25 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.80.128 instead (on interface ens33)
22/03/03 23:25:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/03/03 23:25:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/03 23:25:27 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Spark DataFrame
# Structured data
# DataFrame = data + schema
# The schema contains the columns and their data types
# The DataFrame API, Spark SQL and Spark Core all run on the same core engine
# A DataFrame internally has an RDD, i.e. RDD[Row]
# A DataFrame is an alias/abstraction; the actual data still lives in the RDD
# The DataFrame is an API: when we call it, the call is internally converted into
# various plans [logical, optimized, physical], and finally the physical plan
# is used to generate Java bytecode using Scala
# When it comes to execution, it is still only RDDs, transformations and actions

In [4]:
# Sample rows, one tuple per product:
#   (product_id, product_name, price, brand_id, offer)
products = [
    (1, "iphone", 1000.0, 100, 0),
    (2, "Galaxy", 545.5, 101, None),
    (3, "Pixel", 645.99, 101, None),
]

# Column names only -- no data types are given, so Spark infers them from the data.
# NOTE(review): 'prodcut_id' looks like a misspelling of 'product_id'. It is kept
# unchanged here because renaming it would change the DataFrame's schema.
schema = ["prodcut_id", "product_name", "price", "brand_id", "offer"]

productDf = spark.createDataFrame(products, schema)

# Every DataFrame carries a schema: print it, then display the rows.
productDf.printSchema()
productDf.show()  # prints up to 20 records

root
 |-- prodcut_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)



[Stage 0:>                                                          (0 + 1) / 1]

+----------+------------+------+--------+-----+
|prodcut_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iphone|1000.0|     100|    0|
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
+----------+------------+------+--------+-----+



                                                                                

In [5]:
# Every DataFrame has an RDD internally.
# The DataFrame is nothing but an API applied on top of that RDD.
# A DataFrame (DF) is an RDD of Row objects; each Row holds column names and values.
productDf.rdd.collect()

[Row(prodcut_id=1, product_name='iphone', price=1000.0, brand_id=100, offer=0),
 Row(prodcut_id=2, product_name='Galaxy', price=545.5, brand_id=101, offer=None),
 Row(prodcut_id=3, product_name='Pixel', price=645.99, brand_id=101, offer=None)]

In [6]:
# Number of partitions of the RDD that backs the DataFrame
productDf.rdd.getNumPartitions()

1

In [7]:
# DataFrames support both transformations and actions.
# A transformation returns a new, immutable DataFrame and is lazy.
# filter() is a transformation: the line below only builds a new DataFrame;
# nothing is executed yet -- no job, no action.
df = productDf.filter(productDf.price <= 750)

In [8]:
# Apply an action to the filtered DataFrame from the previous cell.
# show() is an action, so it is the call that actually executes a job.
df.printSchema()
df.show() # execute job

root
 |-- prodcut_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)

+----------+------------+------+--------+-----+
|prodcut_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
+----------+------------+------+--------+-----+



In [9]:
# select API: projection -- keep only the listed columns
df = productDf.select(["product_name", "price"])

df.printSchema()
df.show()

root
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)

+------------+------+
|product_name| price|
+------------+------+
|      iphone|1000.0|
|      Galaxy| 545.5|
|       Pixel|645.99|
+------------+------+



In [10]:
# selectExpr: each argument is a SQL expression string evaluated per row
# (function calls, arithmetic, CAST, ...). Result columns are named after
# the expression text when no alias is given.
df = productDf.selectExpr(
    "product_name",
    "upper(product_name)",
    "price",
    "price * 0.9",
)

df.printSchema()
df.show()

root
 |-- product_name: string (nullable = true)
 |-- upper(product_name): string (nullable = true)
 |-- price: double (nullable = true)
 |-- (price * 0.9): double (nullable = true)

+------------+-------------------+------+-----------------+
|product_name|upper(product_name)| price|    (price * 0.9)|
+------------+-------------------+------+-----------------+
|      iphone|             IPHONE|1000.0|            900.0|
|      Galaxy|             GALAXY| 545.5|           490.95|
|       Pixel|              PIXEL|645.99|581.3910000000001|
+------------+-------------------+------+-----------------+



In [11]:
# selectExpr with "as" aliases, so the derived columns get readable names
df = productDf.selectExpr(
    "product_name",
    "upper(product_name) as title",
    "price",
    "price * 0.9 as grand_total",
)

df.printSchema()
df.show()

root
 |-- product_name: string (nullable = true)
 |-- title: string (nullable = true)
 |-- price: double (nullable = true)
 |-- grand_total: double (nullable = true)

+------------+------+------+-----------------+
|product_name| title| price|      grand_total|
+------------+------+------+-----------------+
|      iphone|IPHONE|1000.0|            900.0|
|      Galaxy|GALAXY| 545.5|           490.95|
|       Pixel| PIXEL|645.99|581.3910000000001|
+------------+------+------+-----------------+



In [12]:
# Derive a new column called offer_price from an existing column (price * 0.9).
# withColumn returns a new DataFrame with the extra column appended.
df = productDf.withColumn("offer_price", productDf["price"] * 0.9)

df.printSchema()
df.show()

root
 |-- prodcut_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)
 |-- offer_price: double (nullable = true)

+----------+------------+------+--------+-----+-----------------+
|prodcut_id|product_name| price|brand_id|offer|      offer_price|
+----------+------------+------+--------+-----+-----------------+
|         1|      iphone|1000.0|     100|    0|            900.0|
|         2|      Galaxy| 545.5|     101| null|           490.95|
|         3|       Pixel|645.99|     101| null|581.3910000000001|
+----------+------------+------+--------+-----+-----------------+



In [13]:
# Rename a column: "price" becomes "total" in the returned DataFrame
df = productDf.withColumnRenamed(existing="price", new="total")

df.printSchema()
df.show()

root
 |-- prodcut_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- total: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)

+----------+------------+------+--------+-----+
|prodcut_id|product_name| total|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iphone|1000.0|     100|    0|
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
+----------+------------+------+--------+-----+



In [14]:
# Drop a column: the returned DataFrame no longer contains brand_id
df = productDf.drop(productDf["brand_id"])

df.printSchema()
df.show()

root
 |-- prodcut_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- offer: long (nullable = true)

+----------+------------+------+-----+
|prodcut_id|product_name| price|offer|
+----------+------------+------+-----+
|         1|      iphone|1000.0|    0|
|         2|      Galaxy| 545.5| null|
|         3|       Pixel|645.99| null|
+----------+------------+------+-----+



In [15]:
# filter / where conditions -- the two methods are aliases of each other.
# Here the condition is a Python Column expression; each comparison must be
# parenthesised because & binds tighter than >= and <.
df = productDf.filter(
    (productDf["price"] >= 500)
    & (productDf["price"] < 600)
)

df.printSchema()
df.show()

root
 |-- prodcut_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)

+----------+------------+-----+--------+-----+
|prodcut_id|product_name|price|brand_id|offer|
+----------+------------+-----+--------+-----+
|         2|      Galaxy|545.5|     101| null|
+----------+------------+-----+--------+-----+



In [16]:
# where() is the same as filter(): same condition, same result
df = productDf.where(
    (productDf["price"] >= 500)
    & (productDf["price"] < 600)
)

df.printSchema()
df.show()

root
 |-- prodcut_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)

+----------+------------+-----+--------+-----+
|prodcut_id|product_name|price|brand_id|offer|
+----------+------------+-----+--------+-----+
|         2|      Galaxy|545.5|     101| null|
+----------+------------+-----+--------+-----+



In [17]:
# filter / where also accept the condition as a SQL expression string
# instead of a Python Column expression.
df = productDf.where("price >= 500 AND price < 600")
df.printSchema()
df.show()

root
 |-- prodcut_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)

+----------+------------+-----+--------+-----+
|prodcut_id|product_name|price|brand_id|offer|
+----------+------------+-----+--------+-----+
|         2|      Galaxy|545.5|     101| null|
+----------+------------+-----+--------+-----+



In [18]:
# Three ways to reference a column in PySpark
from pyspark.sql.functions import col

print(
    productDf.price,     # attribute access on the DataFrame
    productDf['price'],  # item access on the DataFrame
    col('price'),        # col() function: a column by name, no DataFrame reference needed
    sep="\n",
)

Column<b'price'>
Column<b'price'>
Column<b'price'>


In [19]:
# Add a new column holding a fixed constant, then derive another column from it.
from pyspark.sql.functions import lit  # lit - literal - constant

df = (
    productDf
    .withColumn('qty', lit(4))
    .withColumn("amount", col("qty") * col("price"))
)

df.printSchema()
df.show()

root
 |-- prodcut_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- brand_id: long (nullable = true)
 |-- offer: long (nullable = true)
 |-- qty: integer (nullable = false)
 |-- amount: double (nullable = true)

+----------+------------+------+--------+-----+---+-------+
|prodcut_id|product_name| price|brand_id|offer|qty| amount|
+----------+------------+------+--------+-----+---+-------+
|         1|      iphone|1000.0|     100|    0|  4| 4000.0|
|         2|      Galaxy| 545.5|     101| null|  4| 2182.0|
|         3|       Pixel|645.99|     101| null|  4|2583.96|
+----------+------------+------+--------+-----+---+-------+



In [20]:
# Sort the rows by price in ascending order
df = productDf.sort("price", ascending=True)
df.show()

+----------+------------+------+--------+-----+
|prodcut_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
|         1|      iphone|1000.0|     100|    0|
+----------+------------+------+--------+-----+



In [21]:
# Sort the rows by price in descending order, using the desc() function
from pyspark.sql.functions import desc
df = productDf.sort(desc("price"))
df.show()

+----------+------------+------+--------+-----+
|prodcut_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iphone|1000.0|     100|    0|
|         3|       Pixel|645.99|     101| null|
|         2|      Galaxy| 545.5|     101| null|
+----------+------------+------+--------+-----+



In [22]:
# Alternatively, when a DataFrame reference is available, use the
# asc() / desc() methods of its columns.

# ascending by price
df = productDf.sort(productDf["price"].asc())
df.show()

# descending by price
df = productDf.sort(productDf["price"].desc())
df.show()

+----------+------------+------+--------+-----+
|prodcut_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
|         1|      iphone|1000.0|     100|    0|
+----------+------------+------+--------+-----+

+----------+------------+------+--------+-----+
|prodcut_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iphone|1000.0|     100|    0|
|         3|       Pixel|645.99|     101| null|
|         2|      Galaxy| 545.5|     101| null|
+----------+------------+------+--------+-----+



In [27]:
# fillna: show the original rows first, then the result with nulls replaced
productDf.show()

# Null values in the 'offer' column are replaced with 0
df = productDf.fillna(0, subset=["offer"])
df.show()

+----------+------------+------+--------+-----+
|prodcut_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iphone|1000.0|     100|    0|
|         2|      Galaxy| 545.5|     101| null|
|         3|       Pixel|645.99|     101| null|
+----------+------------+------+--------+-----+

+----------+------------+------+--------+-----+
|prodcut_id|product_name| price|brand_id|offer|
+----------+------------+------+--------+-----+
|         1|      iphone|1000.0|     100|    0|
|         2|      Galaxy| 545.5|     101|    0|
|         3|       Pixel|645.99|     101|    0|
+----------+------------+------+--------+-----+

