In [16]:
from pyspark.sql import SparkSession
import getpass
# Build (or reuse) a Hive-enabled Spark session on YARN, with the SQL
# warehouse directory placed under the current OS user's /user/<name> folder.
username = getpass.getuser()
spark = (
    SparkSession.builder
    # '0' is passed so the UI port is not pinned (presumably Spark picks a free one).
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Load the CSV, taking column names from the header row and asking Spark
# to infer each column's type from the data.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("source_file_path")
)

In [3]:
# Preview the first five rows to check the header-derived column names.
df.show(5)

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
+--------+--------------------+-----------+---------------+
only showing top 5 rows



In [5]:
# Print the column names and the types produced by schema inference.
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



## Schema enforcement

In [6]:
# Read the CSV with no header or schema options; the recorded output below
# shows the columns coming back with the default names _c0, _c1, ...
df = (
    spark.read
    .format("csv")
    .load("source_file_path")
)

In [7]:
# Display the rows read without any header/schema options.
df.show()

+---+----------+-----+---------------+
|_c0|       _c1|  _c2|            _c3|
+---+----------+-----+---------------+
|  1|2013-07-25|11599|         CLOSED|
|  2|2013-07-25|  256|PENDING_PAYMENT|
|  3|2013-07-25|12111|       COMPLETE|
|  4|2013-07-25| 8827|         CLOSED|
|  5|2013-07-25|11318|       COMPLETE|
|  6|2013-07-25| 7130|       COMPLETE|
|  7|2013-07-25| 4530|       COMPLETE|
|  8|2013-07-25| 2911|     PROCESSING|
|  9|2013-07-25| 5657|PENDING_PAYMENT|
| 10|2013-07-25| 5648|PENDING_PAYMENT|
+---+----------+-----+---------------+



In [5]:
# Schema enforcement, method 1: a DDL-style string of "column_name type" pairs.
orders_schema = (
    "order_id long, "
    "order_date date, "
    "cust_id long, "
    "order_status string"
)

In [6]:
# Read the CSV again, this time applying the explicit orders_schema instead
# of relying on default column names and types.
df = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .load("source_file_path")
)

In [11]:
# Display the rows; the columns now carry the names given in orders_schema.
df.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-07-25|  11599|         CLOSED|
|       2|2013-07-25|    256|PENDING_PAYMENT|
|       3|2013-07-25|  12111|       COMPLETE|
|       4|2013-07-25|   8827|         CLOSED|
|       5|2013-07-25|  11318|       COMPLETE|
|       6|2013-07-25|   7130|       COMPLETE|
|       7|2013-07-25|   4530|       COMPLETE|
|       8|2013-07-25|   2911|     PROCESSING|
|       9|2013-07-25|   5657|PENDING_PAYMENT|
|      10|2013-07-25|   5648|PENDING_PAYMENT|
+--------+----------+-------+---------------+



In [12]:
# Confirm that the enforced types (long / date / string) were applied.
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- order_status: string (nullable = true)



## Dealing with date format

In [8]:
# DDL schema for the orders file; order_date is declared as a date so the
# reader has to parse it (see the dateFormat option in the next cell).
orders_schema = ", ".join(
    [
        "order_id long",
        "order_date date",
        "cust_id long",
        "order_status string",
    ]
)

In [18]:
# Read with the explicit schema and tell the CSV reader that dates in the
# file are written as month-day-year ("MM-dd-yyyy").
df = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .option("dateFormat", "MM-dd-yyyy")
    .load("source_file_path")
)

In [19]:
# Display the rows read with the explicit dateFormat option.
df.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-07-25|  11599|         CLOSED|
|       2|2013-07-25|    256|PENDING_PAYMENT|
|       3|2013-07-25|  12111|       COMPLETE|
|       4|2013-07-25|   8827|         CLOSED|
|       5|2013-07-25|  11318|       COMPLETE|
|       6|2013-07-25|   7130|       COMPLETE|
|       7|2013-07-25|   4530|       COMPLETE|
|       8|2013-07-25|   2911|     PROCESSING|
|       9|2013-07-25|   5657|PENDING_PAYMENT|
|      10|2013-07-25|   5648|PENDING_PAYMENT|
+--------+----------+-------+---------------+



## Reading the date as a string and converting it with to_date

In [47]:
# Alternative approach: declare order_date as a plain string here and convert
# it to a date afterwards with to_date().
orders_schema = (
    "order_id long, "
    "order_date string, "
    "cust_id long, "
    "order_status string"
)

In [17]:
# Brings to_date and expr (used in later cells) into scope.
# NOTE(review): the wildcard import also pulls in Spark functions that share
# names with Python builtins (e.g. sum, min, max, round); consider importing
# only the names that are needed.
from pyspark.sql.functions import *

In [49]:
# Read the CSV with order_date kept as a raw string (per orders_schema).
df = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .load("source_file_path")
)

In [50]:
# Add a parsed date column next to the original string column; the pattern
# "MM-dd-yyyy" describes how the source strings are written.
new_date = df.withColumn(
    "order_date_new",
    to_date("order_date", "MM-dd-yyyy"),
)

In [52]:
# Show the original string column side by side with the parsed date column.
new_date.show()

+--------+----------+-------+---------------+--------------+
|order_id|order_date|cust_id|   order_status|order_date_new|
+--------+----------+-------+---------------+--------------+
|       1|07-25-2013|  11599|         CLOSED|    2013-07-25|
|       2|07-25-2013|    256|PENDING_PAYMENT|    2013-07-25|
|       3|07-25-2013|  12111|       COMPLETE|    2013-07-25|
|       4|07-25-2013|   8827|         CLOSED|    2013-07-25|
|       5|07-25-2013|  11318|       COMPLETE|    2013-07-25|
|       6|07-25-2013|   7130|       COMPLETE|    2013-07-25|
|       7|07-25-2013|   4530|       COMPLETE|    2013-07-25|
|       8|07-25-2013|   2911|     PROCESSING|    2013-07-25|
|       9|07-25-2013|   5657|PENDING_PAYMENT|    2013-07-25|
|      10|07-25-2013|   5648|PENDING_PAYMENT|    2013-07-25|
+--------+----------+-------+---------------+--------------+



In [33]:
# Verify that order_date is still a string while order_date_new is a date.
new_date.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_date_new: date (nullable = true)



In [57]:
# NOTE(review): this stops the active session, yet the cells that follow call
# spark.read again. Re-run the session-creation cell before continuing, or move
# this call to the end of the notebook so Restart & Run All works.
spark.stop()

## Read modes (permissive, dropmalformed, failfast)

In [59]:
# Schema used for the read-mode demo; order_date stays a string here.
orders_schema = ", ".join(
    [
        "order_id long",
        "order_date string",
        "cust_id long",
        "order_status string",
    ]
)

In [64]:
# Read the sample file in "dropmalformed" mode, so rows that do not fit
# orders_schema are dropped instead of being kept.
df = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .option("mode", "dropmalformed")
    .load("/public/trendytech/datasets/orders_sample3.csv")
)

In [65]:
# Display the rows that survived dropmalformed; in the recorded output below,
# order_id 7 and 9 are absent.
df.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-07-25|  11599|         CLOSED|
|       2|2013-07-25|    256|PENDING_PAYMENT|
|       3|2013-07-25|  12111|       COMPLETE|
|       4|2013-07-25|   8827|         CLOSED|
|       5|2013-07-25|  11318|       COMPLETE|
|       6|2013-07-25|   7130|       COMPLETE|
|       8|2013-07-25|   2911|     PROCESSING|
|      10|2013-07-25|   5648|PENDING_PAYMENT|
+--------+----------+-------+---------------+



## Ways of creating a DataFrame

In [66]:
# 1. spark.read - a DataFrameReader attribute (not a call), e.g. spark.read.format(...).load(...)

In [67]:
# 2. spark.sql("<query>") - run a SQL query and get the result as a DataFrame

In [68]:
# 3. spark.table("<table_name>") - load an existing table as a DataFrame

In [69]:
# 4. spark.range(n) - a single-column DataFrame of consecutive numbers

In [70]:
# 5. spark.createDataFrame(data) - build a DataFrame from local Python data

## How to deal with a nested schema

In [78]:
# DDL schema with a nested column: fullname is a struct holding two string
# fields, firstname and lastname.
ddlschema = (
    "customer_id long, "
    "fullname struct<firstname:string, lastname:string>, "
    "city string"
)

In [79]:
# Read the JSON source using the nested DDL schema defined above.
df = (
    spark.read
    .format("json")
    .schema(ddlschema)
    .load("source_file_path")
)

In [80]:
# Display the rows; the struct column is rendered as {firstname, lastname}.
df.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          2|    {ram, kumar}|hyderabad|
|          3|{vijay, shankar}|     pune|
|          1| {sumit, mittal}|bangalore|
+-----------+----------------+---------+



In [76]:
# NOTE(review): this stops the active session, yet the next section calls
# spark.read again. Re-run the session-creation cell before continuing, or move
# this call to the end of the notebook so Restart & Run All works.
spark.stop()

## select vs. selectExpr

In [82]:
# Load the headerless order-items CSV and let Spark infer the column types.
raw_df = (
    spark.read
    .format("csv")
    .option("inferschema", "true")
    .load("source_file_path")
)

In [85]:
# Replace the default column names with meaningful ones, in positional order.
col_df = raw_df.toDF(
    "order_item_id",
    "order_id",
    "product_id",
    "quantity",
    "subtotal",
    "product_price",
)

In [88]:
# Preview the first five rows with the new column names.
col_df.show(5)

+-------------+--------+----------+--------+--------+-------------+
|order_item_id|order_id|product_id|quantity|subtotal|product_price|
+-------------+--------+----------+--------+--------+-------------+
|            1|       1|       957|       1|  299.98|       299.98|
|            2|       2|      1073|       1|  199.99|       199.99|
|            3|       2|       502|       5|   250.0|         50.0|
|            4|       2|       403|       1|  129.99|       129.99|
|            5|       4|       897|       2|   49.98|        24.99|
+-------------+--------+----------+--------+--------+-------------+
only showing top 5 rows



In [89]:
# Drop the existing subtotal column so it can be recomputed from
# product_price * quantity in the cells below.
df = col_df.drop("subtotal")

In [90]:
# Confirm that subtotal is no longer among the columns.
df.show(5)

+-------------+--------+----------+--------+-------------+
|order_item_id|order_id|product_id|quantity|product_price|
+-------------+--------+----------+--------+-------------+
|            1|       1|       957|       1|       299.98|
|            2|       2|      1073|       1|       199.99|
|            3|       2|       502|       5|         50.0|
|            4|       2|       403|       1|       129.99|
|            5|       4|       897|       2|        24.99|
+-------------+--------+----------+--------+-------------+
only showing top 5 rows



In [None]:
# select() treats plain string arguments as column names, not SQL expressions,
# so "product_price * quantity as subtotal" cannot be resolved and Spark raises
# an AnalysisException. That failure is the point of this cell (compare with
# expr() and selectExpr() below), so catch it and print the message instead of
# halting a Restart & Run All.
from pyspark.sql.utils import AnalysisException

try:
    df.select("*", "product_price * quantity as subtotal").show()
except AnalysisException as err:
    print(f"select() could not resolve the expression string: {err}")

In [94]:
# With select(), wrap the SQL text in expr() so it is parsed as an expression
# rather than looked up as a column name.
df.select(
    "*",
    expr("product_price * quantity as subtotal"),
).show()

+-------------+--------+----------+--------+-------------+--------+
|order_item_id|order_id|product_id|quantity|product_price|subtotal|
+-------------+--------+----------+--------+-------------+--------+
|            1|       1|       957|       1|       299.98|  299.98|
|            2|       2|      1073|       1|       199.99|  199.99|
|            3|       2|       502|       5|         50.0|   250.0|
|            4|       2|       403|       1|       129.99|  129.99|
|            5|       4|       897|       2|        24.99|   49.98|
|            6|       4|       365|       5|        59.99|  299.95|
|            7|       4|       502|       3|         50.0|   150.0|
|            8|       4|      1014|       4|        49.98|  199.92|
|            9|       5|       957|       1|       299.98|  299.98|
|           10|       5|       365|       5|        59.99|  299.95|
|           11|       5|      1014|       2|        49.98|   99.96|
|           12|       5|       957|       1|    

In [95]:
# selectExpr() accepts SQL expression strings directly; no expr() wrapper needed.
df.selectExpr(
    "*",
    "product_price * quantity as subtotal",
).show()

+-------------+--------+----------+--------+-------------+--------+
|order_item_id|order_id|product_id|quantity|product_price|subtotal|
+-------------+--------+----------+--------+-------------+--------+
|            1|       1|       957|       1|       299.98|  299.98|
|            2|       2|      1073|       1|       199.99|  199.99|
|            3|       2|       502|       5|         50.0|   250.0|
|            4|       2|       403|       1|       129.99|  129.99|
|            5|       4|       897|       2|        24.99|   49.98|
|            6|       4|       365|       5|        59.99|  299.95|
|            7|       4|       502|       3|         50.0|   150.0|
|            8|       4|      1014|       4|        49.98|  199.92|
|            9|       5|       957|       1|       299.98|  299.98|
|           10|       5|       365|       5|        59.99|  299.95|
|           11|       5|      1014|       2|        49.98|   99.96|
|           12|       5|       957|       1|    

In [18]:
# Load the headerless products CSV and let Spark infer the column types.
product_df = (
    spark.read
    .format("csv")
    .option("inferschema", "true")
    .load("source_file_path")
)

In [19]:
# Peek at a single row to see the default _c0.._c5 column names.
product_df.show(1)

+---+---+--------------------+----+-----+--------------------+
|_c0|_c1|                 _c2| _c3|  _c4|                 _c5|
+---+---+--------------------+----+-----+--------------------+
|  1|  2|Quest Q64 10 FT. ...|null|59.98|http://images.acm...|
+---+---+--------------------+----+-----+--------------------+
only showing top 1 row



In [20]:
# Assign column names positionally.
# NOTE(review): the names are spelled "poduct_*" (probably meant "product_*").
# A later cell references "poduct_price", so any rename must be done in both
# places together.
df1 = product_df.toDF(
    "poduct_id",
    "poduct_category_id",
    "poduct_name",
    "poduct_description",
    "poduct_price",
    "poduct_image",
)

In [21]:
# Peek at a single row to confirm the renamed columns.
df1.show(1)

+---------+------------------+--------------------+------------------+------------+--------------------+
|poduct_id|poduct_category_id|         poduct_name|poduct_description|poduct_price|        poduct_image|
+---------+------------------+--------------------+------------------+------------+--------------------+
|        1|                 2|Quest Q64 10 FT. ...|              null|       59.98|http://images.acm...|
+---------+------------------+--------------------+------------------+------------+--------------------+
only showing top 1 row



In [22]:
# Overwrite the price column with its value multiplied by 1.2; reusing an
# existing column name in withColumn() replaces that column.
df2 = df1.withColumn(
    "poduct_price",
    expr("poduct_price * 1.2"),
)

In [23]:
# Display the updated prices (values such as 107.98799999999999 in the
# recorded output are floating-point artefacts of the multiplication).
df2.show()

+---------+------------------+--------------------+------------------+------------------+--------------------+
|poduct_id|poduct_category_id|         poduct_name|poduct_description|      poduct_price|        poduct_image|
+---------+------------------+--------------------+------------------+------------------+--------------------+
|        1|                 2|Quest Q64 10 FT. ...|              null|            71.976|http://images.acm...|
|        2|                 2|Under Armour Men'...|              null|           155.988|http://images.acm...|
|        3|                 2|Under Armour Men'...|              null|107.98799999999999|http://images.acm...|
|        4|                 2|Under Armour Men'...|              null|107.98799999999999|http://images.acm...|
|        5|                 2|Riddell Youth Rev...|              null|           239.988|http://images.acm...|
|        6|                 2|Jordan Men's VI R...|              null|           161.988|http://images.acm...|
|

## How to remove duplicate records from a DataFrame

In [24]:
# Sample rows for the de-duplication demo; the first tuple is deliberately
# repeated so there is an exact duplicate to remove.
my_list = [(1, "A", 34), (1, "A", 34), (1, "B", 26), (2, "B", 26)]


In [25]:
# Turn the local list of tuples into a DataFrame, then name its three columns.
df = (
    spark.createDataFrame(my_list)
    .toDF("id", "name", "age")
)

In [26]:
# Display all four rows, including the duplicated (1, A, 34) row.
df.show()

+---+----+---+
| id|name|age|
+---+----+---+
|  1|   A| 34|
|  1|   A| 34|
|  1|   B| 26|
|  2|   B| 26|
+---+----+---+



In [28]:
# distinct() returns a new DataFrame with exact duplicate rows removed.
# show() only prints and returns None, so bind the DataFrame to df2 first and
# display it as a separate step; otherwise df2 would end up as None.
df2 = df.distinct()
df2.show()

+---+----+---+
| id|name|age|
+---+----+---+
|  1|   B| 26|
|  1|   A| 34|
|  2|   B| 26|
+---+----+---+



## Creating a Spark session

In [1]:
from pyspark.sql import SparkSession

In [3]:
import getpass

# Resolve the current OS user so the warehouse directory is per-user.
username = getpass.getuser()

# Build (or reuse) a named, Hive-enabled Spark session on YARN.
spark = (
    SparkSession.builder
    .appName("spark session demo")
    # The f-string is required for {username} to be interpolated; the path uses
    # the same /user/<name>/warehouse layout as the session built at the top of
    # this notebook.
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)