## Programs 

### Sample Program - 00 : parallelize()

In [None]:
# Importing necessary packages
from pyspark.sql import SparkSession

# Creating a pyspark session and naming the app and giving the configurations
spark = SparkSession.builder.appName("Python").config("spark.some.config.option", "some-value").getOrCreate()

# Creating RDD and converting it as a DataFrame in single line
df = spark.sparkContext.parallelize([("Alice", 25), ("Bob", 23), ("Charlie", 35)]).toDF(["Name", "Age"])

# Print the DF
df.show()

# Stop the spark Session
spark.stop()

### Sample Program - 01 : createDataFrame() from RDD

In [None]:
"""
    Program which create a DataFrame from python list and show, as well as print the structure of DataFrame
"""

# Import PySpark
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
        .appName("PySpark-Get-Started") \
        .getOrCreate()

# Creating a List
data = [("Alice", 25), ("Bob", 23), ("Charlie", 35)]

# Creating DataFrame
df = spark.createDataFrame(data, ["Name", "Age"])

# Print some info
df.show(3)          # Default 20 rows
df.printSchema()    # Print the DF schema

# Stoping the SparkSession
spark.stop()

### Sample Program - 02 : createDataFrame() from python list

In [None]:
"""
    Program which create a DataFrame from python list
"""

# Importing necessary packages
from pyspark.sql import SparkSession

# Creating a pyspark session and naming the app and giving the configurations
spark = SparkSession.builder.appName("Python").config("spark.some.config.option", "some-value").getOrCreate()

# Create DataFrame
data = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',-1)
]

# Column names
columns = ["firstname","middlename","lastname","dob","gender","salary"]

# Creating DataFrame from the python list directly
df = spark.createDataFrame(data=data, schema = columns)

# Stoping the SparkSession
spark.stop()


### Sample Program - 03 : parallelize()

In [None]:
"""
    Program which create a RDD using parallelize() and uses collect() action
"""

# Importing necessary packages
from pyspark.sql import SparkSession

# Creating a pyspark session and naming the app and giving the configurations
spark = SparkSession.builder.appName("Python").config("spark.some.config.option", "some-value").getOrCreate()

# Creating an RDD
mydata = spark.sparkContext.parallelize([(1,2),(3,4),(5,6),(7,8),(9,10)])

# Using the action
mydata.collect()

# Stoping the SparkSession
spark.stop()

### Sample Program - 04 : pyspark.SparkContext() and count()

In [None]:
"""
    Program which create a RDD using parallelize() using SparkContext without SparkSession and uses count() action
"""

# Importing necessary packages
from pyspark import SparkContext

# Create SparkContext
sc = SparkContext("local", "count app")

# Creating a RDD using parallelize
words = sc.parallelize (
   ["scala", 
   "java", 
   "hadoop", 
   "spark", 
   "akka",
   "spark vs hadoop", 
   "pyspark",
   "pyspark and spark"]
)

# Using the count() action
counts = words.count()

# Print the output
print(f"The total count was : {counts}")

# Stoping the SparkSession
sc.stop()

### Sample Program - 05 : DataFrame using external source (.csv file) via format()

In [1]:
"""
    Loading data from external source .csv file using format() and creating DataFrame
"""

# Importing necessary packages
from pyspark.sql import SparkSession

# Creating a Spark session and naming the app
spark = SparkSession.builder \
    .appName("Python") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# Creating a DataFrame by reading data from a .csv file
df = spark.read.format("csv") \
    .options(header='true', inferSchema='true') \
    .load("path/to/your/file.csv")  # Provide the correct path to your CSV file

# Print the DataFrame
df.show(5)  # Only shows 5 rows

# Print the schema of the DataFrame
df.printSchema()

# Stoping the SparkSession
spark.stop()

24/08/09 01:10:16 WARN Utils: Your hostname, neon-HP-Pavilion-Gaming-Laptop-15-ec1xxx resolves to a loopback address: 127.0.1.1; using 192.168.1.11 instead (on interface wlo1)
24/08/09 01:10:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/09 01:10:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/home/neon/Essentials/Jupyter/path/to/your/file.csv.

### Sample Program - 06 : first(), take(), foreach()

In [None]:
"""
    Program which create a RDD and retrive first and N elements
"""

# Importing necessary packages
from pyspark.sql import SparkSession

# Creating a pyspark session and naming the app and giving the configurations
spark = SparkSession.builder.appName("Python").config("spark.some.config.option", "some-value").getOrCreate()

# Create DataFrame
data = ["scala", 
   "java", 
   "hadoop", 
   "spark", 
   "akka",
   "spark vs hadoop", 
   "pyspark",
   "pyspark and spark"]

# Converting the data to RDD
rdd = spark.sparkContext.parallelize(data)

# Taking first element of the RDD and printing it
first_element = rdd.first()
print(f'The first element was = {first_element}')

# Taking some number of element from RDD and printing it
elements_taken = rdd.take(2)
print(f'The elements taken was = {elements_taken}')

# Printing all the elements using lambda and foreach
rdd.foreach(lambda x: print(x))

# Stoping the SparkSession
spark.stop()

### Sample Program - 07 : map, filter, sort and reducebykey

In [None]:
"""
    Program which create a RDD and use map, reduce and reducebykey transformation
"""

# Importing necessary packages
from pyspark.sql import SparkSession

# Creating a pyspark session and naming the app and giving the configurations
spark = SparkSession.builder.appName("Python").config("spark.some.config.option", "some-value").getOrCreate()

# Create DataFrame
data = [("Alice", 25), ("Bob", 23), ("Charlie", 35), ("Bob", 22)]

# Converting the data to RDD
rdd = spark.sparkContext.parallelize(data)

# New RDD was created, because we used the map transformation map here
mapped_rdd = rdd.map(lambda x: (x[0].upper(), x[1]))

# New RDD was created, because we used the map transformation filter here
filtered_rdd = rdd.filter(lambda x: x[1] < 30)

# New RDD was created, because we used the map transformation reduceByKey here
reduced_rdd = rdd.reduceByKey(lambda x, y: x + y)

# New RDD was created, because we used the map transformation sortBy here
sorted_rdd = rdd.sortBy(lambda x: x[1], ascending = False)

# This is a RDD to print this we need to do collect() 
print(mapped_rdd)
print(filtered_rdd)
print(reduced_rdd)
print(sorted_rdd)

# Printing the actual data using collect() action
print(f'The final mapped data was : {mapped_rdd.collect()}')
print(f'The final filtered data was : {filtered_rdd.collect()}')
print(f'The final reduced data was : {reduced_rdd.collect()}')
print(f'The final sorted data was : {sorted_rdd.collect()}')

# Stop the spark session
spark.stop()

### Sample Program - 08 : Saving RDD result using saveAsTextFile() and retrieving using sc.textFile()

In [None]:
"""
    Program which stores RDD into file and retrieve the same and print it
"""

# Importing necessary packages
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
        .appName("PySpark-Get-Started") \
        .getOrCreate()

# Creating a RDD using parallelize
rdd = spark.sparkContext.parallelize (
   ["scala", 
   "java", 
   "hadoop", 
   "spark", 
   "akka",
   "spark vs hadoop", 
   "pyspark",
   "pyspark and spark"]
)

# Saving the RDD in Text file
rdd.saveAsTextFile("./Dependencies/output.txt")

# Retrieving the same RDD by reading from Text file
rdd_text = spark.sparkContext.textFile("./Dependencies/output.txt")

# Printing the content of the RDD file
print(f'The content of the file : {rdd_text.collect()}')

# Stop the SparkSession
spark.stop()

### Sample Program - 09 : Some Operations on RDD as well as DF

In [None]:
"""
    The program which perform some operations on RDD and DF
"""

from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
        .appName("DF-Creation") \
        .getOrCreate()

# Create an RDD (spark session initiated sparkContext)
rdd = spark.sparkContext.textFile("./Dependencies/data.txt")

# Flat the RDD, make each word in each sentence as a list (like 1D array)
# Get Mapped to the RDD here
# Remove duplicates as same key will be processed here
# Sort in decending order carried out here

result_rdd = rdd.flatMap(lambda line: line.split(" ")) \
             .map(lambda word: (word, 1)) \
             .reduceByKey(lambda x, y: x + y) \
             .sortBy(lambda x: x[1], ascending = False)    

# Printing the resultant top 10 words
print(f"The resultant set was : {result_rdd.take(10)}")

# Create an DF (spark session initiated sqlContext)
df = spark.read.text("/home/neon/Essentials/Jupyter/data.txt")

# SQL like format to interact with DF
result_df = df.selectExpr("explode(split(value, ' ')) as word") \
              .groupBy("word").count() \
              .orderBy("count")  # .orderBy(desc("count"))

# Printing the resultant top 5 words
print(f"The resultant set was : {result_df.take(5)}")

# Stop the SparkSession
spark.stop()

### Sample Program - 10 : .CSV files

In [None]:
# %%bash
# head -10 ./products.csv

"""
    The program for play around with .csv files
"""

from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
        .appName("DF-Creation") \
        .getOrCreate()

df = spark.read.csv("./Dependencies/products.csv", header=True)

df.printSchema()

# For the correct datatype detection
df = spark.read.csv("./Dependencies/products.csv", header=True, inferSchema=True)

df.printSchema()

# Explicit schema ..
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# The schema we going to use ..
schema = StructType([
	StructField(name="id", dataType=IntegerType(), nullable=True),
	StructField(name="name", dataType=StringType(), nullable=True),
	StructField(name="category", dataType=StringType(), nullable=True),
	StructField(name="quantity", dataType=IntegerType(), nullable=True),
	StructField(name="price", dataType=DoubleType(), nullable=True)
])

# Using the schame to modify the default one
df = spark.read.csv("./Dependencies/products.csv", header=True, schema=schema)

df.printSchema()

spark.stop()

### Sample Program - 11 : .Json files

In [None]:
"""
    The program for play around with .json files
"""

from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
        .appName("DF-Creation") \
        .getOrCreate()

# Single line json file
df = spark.read.json("./Dependencies/products_singleline.json")
df.printSchema()

# Multi line json file
df = spark.read.json("./Dependencies/products_multiline.json", multiLine=True)
df.printSchema()

# Write the data into .parquet file
df.write.parquet("./Dependencies/products_parquet.parquet")

# Read the data from .parquet file
df_data = spark.read.parquet("./Dependencies/products_parquet.parquet")

df_data.printSchema()
df_data.show(5)

spark.stop()

### Sample Program - 12 : Few operations on DataFrames

In [None]:
"""
    Program which demonstrate few operations on DataFrames
"""

from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .appName("DF-Operations") \
        .getOrCreate()

df = spark.read.csv("./Dependencies/stocks.txt", header=True, inferSchema=True)

df.printSchema()

# Select specific column
selected_column = df.select('id', 'name')
selected_column.show(10)

# Filter it respected to the given condition
filtered_column = df.filter(df.quantity > 20)
filtered_column.show(10)

# GroupBy()
grouped_column = df.groupBy("category").agg({"quantity": "sum", "price": "avg"})
grouped_column.show()

# Join() using ID
df2 = df.select("id", "category").limit(10)
joined_column = df.join(df2, "id", "inner")
joined_column.show()

# OrderBy sorting but single column
sorted_column = df.orderBy("price")
sorted_column.show(10)

# Sorting but multi column
from pyspark.sql.functions import col, desc
sorted_column_data = df.orderBy(col("price").desc(), col("id").desc())
sorted_column_data.show(10)

# Getting Unique rows
unique_column = df.select("category").distinct()
unique_column.show()

# Remove / deleting / dropping the columns
retained_column = df.drop("quantity", "category")
retained_column.show(10)

# Add a new column to the table
data_with_new_column = df.withColumn("revenue", df.quantity * df.price)
data_with_new_column.show(10)

# Renaming the column
renamed_column = df.withColumnRenamed("price", "cost")
renamed_column.show(10)

# Stop the spark session
spark.stop()



### Sample Program - 13 : Spark SQL

In [None]:
"""
    Using Spark SQL concept, using SQL queries by registering as TempView
"""

from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
        .appName("DF-SQL-Operation") \
        .getOrCreate()

df = spark.read.csv("./Dependencies/persons.csv", header=True, inferSchema=True)

df.printSchema()
df.show(5)

# Register temporary table (It will be registered in the spark session, we can access it using spark session object)
df.createOrReplaceTempView("my_table")

# Run SQL queries over the created temp view table which is registered under spark session
result = spark.sql("select * from my_table where age > 25")
result.show()

# To check whether the table with this name exists or not under the mentioned spark session
print(spark.catalog.tableExists("my_table"))

# Drop the temp view table
spark.catalog.dropTempView("my_table")
print(spark.catalog.tableExists("my_table"))

# SUB QUERIES
# -----------

# Create new DataFrames
employee_data = [
    (1, "John"), (2, "Alice"), (3, "Bob"), (4, "Emily"),
    (5, "David"), (6, "Sarah"), (7, "Michael"), (8, "Lisa"),
    (9, "William")]

employees = spark.createDataFrame(employee_data, ["id", "name"])

salary_data = [
    ("HR", 1, 60000), ("HR", 2, 55000), ("HR", 3, 58000),
    ("IT", 4, 70000), ("IT", 5, 72000), ("IT", 6, 68000),
    ("Sales", 7, 75000), ("Sales", 8, 78000), ("Sales", 9, 77000)]

salaries = spark.createDataFrame(salary_data, ["department", "id", "salary"])

# Register as temporary views
employees.createOrReplaceTempView("employees")
salaries.createOrReplaceTempView("salaries")

# Subquery to find employees with salaries above average
result = spark.sql("""
    SELECT name
    FROM employees
    WHERE id IN (
        SELECT id
        FROM salaries
        WHERE salary > (SELECT AVG(salary) FROM salaries)
    )
""")

result.show()

spark.stop()

### Sample Program - 14 : Windows Functions

In [None]:
"""
    Windows Functions
"""

from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Create a SparkSession
spark = SparkSession.builder \
        .appName("DF-SQL-Operation") \
        .getOrCreate()

employee_salary = spark.sql("""
    select  salaries.*, employees.name
    from salaries 
    left join employees on salaries.id = employees.id
""")

employee_salary.show()

# Create a window specification
window_spec = Window.partitionBy("department").orderBy(F.desc("salary"))

# Calculate the rank of employees within each department based on salary
employee_salary.withColumn("rank", F.rank().over(window_spec)).show()

# Stop the SparkSession
spark.stop()

### Sample Program - 15 : BroadCast

In [None]:
"""
    BroadCast
"""

# Importing neccessary packages
from pyspark.sql import SparkSession

# Creating the spark session
spark = SparkSession.builder \
        .appName("Shared-Variables") \
        .getOrCreate()

# Creating a broadcast variable
words_new = spark.sparkContext.broadcast(["scala", "java", "hadoop", "spark", "akka"])

# Assinging the broadcast value to a new variable
data = words_new.value 

# Fetching a specific value and printing
print(f"Printing a particular element in RDD = {words_new.value[2]}")

# Stoping the spark session
spark.stop()

### Sample Program - 16 : Accumulator

In [None]:
"""
    Accumulator : aggregating the things using associative and cumulative properties / expression
"""

# Importing neccessary packages
from pyspark.sql import SparkSession

# Creating the spark session
spark = SparkSession.builder \
        .appName("Shared-Variables") \
        .getOrCreate()

# Create a accumulator value named 'num'
num = spark.sparkContext.accumulator(10) 

# Defining the function for each element execution
def f(x): 
   global num 
   num+=x 

# Creating the RDD using parallelize
rdd = sc.parallelize([20,30,40,50]) 

# Using foreach to loop and iterate each element
rdd.foreach(f) 

# Accessing the num (shared variable)
final = num.value 

# Printing the final result
print(f"Accumulated value is : {final}")

# Stop the spark session
spark.stop()

### Sample Program - 17 : Union

In [21]:
"""
    Union
"""

# Importing neccessary packages
from pyspark.sql import SparkSession, Row

# Creating the spark session
spark = SparkSession.builder \
        .appName("Union-Rows") \
        .getOrCreate()

person_row = Row(101, "Robert", "Ownes", ["Men in Black III", "Home Alone"] ,4300.64, "http://someimage.com", "1964-08-18", True)
person_rows_list = [Row(102, "Kenny", "Bobien", ["Men in Black II", "Home Alone"] ,4300.64, "http://someimage.com", "1964-08-18", True),
                     Row(103, "Sara", "Devine", ["Men in Black I", "Home Alone"] ,4300.64, "http://someimage.com", "1964-08-18", True)]

person_rows_list.append(person_row)

headers = ['id', 'first_name', 'last_name', 'fav_movie', 'salary', 'image_url', 'date_of_birth', 'active']

new_person_df = spark.createDataFrame(person_rows_list, headers)

new_person_df.show(truncate=False)

# Using Union : we can combine the new DF with existing DF
union_df = new_person_df.union(new_person_df)

union_df.show(truncate=False)

spark.stop()

+---+----------+---------+------------------------------+-------+--------------------+-------------+------+
|id |first_name|last_name|fav_movie                     |salary |image_url           |date_of_birth|active|
+---+----------+---------+------------------------------+-------+--------------------+-------------+------+
|102|Kenny     |Bobien   |[Men in Black II, Home Alone] |4300.64|http://someimage.com|1964-08-18   |true  |
|103|Sara      |Devine   |[Men in Black I, Home Alone]  |4300.64|http://someimage.com|1964-08-18   |true  |
|101|Robert    |Ownes    |[Men in Black III, Home Alone]|4300.64|http://someimage.com|1964-08-18   |true  |
+---+----------+---------+------------------------------+-------+--------------------+-------------+------+

+---+----------+---------+------------------------------+-------+--------------------+-------------+------+
|id |first_name|last_name|fav_movie                     |salary |image_url           |date_of_birth|active|
+---+----------+---------+-

### Sample Program - 18 : User Defined Functions

In [10]:
"""
    Spark User Defined Functions were poorly optimized, we need to ignore as much as possible.
"""

# Importing neccessary packages
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col

# Creating the spark session
spark = SparkSession.builder \
        .appName("User-Func") \
        .getOrCreate()

data = [('swami', 85), ('nathan', 95), ('rahul', 55)]

df = spark.createDataFrame(data, schema=['Name', 'Mark'])

def gradder(mark: int) -> str:
    
    grade = ''
    if mark > 100:
        return 'Cheating'
    elif mark > 90:
        return 'A'
    elif mark > 80:
        return 'B'
    elif mark > 70:
        return 'C'
    else:
        return 'F'

gradderUDF = udf(gradder)

df_grades = df.select("Name", "Mark", gradderUDF(col("Mark")).alias("grade"))

df_grades.show()

spark.stop()

+------+----+-----+
|  Name|Mark|grade|
+------+----+-----+
| swami|  85|    B|
|nathan|  95|    A|
| rahul|  55|    F|
+------+----+-----+



### Sample Program - 19 : Renaming a column in DataFrame

In [19]:
"""
    Renaming a column in DataFrame
"""

# Importing neccessary packages
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, countDistinct

# Creating the spark session
spark = SparkSession.builder \
        .appName("Aggregation") \
        .getOrCreate()

flight_file = './Dependencies/flight-summary.csv'
flight_summary_df = spark.read.csv(flight_file, header=True, inferSchema=True)

flight_summary_df = flight_summary_df.withColumnRenamed('count', 'flight_count')

flight_summary_df.printSchema()
flight_summary_df.show(5)

flight_summary_df.select(count("dest_airport"), countDistinct("dest_airport")).show()

spark.stop()

root
 |-- origin_code: string (nullable = true)
 |-- origin_airport: string (nullable = true)
 |-- origin_city: string (nullable = true)
 |-- origin_state: string (nullable = true)
 |-- dest_code: string (nullable = true)
 |-- dest_airport: string (nullable = true)
 |-- dest_city: string (nullable = true)
 |-- dest_state: string (nullable = true)
 |-- flight_count: integer (nullable = true)

+-----------+--------------------+------------+------------+---------+--------------------+---------+----------+------------+
|origin_code|      origin_airport| origin_city|origin_state|dest_code|        dest_airport|dest_city|dest_state|flight_count|
+-----------+--------------------+------------+------------+---------+--------------------+---------+----------+------------+
|        BQN|Rafael Hernández ...|   Aguadilla|          PR|      MCO|Orlando Internati...|  Orlando|        FL|         441|
|        PHL|Philadelphia Inte...|Philadelphia|          PA|      MCO|Orlando Internati...|  Orlando|

### Sample Program - 20 : Catch and Persists

In [None]:
# How to cache data in memory 
df_from_csv.cache()

# How to persist data in local disk 
df_from_csv.persist(storageLevel=StorageLevel.DISK_ONLY)

ModuleNotFoundError: No module named 'org'

## Test Programs

### Test Program - 01 

#### Challenge : Daily Temperature (F to C conversion)

In [23]:
# Importing necessary packages
from pyspark.sql import SparkSession

# Creating the spark session
spark = SparkSession.builder \
        .appName("Daily-Temp") \
        .getOrCreate()

# Python list as a data
fahrenheit = [('Day1', 59), ('Day2',57.2), ('Day3',53.6), ('Day4',55.4), ('Day5',51.8), ('Day6',53.6), ('Day7',55.4)]

# Creating an RDD using parallelize()
rdd = spark.sparkContext.parallelize(fahrenheit)

# Converting the Fahrenheit to Celsius
mapped_rdd = rdd.map(lambda x: (x[0], (x[1] - 32) * (5 / 9)))

# This is list, collect() gives list not an RDD
rdd_result = mapped_rdd.collect()

# Print the List
for day, value in rdd_result:
    print(f"In {day} = {value:.2f} C")

# Create new RDD which has only numbers
new_rdd = mapped_rdd.map(lambda x: x[1])

print(new_rdd.collect())

# Fetch the value equal or greater than 13
reduced_rdd = mapped_rdd.filter(lambda x: x[1] >= 13)

# Print the results
print(f"Around {reduced_rdd.count()} days have Temperature greater then 13 C\nThey were : {reduced_rdd.collect()}")

# Stop the spark session
spark.stop()

In Day1 = 15.00 C
In Day2 = 14.00 C
In Day3 = 12.00 C
In Day4 = 13.00 C
In Day5 = 11.00 C
In Day6 = 12.00 C
In Day7 = 13.00 C
[15.0, 14.000000000000002, 12.000000000000002, 13.0, 10.999999999999998, 12.000000000000002, 13.0]
Around 4 days have Temperature greater then 13 C
They were : [('Day1', 15.0), ('Day2', 14.000000000000002), ('Day4', 13.0), ('Day7', 13.0)]


### Test Program - 02

#### Challenge : Preprocessing and Cleansing

In [58]:
# Importing necessary packages
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, to_timestamp, year, month
from pyspark.sql.types import IntegerType, FloatType

# Creating the spark session
spark = SparkSession.builder \
        .appName("Data-Preprocessing-Cleansing") \
        .getOrCreate()

# Creating the DF throught the external file
df = spark.read.csv("./Dependencies/Sales_data.csv", header=True, inferSchema=True)

# Checking whether data loaded correctly or not
df.select("Product").distinct().show(10, truncate=False)

# Checking whether Null value exists or not 
df.filter(col("Order ID").isNull() == True).show(10)

# To remove Null values / Bad data value removed
not_null_df = df.na.drop("any")

# Checking whether Null value exists or not 
not_null_df.filter(col("Product").isNull() == True).show(10)

# Checking basic info of all columns
not_null_df.describe("Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address").show(10)

# Checking the Purchase Address
not_null_df.select("Purchase Address", split(col("Purchase Address"), ",").getItem(2)).show(10, truncate=False)

not_null_df = not_null_df.withColumn("City", split(col("Purchase Address"), ",").getItem(1)) \
                         .withColumn("State", split(split(col("Purchase Address"), ",").getItem(2), ' ').getItem(1))

not_null_df.show(10)

not_null_df = (not_null_df.withColumn("OrderID", col("Order ID").cast(IntegerType()))
                              .withColumn ("Quantity", col("Quantity Ordered").cast(IntegerType()))
                              .withColumn ("Price", col("Price Each").cast(FloatType()))
                              .withColumn ("OrderDate", to_timestamp(col("Order Date"), "MM/dd/yy HH:mm"))
                              .withColumnRenamed("Purchase Address", "StoreAddress")
                              .drop("Order ID")
                              .drop("Quantity Ordered")
                              .drop ("Price Each")
                              .drop ("Purchase Address"))

not_null_df = (not_null_df.withColumn ("ReportYear", year(col ("OrderDate")))
                              .withColumn ("Month", month(col("OrderDate"))))

not_null_df.show(10, truncate=True)

output_file = "./Dependencies/Sales_output_data"

not_null_df.write.mode("overwrite").partitionBy("ReportYear", "Month").parquet(output_file)

# Stopping the spark session
spark.stop()

+--------------------------+
|Product                   |
+--------------------------+
|Wired Headphones          |
|Macbook Pro Laptop        |
|Apple Airpods Headphones  |
|iPhone                    |
|Lightning Charging Cable  |
|Bose SoundSport Headphones|
|USB-C Charging Cable      |
|AAA Batteries (4-pack)    |
|20in Monitor              |
|27in FHD Monitor          |
+--------------------------+
only showing top 10 rows

+--------+-------+----------------+----------+----------+----------------+
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
+--------+-------+----------------+----------+----------+----------------+
|    NULL|   NULL|            NULL|      NULL|      NULL|            NULL|
|    NULL|   NULL|            NULL|      NULL|      NULL|            NULL|
|    NULL|Product|            NULL|      NULL|Order Date|Purchase Address|
|    NULL|   NULL|            NULL|      NULL|      NULL|            NULL|
|    NULL|Product|            NULL|      NU

                                                                                

### Test Program - 03 

#### Challenge : Analysis

In [90]:
"""
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, round, sum, collect_list

spark = SparkSession.builder.appName("SalesAnalytics").getOrCreate()

source_output_data = "./Dependencies/Sales_output_data/"
partition = "ReportYear=2019"

sales2019_df = spark.read.parquet(f"{source_output_data}{partition}")
sales2019_df.show(truncate=True)

new_sales2019_df = sales2019_df.select("OrderID", "Month", "Price", "Quantity", expr("Price * Quantity").alias("Sales"))

new_one = new_sales2019_df.groupBy("Month").agg(round(sum("Sales"),2).alias("TotalSale")) \
                .orderBy("TotalSale", ascending=False).limit(1).show()

sales2019_df.groupBy("City") \
            .agg(sum("Quantity").alias("Quan")) \
            .orderBy("Quan", ascending=False) \
            .limit(1).show()

sales2019_df.where(col("State") == "NY").orderBy("Quantity", ascending=False).limit(1).show()

sales_q4_df = (sales2019_df.where(col("State") == 'NY')
.orderBy ("OrderID", "Product" )
.groupBy ("OrderID" ,"State")
.agg(collect_list("Product").alias("ProductList")))

sales_q4_df.show(10, truncate=False)

spark.stop()

+--------------------+--------------+--------------------+--------------+-----+-------+--------+------+-------------------+-----+
|             Product|    Order Date|        StoreAddress|          City|State|OrderID|Quantity| Price|          OrderDate|Month|
+--------------------+--------------+--------------------+--------------+-----+-------+--------+------+-------------------+-----+
|USB-C Charging Cable|04/19/19 08:46|917 1st St, Dalla...|        Dallas|   TX| 176558|       2| 11.95|2019-04-19 08:46:00|    4|
|Bose SoundSport H...|04/07/19 22:30|682 Chestnut St, ...|        Boston|   MA| 176559|       1| 99.99|2019-04-07 22:30:00|    4|
|        Google Phone|04/12/19 14:38|669 Spruce St, Lo...|   Los Angeles|   CA| 176560|       1| 600.0|2019-04-12 14:38:00|    4|
|    Wired Headphones|04/12/19 14:38|669 Spruce St, Lo...|   Los Angeles|   CA| 176560|       1| 11.99|2019-04-12 14:38:00|    4|
|    Wired Headphones|04/30/19 09:27|333 8th St, Los A...|   Los Angeles|   CA| 176561|   