Initialise The PySpark Environment

In [None]:
# Create and initialise a PySpark session using the PySpark API
import os
from pyspark.sql import SparkSession # type: ignore
from pyspark.sql.types import StringType,IntegerType,StructType,StructField # type: ignore
from pyspark.sql.functions import * # type: ignore
# Build (or reuse, if one already exists) the SparkSession that every later
# cell uses through the `spark` handle.
spark = (
    SparkSession.builder
    .appName("MyPySparkAutomatic")
    .getOrCreate()
)

# Confirmation message so the reader knows the session is ready for reads,
# transformations and writes.
print("SparkSession created successfully!")




Read The CSV Data(DataFrame: flight_df_1)

In [None]:
# Read the CSV file into a Spark DataFrame.
#   header=true      -> the first line supplies the column names
#   inferSchema=true -> Spark inspects the data to choose column types
#   mode=PERMISSIVE  -> malformed rows are kept instead of failing the read
file_path = r"C:\Users\Shivam Gupta\OneDrive\Documents\Shivam_Developement\PYTHON\python_tutorial\2015-summary.csv"
flight_df_1 = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("mode", "PERMISSIVE")
    .load(file_path)
)

# count() is a Spark action that scans the file, so run it once and reuse the
# result for both the display and the summary line below.
flight_df_1_row_count = flight_df_1.count()

# Show the DataFrame (every row, without truncating long values)
flight_df_1.show(n=flight_df_1_row_count, truncate=False)

# Show the total number of rows in the DataFrame
print("Total number of rows in the DataFrame:", flight_df_1_row_count)

# Print the schema of the DataFrame
flight_df_1.printSchema()

Create The Manual Schema(DataFrame: flight_df_2)

In [None]:

# Base folder, the input CSV, and the output folder for rejected rows
file_path_0 = r"C:\Users\Shivam Gupta\OneDrive\Documents\Shivam_Developement\PYTHON\python_tutorial"
file_path_1 = os.path.join(file_path_0, "2015-summary.csv")
file_path_2 = os.path.join(file_path_0, "bad_records")

# Manual schema for the DataFrame. The extra `_corrupt_record` string column
# gives PERMISSIVE mode a place to store the raw text of rows it cannot parse.
my_schema = StructType(
    [
        StructField("COUNTRY_1", StringType(), True),
        StructField("COUNTRY_2", StringType(), True),
        StructField("TOTAL_COUNT", IntegerType(), True),
        StructField("_corrupt_record", StringType(), True),
    ]
)

# Read the CSV with the manual schema so bad records are captured rather than dropped
flight_df_2 = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(my_schema)
    .option("mode", "PERMISSIVE")
    .load(file_path_1)
)

# Display every row, including the ones flagged as corrupt
flight_df_2.show(n=flight_df_2.count(), truncate=False)

# Keep only the rows Spark flagged as corrupt and write them out as JSON
bad_df = flight_df_2.filter("`_corrupt_record` IS NOT NULL")
bad_df.write.mode("overwrite").json(file_path_2)


# List whatever the write above produced in the bad-records folder
import os
print("Contents of the bad records directory:")
if not os.path.exists(file_path_2):
    print("No bad records directory found.")
else:
    for item in os.listdir(file_path_2):
        print(item)



How to read json file in pyspark(DataFrame: people_df)

In [None]:
# File paths
file_path_0 = r"C:\Users\Shivam Gupta\OneDrive\Documents\Shivam_Developement\PYTHON\python_tutorial"
file_path_1 = os.path.join(file_path_0, "2015-summary.csv")
file_path_2 = os.path.join(file_path_0, "bad_records")
file_path_3 = os.path.join(file_path_0, "multiline.json")

# Read a JSON file whose records span several lines.
# Only JSON-relevant options are set: "header" and "inferSchema" are CSV reader
# options and have no effect on the JSON reader, so they are not passed here.
#   multiLine=true  -> a single record may span multiple lines
#   mode=PERMISSIVE -> malformed records are kept instead of failing the read
people_df = (
    spark.read.format("json")
    .option("multiLine", "true")
    .option("mode", "PERMISSIVE")
    .load(file_path_3)
)

# count() is a Spark action, so run it once and reuse the result below
people_df_row_count = people_df.count()

# Show the DataFrame (every row, without truncating long values)
people_df.show(n=people_df_row_count, truncate=False)

# Show the total number of rows in the DataFrame (displayed as the cell's result)
people_df_row_count



How to read Parquet file in pyspark(DataFrame: parquet_df)

In [None]:
# Base folder and the Parquet part-file to load
file_path_0 = r"C:\Users\Shivam Gupta\OneDrive\Documents\Shivam_Developement\PYTHON\python_tutorial"
file_path_4 = os.path.join(file_path_0, "part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet")

# Parquet files carry their own schema, so no header/schema options are needed
parquet_df = spark.read.format("parquet").load(file_path_4)

# count() is a Spark action, so run it once and reuse the result for both the
# display and the summary line below.
parquet_df_row_count = parquet_df.count()

# Show the DataFrame (every row, without truncating long values)
parquet_df.show(n=parquet_df_row_count, truncate=False)

# Show the total number of rows in the DataFrame
print("Total number of rows in the DataFrame:", parquet_df_row_count)


Finding Metadata Information In Parquet File

In [None]:
########################################################################################################################################################
import pandas as pd
import pyarrow.parquet as pq

# Path to your .parquet file (used for both the pandas read and the metadata read)
file_path = r"C:\Users\Shivam Gupta\OneDrive\Documents\Shivam_Developement\PYTHON\python_tutorial\part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet"

try:
    # Read the whole Parquet file
    df = pd.read_parquet(file_path, engine="pyarrow")

    # ✅ Display all rows and columns
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(df)

    # 🔽 Optionally, export to CSV for easier viewing
    #output_csv = file_path.replace(".parquet", ".csv")
    #df.to_csv(output_csv, index=False)
    #print(f"\n✅ Full data exported to: {output_csv}")

except Exception as e:
    # Best-effort preview: report the problem and carry on to the metadata section
    print("❌ Error reading Parquet file:")
    print(e)



# Load the Parquet file (same path as above, rather than a second copy of the literal)
parquet_file = pq.ParquetFile(file_path)
parquet_metadata = parquet_file.metadata

print("File metadata:")
print(parquet_metadata)

# row_group(0) / column(0) raise on a file with no row groups or no columns,
# so only drill down when there is something to look at.
if parquet_metadata.num_row_groups > 0:
    first_row_group = parquet_metadata.row_group(0)

    print("\nFirst row group metadata:")
    print(first_row_group)

    if first_row_group.num_columns > 0:
        first_column = first_row_group.column(0)

        print("\nFirst column in first row group:")
        print(first_column)

        print("\nColumn statistics:")
        print(first_column.statistics)
    else:
        print("\nThe first row group has no columns.")
else:
    print("\nThe file has no row groups, so there is no row-group metadata to show.")



How To Write Dataframe on disk(dataframe:parquet_df)

In [None]:
# Base folder and the source Parquet part-file path
file_path_0 = r"C:\Users\Shivam Gupta\OneDrive\Documents\Shivam_Developement\PYTHON\python_tutorial"
file_path_4 = os.path.join(file_path_0, "part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet")

# Example: repartition parquet_df into 2 partitions and write it out as CSV
# (with a header row), replacing any previous output in the target folder.
(
    parquet_df.repartition(2)
    .write.format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save(os.path.join(file_path_0, "parquet_df_write_repartioned_2"))
)


Implementing Partitioning & Bucketing in Pyspark

In [None]:
# Write the DataFrame to CSV using partitioning: partitionBy splits the output
# on disk by the value of DEST_COUNTRY_NAME.
(
    parquet_df.write.format("csv")
    .mode("overwrite")
    .option("header", "true")
    .partitionBy("DEST_COUNTRY_NAME")
    .save(os.path.join(file_path_0, "parquet_df_write_partition_by_Example"))
)

# Bucketing example (reference only, intentionally not executed).
# It is kept as real comments rather than a bare triple-quoted string, because a
# string left as the last expression of a cell is echoed as the cell's output.
# bucketBy() is used together with saveAsTable() (not save()) and needs a real
# column name; the original snippet left the column name empty.
# NOTE(review): the original note said "CSV does not support bucketing" - confirm
# against your Spark version before relying on that.
#
# (
#     parquet_df.write.format("csv")
#     .mode("overwrite")
#     .option("header", "true")
#     .bucketBy(2, "<column_name>")
#     .saveAsTable("parquet_df_write_bucket_by_Example")
# )

Transformation in PySpark:How To Create Dataframe API
             


In [None]:
################################################# DATA ENGINEERING PIPELINE #################################################
# READ ----------------------------------------> TRANSFORM ----------------------------------------> WRITE
#                                   (DataFrame API or Spark SQL)

# Build a small DataFrame through the DataFrame API to practise transformations on.
# Rows to load into the DataFrame
data = [
    (1, 1),
    (2, 1),
    (3, 1),
    (4, 2),
    (5, 1),
    (6, 2),
    (7, 2),
]

# Column names; Spark infers the column types from the data
columns = ["id", "num"]

# Create the DataFrame from the rows and the column names
example_df = spark.createDataFrame(data, schema=columns)

# Inspect the contents, the inferred schema and the column names
example_df.show()
example_df.printSchema()
print(example_df.columns)  # `columns` is an attribute, not a callable method, in PySpark
example_df.count()


Transformation in PySpark:Using Select Method

In [None]:
# Load the employee CSV (header row present, column types inferred, malformed rows kept)
file_path = r"C:\Users\Shivam Gupta\OneDrive\Documents\Shivam_Developement\PYTHON\python_tutorial\employee_data.csv"
employee_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("mode", "PERMISSIVE")
    .load(file_path)
)

# select() accepts several ways of referring to a column; this example mixes
# plain names, a col() expression, attribute access and bracket access.
employee_df_1 = employee_df.select(
    "id",
    "salary",
    (col("id") + 5).alias("id_plus_5"),
    employee_df.gender,
    employee_df["address"],
)

# expr() lets each selected column be written as a SQL expression string
employee_df_2 = employee_df.select(
    expr("id+5").alias("id_plus_5"),
    expr("salary*2").alias("salary_times_2"),
    expr("concat(name, address)").alias("name_address"),
)

# Show both transformed DataFrames
employee_df_1.show(truncate=False)
employee_df_2.show(truncate=False)





Transformation in PySpark Using Spark SQL:Query using the select statement 

In [None]:
# Register the DataFrame as a temporary view so it can be queried with Spark SQL
employee_df.createOrReplaceTempView("employee_tbl")

# Run a SQL query against the view: employees earning more than 70000
employee_tbl_1 = spark.sql(
    """select * from employee_tbl where salary > 70000"""
)
employee_tbl_1.show(truncate=False)

Transformation in Pyspark: Using Filter,Aliases,Literal,Casting,etc

In [18]:
# Aliasing: derive a new column from an expression and name it with alias()
employee_df_4 = employee_df.select(
    "id",
    "salary",
    (col("id") + 5).alias("id_plus_5"),
    employee_df.gender,
    employee_df["address"],
)

# Filtering the DataFrame using the DataFrame API
employee_df_5 = employee_df.filter(col("address") == "JAPAN")
employee_df_6 = (
    employee_df
    .filter((col("address") == "JAPAN") & (col("salary") > 70000))
    .select("id", "salary", (col("id") + 5).alias("id_plus_5"))
)
# where() is used here with a SQL-style condition string
employee_df_7 = (
    employee_df
    .select("id", "salary", (col("id") + 5).alias("id_plus_5"))
    .where("address = 'JAPAN' and salary > 70000")
)

# lit() creates a column holding a constant value; two equivalent ways to add it
employee_df_8 = employee_df.select("*", lit("Gupta").alias("last_name"))
employee_df_9 = employee_df.withColumn("last_name", lit("Gupta"))

# Renaming the columns.
# The chain starts from employee_df_9 because that is where "last_name" exists.
# employee_df itself has no such column, and withColumnRenamed silently does
# nothing for a column that is not present, so the last_name rename would
# otherwise have no effect.
employee_df_10 = (
    employee_df_9.withColumnRenamed("id", "emp_id")
    .withColumnRenamed("salary", "emp_salary")
    .withColumnRenamed("address", "emp_address")
    .withColumnRenamed("gender", "emp_gender")
    .withColumnRenamed("name", "emp_name")
    .withColumnRenamed("last_name", "emp_last_name")
    .withColumnRenamed("age", "emp_age")
)

# Casting columns: id to string (via a type object), salary to long (via a type name)
employee_df_11 = (
    employee_df
    .withColumn("id", col("id").cast(StringType()))
    .withColumn("salary", col("salary").cast("long"))
)

# Dropping columns. Names that are not present in the DataFrame (here
# "last_name") are simply ignored by drop().
employee_df_12 = employee_df.drop("last_name", "age", "address", "gender", "name", "id")

# Show the results; the original employee_df is displayed last to confirm that
# none of the transformations above modified it.
employee_df_4.show(truncate=False)
employee_df_5.show(truncate=False)
employee_df_6.show(truncate=False)
employee_df_7.show(truncate=False)
employee_df_8.show(truncate=False)
employee_df_9.show(truncate=False)
employee_df_10.show(truncate=False)
employee_df_11.printSchema()
employee_df_12.show(truncate=False)
employee_df.show(truncate=False)


+---+------+---------+------+-------+
|id |salary|id_plus_5|gender|address|
+---+------+---------+------+-------+
|1  |75000 |6        |m     |INDIA  |
|2  |100000|7        |f     |USA    |
|3  |150000|8        |m     |INDIA  |
|4  |200000|9        |m     |JAPAN  |
|5  |300000|10       |m     |USA    |
|6  |300000|11       |m     |INDIA  |
|7  |540000|12       |m     |USA    |
|8  |70000 |13       |m     |JAPAN  |
|9  |150000|14       |m     |JAPAN  |
|10 |25000 |15       |f     |RUSSIA |
|11 |35000 |16       |f     |INDIA  |
|12 |200000|17       |f     |INDIA  |
|13 |650000|18       |m     |USA    |
|14 |95000 |19       |m     |RUSSIA |
|15 |750000|20       |m     |INDIA  |
+---+------+---------+------+-------+

+---+--------+---+------+-------+------+
|id |name    |age|salary|address|gender|
+---+--------+---+------+-------+------+
|4  |Prantosh|17 |200000|JAPAN  |m     |
|8  |Praveen |28 |70000 |JAPAN  |m     |
|9  |Dev     |32 |150000|JAPAN  |m     |
+---+--------+---+------+------

Transformation in Pyspark: Union & Union All(Same in DataFrame API But Different in Spark SQL)

In [None]:
# Rows for the first manager DataFrame (the last row is a deliberate duplicate)
data = [
    (10, 'Anil', 50000, 18),
    (11, 'Vikas', 75000, 16),
    (12, 'Nisha', 40000, 18),
    (13, 'Nidhi', 60000, 17),
    (14, 'Priya', 80000, 18),
    (15, 'Mohit', 45000, 18),
    (16, 'Rajesh', 90000, 10),
    (17, 'Raman', 55000, 16),
    (18, 'Sam', 65000, 17),
    (18, 'Sam', 65000, 17),
]
# Column names for the first DataFrame
schema = ['id', 'name', 'sal', 'mngr_id']
# Build the first DataFrame, then inspect its rows, schema and row count
manager_df_1 = spark.createDataFrame(data, schema)
manager_df_1.show(truncate=False)
manager_df_1.printSchema()
print("Total number of rows in the DataFrame:", manager_df_1.count())


# Rows for the second manager DataFrame
data1 = [
    (19, 'Sohan', 50000, 18),
    (20, 'Sima', 75000, 17),
]
# Column names for the second DataFrame (same layout as the first)
schema1 = ['id', 'name', 'sal', 'mngr_id']
# Build the second DataFrame, then inspect its rows, schema and row count
manager_df_2 = spark.createDataFrame(data1, schema1)
manager_df_2.show(truncate=False)
manager_df_2.printSchema()
print("Total number of rows in the DataFrame:", manager_df_2.count())


# Combine the two DataFrames three ways: union, unionAll and unionByName
manager_df_union = manager_df_1.union(manager_df_2)
manager_df_unionAll = manager_df_1.unionAll(manager_df_2)
manager_df_unionByName = manager_df_1.unionByName(manager_df_2)

# Show each combined DataFrame followed by its row count
for combined_df in (manager_df_union, manager_df_unionAll, manager_df_unionByName):
    combined_df.show(truncate=False)
    print("Total number of rows in the DataFrame:", combined_df.count())




Transformation in Pyspark: Case(if-else comparison using when/otherwise)

In [21]:

# Create data for DataFrame (includes null fields, an all-null row and a duplicate row)
emp_data = [
    (1, 'manish', 26, 20000, 'india', 'IT'),
    (2, 'rahul', None, 40000, 'germany', 'engineering'),
    (3, 'pawan', 12, 60000, 'india', 'sales'),
    (4, 'roshini', 44, None, 'uk', 'engineering'),
    (5, 'raushan', 35, 70000, 'india', 'sales'),
    (6, None, 29, 200000, 'uk', 'IT'),
    (7, 'adam', 37, 65000, 'us', 'IT'),
    (8, 'chris', 16, 40000, 'us', 'sales'),
    (None, None, None, None, None, None),
    (7, 'adam', 37, 65000, 'us', 'IT'),
]
# Create a schema for the DataFrame
schema = ['id', 'name', 'age', 'salary', 'address', 'department']
# Create a DataFrame using the DataFrame API
emp_df = spark.createDataFrame(emp_data, schema)
# Show the DataFrame
emp_df.show(truncate=False)
# Show the schema of the DataFrame
emp_df.printSchema()
# Show the total number of rows in the DataFrame
print("Total number of rows in the DataFrame:", emp_df.count())


# Flag whether each employee is an adult.
# The branches are checked in order, so the null check must come first.
# The threshold is inclusive (>= 18) so that an 18-year-old counts as an adult;
# with a strict "> 18" an age of exactly 18 was labelled "No".
emp_df_1 = emp_df.withColumn(
    "is_adult",
    when(col("age").isNull(), None)           # If age is null -> null
    .when(col("age") >= 18, "Yes")            # If age >= 18 -> "Yes"
    .otherwise("No")                          # Otherwise -> "No"
)

# Bucket each employee into an age band.
# The bands are contiguous: each branch only sees rows the earlier branches
# rejected, so "< 30" here means 18..29. The earlier version tested "< 18" and
# "> 18", which sent an age of exactly 18 to "major", and it also labelled
# rows with a null age as "major"; null ages now stay null.
emp_df_2 = emp_df.withColumn(
    "is_adult",
    when(col("age").isNull(), None)           # Unknown age -> null
    .when(col("age") < 18, "minor")           # Below 18 -> "minor"
    .when(col("age") < 30, "medium")          # 18 to 29 -> "medium"
    .otherwise("major")                       # 30 and above -> "major"
)

# Show the DataFrame
emp_df_1.show(truncate=False)
emp_df_2.show(truncate=False)






+----+-------+----+------+-------+-----------+
|id  |name   |age |salary|address|department |
+----+-------+----+------+-------+-----------+
|1   |manish |26  |20000 |india  |IT         |
|2   |rahul  |NULL|40000 |germany|engineering|
|3   |pawan  |12  |60000 |india  |sales      |
|4   |roshini|44  |NULL  |uk     |engineering|
|5   |raushan|35  |70000 |india  |sales      |
|6   |NULL   |29  |200000|uk     |IT         |
|7   |adam   |37  |65000 |us     |IT         |
|8   |chris  |16  |40000 |us     |sales      |
|NULL|NULL   |NULL|NULL  |NULL   |NULL       |
|7   |adam   |37  |65000 |us     |IT         |
+----+-------+----+------+-------+-----------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: long (nullable = true)
 |-- address: string (nullable = true)
 |-- department: string (nullable = true)

Total number of rows in the DataFrame: 10
+----+-------+----+------+-------+-----------+--------+
|id  |name   |age |

Transformation in Pyspark: Case(Unique & Sorted Record in dataframe)

In [26]:
# Rows for the DataFrame: several exact duplicates, plus one row (id 14) that
# repeats id and name with a different salary
data = [
    (10, 'Anil', 50000, 18),
    (11, 'Vikas', 75000, 16),
    (12, 'Nisha', 40000, 18),
    (13, 'Nidhi', 60000, 17),
    (14, 'Priya', 80000, 18),
    (15, 'Mohit', 45000, 18),
    (16, 'Rajesh', 90000, 10),
    (17, 'Raman', 55000, 16),
    (18, 'Sam', 65000, 17),
    (15, 'Mohit', 45000, 18),
    (13, 'Nidhi', 60000, 17),
    (14, 'Priya', 90000, 18),
    (18, 'Sam', 65000, 17),
]
# Column names for the DataFrame
schema = ['id', 'name', 'sal', 'mngr_id']
# Build the DataFrame, then inspect its rows, schema and row count
mngr_df = spark.createDataFrame(data, schema)
mngr_df.show(truncate=False)
mngr_df.printSchema()
print("Total number of rows in the DataFrame:", mngr_df.count())


# Finding unique records / dropping duplicates in the DataFrame
# distinct() compares whole rows
mngr_df_1 = mngr_df.distinct()
# distinct() over just the id and name columns
mngr_df_2 = mngr_df.select("id", "name").distinct()
# dropDuplicates() with an explicit column list; all four columns are listed
# here, so whole rows are compared
mngr_df_3 = mngr_df.dropDuplicates(["id", "name", "sal", "mngr_id"])


# Sorting the de-duplicated DataFrame: salary descending, then name ascending
mngr_df_4 = mngr_df_1.sort(
    col("sal").desc(),
    col("name").asc(),
)


# Show each resulting DataFrame
mngr_df_1.show(truncate=False)
mngr_df_2.show(truncate=False)
mngr_df_3.show(truncate=False)
mngr_df_4.show(truncate=False)








+---+------+-----+-------+
|id |name  |sal  |mngr_id|
+---+------+-----+-------+
|10 |Anil  |50000|18     |
|11 |Vikas |75000|16     |
|12 |Nisha |40000|18     |
|13 |Nidhi |60000|17     |
|14 |Priya |80000|18     |
|15 |Mohit |45000|18     |
|16 |Rajesh|90000|10     |
|17 |Raman |55000|16     |
|18 |Sam   |65000|17     |
|15 |Mohit |45000|18     |
|13 |Nidhi |60000|17     |
|14 |Priya |90000|18     |
|18 |Sam   |65000|17     |
+---+------+-----+-------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- sal: long (nullable = true)
 |-- mngr_id: long (nullable = true)

Total number of rows in the DataFrame: 13
+---+------+-----+-------+
|id |name  |sal  |mngr_id|
+---+------+-----+-------+
|10 |Anil  |50000|18     |
|11 |Vikas |75000|16     |
|12 |Nisha |40000|18     |
|13 |Nidhi |60000|17     |
|14 |Priya |80000|18     |
|15 |Mohit |45000|18     |
|16 |Rajesh|90000|10     |
|17 |Raman |55000|16     |
|18 |Sam   |65000|17     |
|14 |Priya |90000|18     |
+--

Transformation in Pyspark: Aggregate function

In [29]:
# Rows for the DataFrame (includes null fields, an all-null row and a duplicate row)
empl_data = [
    (1, 'manish', 26, 20000, 'india', 'IT'),
    (2, 'rahul', None, 40000, 'germany', 'engineering'),
    (3, 'pawan', 12, 60000, 'india', 'sales'),
    (4, 'roshini', 44, None, 'uk', 'engineering'),
    (5, 'raushan', 35, 70000, 'india', 'sales'),
    (6, None, 29, 200000, 'uk', 'IT'),
    (7, 'adam', 37, 65000, 'us', 'IT'),
    (8, 'chris', 16, 40000, 'us', 'sales'),
    (None, None, None, None, None, None),
    (7, 'adam', 37, 65000, 'us', 'IT'),
]
# Column names for the DataFrame
schema = ['id', 'name', 'age', 'salary', 'address', 'department']
# Build the DataFrame, then inspect its rows, schema and row count
empl_df = spark.createDataFrame(empl_data, schema)
empl_df.show(truncate=False)
empl_df.printSchema()
print("Total number of rows in the DataFrame:", empl_df.count())


# count() as an aggregate: over all rows, over one column, and over the
# distinct values of a column
empl_df_1 = empl_df.select(count("*"))
empl_df_2 = empl_df.select(count("name"))
empl_df_3 = empl_df.select(
    countDistinct("address").alias("distinct_address_count")
)

# min(), max() and avg() here are the Spark column functions brought in by the
# pyspark.sql.functions star import, not Python's builtins
empl_df_4 = empl_df.select(
    min("salary").alias("min_salary"),
    max("salary").alias("max_salary"),
    avg("salary").alias("avg_salary"),
)

# Show each aggregate result
empl_df_1.show(truncate=False)
empl_df_2.show(truncate=False)
empl_df_3.show(truncate=False)
empl_df_4.show(truncate=False)


+----+-------+----+------+-------+-----------+
|id  |name   |age |salary|address|department |
+----+-------+----+------+-------+-----------+
|1   |manish |26  |20000 |india  |IT         |
|2   |rahul  |NULL|40000 |germany|engineering|
|3   |pawan  |12  |60000 |india  |sales      |
|4   |roshini|44  |NULL  |uk     |engineering|
|5   |raushan|35  |70000 |india  |sales      |
|6   |NULL   |29  |200000|uk     |IT         |
|7   |adam   |37  |65000 |us     |IT         |
|8   |chris  |16  |40000 |us     |sales      |
|NULL|NULL   |NULL|NULL  |NULL   |NULL       |
|7   |adam   |37  |65000 |us     |IT         |
+----+-------+----+------+-------+-----------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: long (nullable = true)
 |-- address: string (nullable = true)
 |-- department: string (nullable = true)

Total number of rows in the DataFrame: 10
+--------+
|count(1)|
+--------+
|10      |
+--------+

+-----------+
|coun

Transformation in Pyspark: Groupby

In [None]:
# Rows for the DataFrame: employees with a salary and a department
data = [
    (1, 'manish', 50000, "IT"),
    (2, 'vikash', 60000, "sales"),
    (3, 'raushan', 70000, "marketing"),
    (4, 'mukesh', 80000, "IT"),
    (5, 'pritam', 90000, "sales"),
    (6, 'nikita', 45000, "marketing"),
    (7, 'ragini', 55000, "marketing"),
    (8, 'rakesh', 100000, "IT"),
    (9, 'aditya', 65000, "IT"),
    (10, 'rahul', 50000, "marketing"),
]
# Column names for the DataFrame
schema = ['id', 'name', 'salary', 'department']
# Build the DataFrame, then inspect its rows, schema and row count
dept_df = spark.createDataFrame(data, schema)
dept_df.show(truncate=False)
dept_df.printSchema()
print("Total number of rows in the DataFrame:", dept_df.count())


# Group by department and compute, per group, the number of rows and the
# average, minimum and maximum salary
dept_df_2 = dept_df.groupBy("department").agg(
    count("*").alias("count"),
    avg("salary").alias("avg_salary"),
    min("salary").alias("min_salary"),
    max("salary").alias("max_salary"),
)

# Show the per-department summary
dept_df_2.show(truncate=False)







+---+-------+------+----------+
|id |name   |salary|department|
+---+-------+------+----------+
|1  |manish |50000 |IT        |
|2  |vikash |60000 |sales     |
|3  |raushan|70000 |marketing |
|4  |mukesh |80000 |IT        |
|5  |pritam |90000 |sales     |
|6  |nikita |45000 |marketing |
|7  |ragini |55000 |marketing |
|8  |rakesh |100000|IT        |
|9  |aditya |65000 |IT        |
|10 |rahul  |50000 |marketing |
+---+-------+------+----------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- department: string (nullable = true)

Total number of rows in the DataFrame: 10
+----------+-----+----------+----------+----------+
|department|count|avg_salary|min_salary|max_salary|
+----------+-----+----------+----------+----------+
|IT        |4    |73750.0   |50000     |100000    |
|sales     |2    |75000.0   |60000     |90000     |
|marketing |4    |55000.0   |45000     |70000     |
+----------+-----+----------+----------+-----

Transformation in Pyspark: Joins